19876179924

Committed 02 Dec 2025 10:54PM UTC coverage: 59.51% (+2.1%) from 57.374%

Build # 19876179924

Build Type

push

github

Committed by

web-flow

Commit Message

refactor: modularize drive audit commands (#24)

Coverage Stats

212 of 736 new or added lines in 13 files covered. (28.8%)

4 existing lines in 2 files now uncovered.

1408 of 2366 relevant lines covered (59.51%)

0.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

73.58

/src/drive_audit/location_normalizer.py

"""Location normalization module for comparing file locations.

This module provides simplified location normalization that replaces
basic forbidden characters (/, :, ?) with underscores, without complex
path flattening/unflattening logic.
"""

import re
from pathlib import Path
from typing import Dict

from .compare import (
    MIME_NORMALIZATION_GROUPS,
    normalize_file_name,
    normalize_unicode,
    remove_duplicate_suffix,
)


class LocationNormalizer:
    """Turn a file location into a flat key suitable for equality comparison.

    The result is not a usable path: separators, dashes, a few forbidden
    characters and every file extension are collapsed or removed so that two
    spellings of the same location compare equal.
    """

    def __init__(
        self,
        normalize_file_names: bool = False,
        ignore_duplicate_suffixes: bool = False,
    ):
        """
        Initialize the normalizer.

        Args:
            normalize_file_names: If True, run ``normalize_file_name`` over the
                whole location; otherwise only ``?`` and ``:`` are replaced.
            ignore_duplicate_suffixes: If True, remove duplicate suffixes like
                (1), (2) from the last ``_``-separated segment.
        """
        self.normalize_file_names = normalize_file_names
        self.ignore_duplicate_suffixes = ignore_duplicate_suffixes

    def normalize(self, row: Dict[str, str]) -> str:
        """
        Normalize location for comparison.

        Algorithm:
        1. Normalize Unicode
        2. Replace all / and - with _ (their original position is irrelevant)
        3. Replace other forbidden characters (``normalize_file_name`` when
           enabled, otherwise just ``?`` and ``:``)
        4. Remove duplicate suffixes from the last segment (if requested)
        5. Strip every file extension (e.g. "file.csv.xlsx" -> "file")
        6. Final cleanup of trailing spaces, underscores and dots

        Args:
            row: CSV row; only the ``location`` field is read. The MIME type
                is not consulted because extensions are removed
                unconditionally.

        Returns:
            Normalized location string ("" when the location is missing or
            blank).
        """
        location = (row.get("location") or "").strip()
        if not location:
            return location

        # Step 1: Normalize Unicode
        location = normalize_unicode(location)

        # Step 2: Flatten path separators and dashes. After this no "/" is
        # left in the string, which is why no "://" handling is needed below.
        location = location.replace("/", "_").replace("-", "_")

        # Step 3: Replace other forbidden characters
        if self.normalize_file_names:
            location = normalize_file_name(location, replace_with="_")
            # Collapse whitespace around underscores: " _ ", " _", "_ " -> "_"
            location = re.sub(r"\s+_\s+", "_", location)
            location = location.replace(" _", "_").replace("_ ", "_")
        else:
            location = location.replace("?", "_").replace(":", "_")

        # Step 4: Remove duplicate suffixes from the last segment.
        # rfind() returns -1 when there is no "_", making the split index 0,
        # so the "whole location is the file name" case needs no extra branch.
        if self.ignore_duplicate_suffixes:
            split_at = location.rfind("_") + 1
            last_part = location[split_at:]
            if "." in last_part:
                location = location[:split_at] + remove_duplicate_suffix(last_part)

        # Step 5: Strip all extensions
        location = self._strip_extensions(location)

        # Step 6: Drop an underscore sitting directly before a remaining
        # extension (only possible if the strip limit above was reached),
        # then trim trailing spaces, underscores and dots ("file_." -> "file").
        location = re.sub(r"_(\.[^.]+)$", r"\1", location)
        return location.rstrip(" _.")

    @staticmethod
    def _strip_extensions(location: str, max_iterations: int = 10) -> str:
        """Remove trailing file extensions, ignoring underscores around them.

        Args:
            location: Flattened location string.
            max_iterations: Maximum number of extensions to remove.

        Returns:
            ``location`` without its trailing extensions. The input is
            returned unchanged when it has no extension.
        """
        for _ in range(max_iterations):
            stem = location.rstrip("_")
            # Keep the suffix in its original case: str.lower() can change
            # the length of some characters, which would make the slice
            # below cut into the stem.
            suffix = Path(stem).suffix
            if len(suffix) <= 1:
                break  # No extension (or a bare ".")
            if not stem.endswith(suffix):
                # pathlib altered the string while parsing it, so the suffix
                # is not a literal tail; leave the text alone.
                location = stem
                break
            location = stem[: -len(suffix)].rstrip("_")
        return location


def normalize_location(
    row: Dict[str, str],
    normalize_file_names: bool = False,
    ignore_duplicate_suffixes: bool = False,
) -> str:
    """
    Return the comparison key for a row's location in a single call.

    Convenience wrapper: builds a throwaway ``LocationNormalizer`` with the
    given options and runs it on ``row``. The normalizer flattens the location
    by replacing / and - (wherever they appear) and other forbidden characters
    (:, ?) with underscores, without path flattening/unflattening logic.

    Args:
        row: CSV row containing the ``location`` field to normalize
        normalize_file_names: If True, normalize file/folder names in path
        ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names

    Returns:
        Normalized location string
    """
    return LocationNormalizer(
        normalize_file_names=normalize_file_names,
        ignore_duplicate_suffixes=ignore_duplicate_suffixes,
    ).normalize(row)

1	"""Location normalization module for comparing file locations.
2
3	This module provides simplified location normalization that replaces
4	basic forbidden characters (/, :, ?) with underscores, without complex
5	path flattening/unflattening logic.
6	"""
7
8	import re	1✔
9	from pathlib import Path	1✔
10	from typing import Dict	1✔
11
12	from .compare import (	1✔
13	MIME_NORMALIZATION_GROUPS,
14	normalize_file_name,
15	normalize_unicode,
16	remove_duplicate_suffix,
17	)
18
19
20	class LocationNormalizer:	1✔
21	"""Simplified location normalizer that replaces basic characters."""
22
23	def __init__(	1✔
24	self,
25	normalize_file_names: bool = False,
26	ignore_duplicate_suffixes: bool = False,
27	):
28	"""
29	Initialize the normalizer.
30
31	Args:
32	normalize_file_names: If True, normalize file/folder names in path
33	ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names
34	"""
35	self.normalize_file_names = normalize_file_names	1✔
36	self.ignore_duplicate_suffixes = ignore_duplicate_suffixes	1✔
37
38	def normalize(self, row: Dict[str, str]) -> str:	1✔
39	"""
40	Normalize location for comparison.
41
42	Simplified algorithm:
43	1. Normalize Unicode
44	2. Replace all / and - with _ (for comparison, we don't care where they were)
45	3. Apply normalize_file_name to replace other forbidden characters (:, ? → _)
46	4. Remove duplicate suffixes (if requested)
47	5. Normalize MIME types (remove extension if in same group) - once
48	6. Final cleanup of trailing spaces/underscores
49
50	Args:
51	row: CSV row with location, mimeType fields
52
53	Returns:
54	Normalized location string
55	"""
56	location = (row.get("location") or "").strip()	1✔
57	if not location:	1✔
58	return location	×
59
60	# Step 1: Normalize Unicode
61	location = normalize_unicode(location)	1✔
62
63	# Step 2: Replace all / and - with _ for comparison
64	# This simplifies comparison - we don't care where / or - was (path separator or in filename)
65	# Replace / first, then - to ensure consistent normalization
66	location = location.replace("/", "_")	1✔
67	location = location.replace("-", "_")	1✔
68
69	# Step 3: Apply normalize_file_name to replace other forbidden characters
70	# This replaces :, ? and other forbidden chars with _
71	if self.normalize_file_names:	1✔
72	# Normalize the entire location string
73	location = normalize_file_name(location, replace_with="_")	×
74	# Normalize spaces around underscores: " _ " -> "_", " _" -> "_", "_ " -> "_"
75	location = re.sub(r"\s+_\s+", "_", location)	×
76	location = location.replace(" _", "_").replace("_ ", "_")	×
77	else:
78	# Even without normalize_file_names, we still need to handle URL patterns
79	# Replace :// and :/ patterns
80	location = re.sub(r":/+/?", "_", location)	1✔
81	# Replace other forbidden characters that might cause issues
82	location = location.replace("?", "_").replace(":", "_")	1✔
83
84	# Step 4: Remove duplicate suffixes if requested
85	if self.ignore_duplicate_suffixes:	1✔
86	# Since we already replaced all / with _, we can work with the entire location
87	# Find the last part (after last _) or use entire location if no _
88	last_underscore = location.rfind("_")	×
89	if last_underscore >= 0:	×
90	last_part = location[last_underscore + 1 :]	×
91	if "." in last_part:	×
92	normalized_last = remove_duplicate_suffix(last_part)	×
93	location = location[: last_underscore + 1] + normalized_last	×
94	else:
95	# No underscore, entire location is the filename
96	if "." in location:	×
97	location = remove_duplicate_suffix(location)	×
98
99	# Step 5: Normalize MIME types (remove extension if in same group) - once
100	# Also remove all extensions recursively (handles cases like .csv.xlsx)
101	mime_type = (row.get("mimeType") or row.get("mime_type") or "").strip()	1✔
102
103	# Remove all extensions recursively until no more extensions found
104	# This handles cases like "file.csv.xlsx" -> "file"
105	# We remove all extensions for comparison, regardless of MIME groups
106	max_iterations = 10 # Safety limit to avoid infinite loops	1✔
107	iteration = 0	1✔
108	while iteration < max_iterations:	1✔
109	location_for_suffix = location.rstrip("_")	1✔
110	suffix = Path(location_for_suffix).suffix.lower()	1✔
111
112	if not suffix or len(suffix) <= 1:	1✔
113	break # No extension found	1✔
114
115	# Remove the extension
116	suffix_len = len(suffix)	1✔
117	# Handle trailing underscore before extension (e.g., "file_.xlsx")
118	if location.lower().endswith("_" + suffix):	1✔
119	location = location[: -(suffix_len + 1)]	×
120	elif location.lower().endswith(suffix + "_"):	1✔
121	location = location[: -(suffix_len + 1)]	×
122	elif location.lower().endswith(suffix):	1✔
123	location = location[:-suffix_len]	1✔
124	location = location.rstrip("_")	1✔
125	iteration += 1	1✔
126
127	# Step 6: Final cleanup - remove trailing underscores before extensions
128	location = re.sub(r"_(\.[^.]+)$", r"\1", location)	1✔
129
130	# Step 7: Strip trailing spaces, underscores, and dots
131	# This handles cases like "file." -> "file" and "file_." -> "file"
132	location = location.rstrip(" _.")	1✔
133
134	return location	1✔
135
136
137	def normalize_location(	1✔
138	row: Dict[str, str],
139	normalize_file_names: bool = False,
140	ignore_duplicate_suffixes: bool = False,
141	) -> str:
142	"""
143	Normalize location for comparison - simplified version.
144
145	This is a simplified version that replaces all / and - with _ (regardless of where
146	they appear), and other forbidden characters (:, ?) with underscores, without
147	complex path flattening/unflattening logic.
148
149	Args:
150	row: CSV row with location, mimeType fields
151	normalize_file_names: If True, normalize file/folder names in path
152	ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names
153
154	Returns:
155	Normalized location string
156	"""
157	normalizer = LocationNormalizer(	1✔
158	normalize_file_names=normalize_file_names,
159	ignore_duplicate_suffixes=ignore_duplicate_suffixes,
160	)
161	return normalizer.normalize(row)	1✔

popstas / google-drive-access / 19876179924

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous