19828361489

Committed 01 Dec 2025 03:41PM UTC coverage: 69.697% (-7.8%) from 77.49%

Build # 19828361489

Build Type

push

github

Committed by

web-flow

Commit Message

feat: add CSV compare_files command (#23)

* Add CSV compare_files command

* Update compare outputs to data directory

* fix: normalize compare formats

* many comparison fixes

* fix: update tests to match new compare_files_by_location output format

- Update test_compare_files_by_location_writes_differences to expect new column format (location, client_name, mime_type, modified)
- Update test_compare_files_by_location_normalizes_google_and_office_formats to use mime_type (snake_case) instead of mimeType
- Fix test_ensure_public_subdir_delegates_to_permissions by providing 2 list() responses for find_child_folder fallback

Run Details

180 of 404 new or added lines in 5 files covered. (44.55%)

1 existing line in 1 file now uncovered.

1196 of 1716 relevant lines covered (69.7%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

73.58

/src/drive_audit/location_normalizer.py

"""Location normalization module for comparing file locations.

This module provides simplified location normalization that replaces
path separators, hyphens and basic forbidden characters (/, -, :, ?) with
underscores, without complex path flattening/unflattening logic.
"""
import re
from pathlib import Path
from typing import Dict

from .compare import (
    MIME_NORMALIZATION_GROUPS,
    normalize_file_name,
    normalize_unicode,
    remove_duplicate_suffix,
)


class LocationNormalizer:
    """Flatten a file location into a canonical string used for comparison.

    The result is not a usable path: separators, hyphens and a few special
    characters are all collapsed to underscores and every file extension is
    dropped, so that two listings of the same file compare equal even when
    they were exported by tools with different naming rules.
    """

    # Upper bound on extension-stripping passes; purely a safety net so a
    # pathological name can never keep the loop running.
    _MAX_EXTENSION_PASSES = 10

    # Whitespace on both sides of an underscore, e.g. "a  _  b".
    _SPACED_UNDERSCORE_RE = re.compile(r"\s+_\s+")

    # An underscore immediately before a final ".ext" tail.
    _UNDERSCORE_BEFORE_EXT_RE = re.compile(r"_(\.[^.]+)$")

    def __init__(
        self,
        normalize_file_names: bool = False,
        ignore_duplicate_suffixes: bool = False,
    ):
        """
        Initialize the normalizer.

        Args:
            normalize_file_names: If True, pass the location through
                ``normalize_file_name`` (with "_" as the replacement) instead
                of only replacing "?" and ":".
            ignore_duplicate_suffixes: If True, apply
                ``remove_duplicate_suffix`` to the trailing name segment
                (intended for suffixes like " (1)", " (2)").
        """
        self.normalize_file_names = normalize_file_names
        self.ignore_duplicate_suffixes = ignore_duplicate_suffixes

    def normalize(self, row: Dict[str, str]) -> str:
        """
        Normalize the ``location`` field of a CSV row for comparison.

        Algorithm:
        1. Normalize Unicode (``normalize_unicode``).
        2. Replace every "/" and "-" with "_"; for comparison it does not
           matter whether they were path separators or part of a name.
        3. Replace other special characters: via ``normalize_file_name`` when
           ``normalize_file_names`` is set, otherwise only "?" and ":".
        4. Remove duplicate suffixes from the last segment (if requested).
        5. Strip all file extensions (e.g. "file.csv.xlsx" -> "file").
        6. Drop an underscore left directly before a remaining ".ext" tail.
        7. Strip trailing spaces, underscores and dots.

        Extensions are removed unconditionally; the row's MIME type is not
        consulted.

        Args:
            row: CSV row; only the "location" key influences the result.
                A missing or ``None`` value is treated as an empty string.

        Returns:
            Normalized location string ("" when the location is blank).
        """
        location = (row.get("location") or "").strip()
        if not location:
            return location

        # Step 1: Unicode normalization (helper lives in .compare).
        location = normalize_unicode(location)

        # Step 2: collapse separators and hyphens.
        location = location.replace("/", "_").replace("-", "_")

        # Step 3: remaining special characters.
        if self.normalize_file_names:
            location = normalize_file_name(location, replace_with="_")
            # Tidy spaces around underscores: " _ " -> "_", " _" -> "_",
            # "_ " -> "_".
            location = self._SPACED_UNDERSCORE_RE.sub("_", location)
            location = location.replace(" _", "_").replace("_ ", "_")
        else:
            # NOTE: "/" was already replaced in step 2, so URL separators
            # such as "://" arrive here as ":__" and simply become "___";
            # no dedicated "://" handling is needed (or possible) here.
            location = location.replace("?", "_").replace(":", "_")

        # Step 4: duplicate suffixes on the last "_"-delimited segment.
        # rpartition yields ("", "", location) when there is no underscore,
        # so the whole string is then treated as the file name.
        if self.ignore_duplicate_suffixes:
            head, sep, tail = location.rpartition("_")
            if "." in tail:
                location = head + sep + remove_duplicate_suffix(tail)

        # Step 5: drop every extension.
        location = self._strip_extensions(location)

        # Step 6: only relevant if an extension survived step 5 (pass limit
        # reached); removes the underscore in "name_.ext".
        location = self._UNDERSCORE_BEFORE_EXT_RE.sub(r"\1", location)

        # Step 7: "file." -> "file", "file_." -> "file".
        return location.rstrip(" _.")

    @classmethod
    def _strip_extensions(cls, location: str) -> str:
        """Remove trailing file extensions, one per pass, up to a fixed limit.

        Underscores around an extension ("file_.xlsx", "file.xlsx_") are
        removed together with it. The suffix is taken from the string as-is
        and sliced by its own length; lower-casing it first could change its
        length for some Unicode characters and cut off the wrong amount.

        Args:
            location: Flattened location (no "/" expected).

        Returns:
            The location without extensions. When no extension is found the
            input is returned unchanged, including any trailing underscores.
        """
        for _ in range(cls._MAX_EXTENSION_PASSES):
            trimmed = location.rstrip("_")
            suffix = Path(trimmed).suffix
            # "" means no extension; a bare "." is not a real extension.
            if len(suffix) <= 1:
                break
            location = trimmed[: -len(suffix)].rstrip("_")
        return location


def normalize_location(
    row: Dict[str, str],
    normalize_file_names: bool = False,
    ignore_duplicate_suffixes: bool = False,
) -> str:
    """
    Convenience wrapper: normalize one row's location in a single call.

    Builds a throwaway ``LocationNormalizer`` configured with the given flags
    and returns the result of its ``normalize`` method for ``row``. The
    normalization flattens the location for comparison: "/" and "-" become
    "_", ":" and "?" are replaced with "_", and no path
    flattening/unflattening is attempted.

    Args:
        row: CSV row containing a "location" field
        normalize_file_names: Forwarded to ``LocationNormalizer``; if True,
            file/folder names in the location are normalized
        ignore_duplicate_suffixes: Forwarded to ``LocationNormalizer``; if
            True, duplicate suffixes like (1), (2) are removed from file names

    Returns:
        Normalized location string
    """
    return LocationNormalizer(
        normalize_file_names=normalize_file_names,
        ignore_duplicate_suffixes=ignore_duplicate_suffixes,
    ).normalize(row)


1	"""Location normalization module for comparing file locations.
2
3	This module provides simplified location normalization that replaces
4	basic forbidden characters (/, :, ?) with underscores, without complex
5	path flattening/unflattening logic.
6	"""
7	import re	1✔
8	from pathlib import Path	1✔
9	from typing import Dict	1✔
10
11	from .compare import (	1✔
12	MIME_NORMALIZATION_GROUPS,
13	normalize_file_name,
14	normalize_unicode,
15	remove_duplicate_suffix,
16	)
17
18
19	class LocationNormalizer:	1✔
20	"""Simplified location normalizer that replaces basic characters."""
21
22	def __init__(	1✔
23	self,
24	normalize_file_names: bool = False,
25	ignore_duplicate_suffixes: bool = False,
26	):
27	"""
28	Initialize the normalizer.
29
30	Args:
31	normalize_file_names: If True, normalize file/folder names in path
32	ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names
33	"""
34	self.normalize_file_names = normalize_file_names	1✔
35	self.ignore_duplicate_suffixes = ignore_duplicate_suffixes	1✔
36
37	def normalize(self, row: Dict[str, str]) -> str:	1✔
38	"""
39	Normalize location for comparison.
40
41	Simplified algorithm:
42	1. Normalize Unicode
43	2. Replace all / and - with _ (for comparison, we don't care where they were)
44	3. Apply normalize_file_name to replace other forbidden characters (:, ? → _)
45	4. Remove duplicate suffixes (if requested)
46	5. Normalize MIME types (remove extension if in same group) - once
47	6. Final cleanup of trailing spaces/underscores
48
49	Args:
50	row: CSV row with location, mimeType fields
51
52	Returns:
53	Normalized location string
54	"""
55	location = (row.get("location") or "").strip()	1✔
56	if not location:	1✔
NEW 57	return location	×
58
59	# Step 1: Normalize Unicode
60	location = normalize_unicode(location)	1✔
61
62	# Step 2: Replace all / and - with _ for comparison
63	# This simplifies comparison - we don't care where / or - was (path separator or in filename)
64	# Replace / first, then - to ensure consistent normalization
65	location = location.replace("/", "_")	1✔
66	location = location.replace("-", "_")	1✔
67
68	# Step 3: Apply normalize_file_name to replace other forbidden characters
69	# This replaces :, ? and other forbidden chars with _
70	if self.normalize_file_names:	1✔
71	# Normalize the entire location string
NEW 72	location = normalize_file_name(location, replace_with="_")	×
73	# Normalize spaces around underscores: " _ " -> "_", " _" -> "_", "_ " -> "_"
NEW 74	location = re.sub(r"\s+_\s+", "_", location)	×
NEW 75	location = location.replace(" _", "_").replace("_ ", "_")	×
76	else:
77	# Even without normalize_file_names, we still need to handle URL patterns
78	# Replace :// and :/ patterns
79	location = re.sub(r':/+/?', '_', location)	1✔
80	# Replace other forbidden characters that might cause issues
81	location = location.replace('?', '_').replace(':', '_')	1✔
82
83	# Step 4: Remove duplicate suffixes if requested
84	if self.ignore_duplicate_suffixes:	1✔
85	# Since we already replaced all / with _, we can work with the entire location
86	# Find the last part (after last _) or use entire location if no _
NEW 87	last_underscore = location.rfind("_")	×
NEW 88	if last_underscore >= 0:	×
NEW 89	last_part = location[last_underscore + 1:]	×
NEW 90	if "." in last_part:	×
NEW 91	normalized_last = remove_duplicate_suffix(last_part)	×
NEW 92	location = location[: last_underscore + 1] + normalized_last	×
93	else:
94	# No underscore, entire location is the filename
NEW 95	if "." in location:	×
NEW 96	location = remove_duplicate_suffix(location)	×
97
98	# Step 5: Normalize MIME types (remove extension if in same group) - once
99	# Also remove all extensions recursively (handles cases like .csv.xlsx)
100	mime_type = (row.get("mimeType") or row.get("mime_type") or "").strip()	1✔
101
102	# Remove all extensions recursively until no more extensions found
103	# This handles cases like "file.csv.xlsx" -> "file"
104	# We remove all extensions for comparison, regardless of MIME groups
105	max_iterations = 10 # Safety limit to avoid infinite loops	1✔
106	iteration = 0	1✔
107	while iteration < max_iterations:	1✔
108	location_for_suffix = location.rstrip("_")	1✔
109	suffix = Path(location_for_suffix).suffix.lower()	1✔
110
111	if not suffix or len(suffix) <= 1:	1✔
112	break # No extension found	1✔
113
114	# Remove the extension
115	suffix_len = len(suffix)	1✔
116	# Handle trailing underscore before extension (e.g., "file_.xlsx")
117	if location.lower().endswith("_" + suffix):	1✔
NEW 118	location = location[: -(suffix_len + 1)]	×
119	elif location.lower().endswith(suffix + "_"):	1✔
NEW 120	location = location[: -(suffix_len + 1)]	×
121	elif location.lower().endswith(suffix):	1✔
122	location = location[: -suffix_len]	1✔
123	location = location.rstrip("_")	1✔
124	iteration += 1	1✔
125
126	# Step 6: Final cleanup - remove trailing underscores before extensions
127	location = re.sub(r"_(\.[^.]+)$", r"\1", location)	1✔
128
129	# Step 7: Strip trailing spaces, underscores, and dots
130	# This handles cases like "file." -> "file" and "file_." -> "file"
131	location = location.rstrip(" _.")	1✔
132
133	return location	1✔
134
135
136	def normalize_location(	1✔
137	row: Dict[str, str],
138	normalize_file_names: bool = False,
139	ignore_duplicate_suffixes: bool = False,
140	) -> str:
141	"""
142	Normalize location for comparison - simplified version.
143
144	This is a simplified version that replaces all / and - with _ (regardless of where
145	they appear), and other forbidden characters (:, ?) with underscores, without
146	complex path flattening/unflattening logic.
147
148	Args:
149	row: CSV row with location, mimeType fields
150	normalize_file_names: If True, normalize file/folder names in path
151	ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names
152
153	Returns:
154	Normalized location string
155	"""
156	normalizer = LocationNormalizer(	1✔
157	normalize_file_names=normalize_file_names,
158	ignore_duplicate_suffixes=ignore_duplicate_suffixes,
159	)
160	return normalizer.normalize(row)	1✔
161

popstas / google-drive-access / 19828361489

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous