• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

popstas / google-drive-access / 19828361489

01 Dec 2025 03:41PM UTC coverage: 69.697% (-7.8%) from 77.49%
19828361489

push

github

web-flow
feat: add CSV compare_files command (#23)

* Add CSV compare_files command

* Update compare outputs to data directory

* fix: normalize compare formats

* many comparison fixes

* fix: update tests to match new compare_files_by_location output format

- Update test_compare_files_by_location_writes_differences to expect new column format (location, client_name, mime_type, modified)
- Update test_compare_files_by_location_normalizes_google_and_office_formats to use mime_type (snake_case) instead of mimeType
- Fix test_ensure_public_subdir_delegates_to_permissions by providing 2 list() responses for find_child_folder fallback

180 of 404 new or added lines in 5 files covered. (44.55%)

1 existing line in 1 file now uncovered.

1196 of 1716 relevant lines covered (69.7%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

73.58
/src/drive_audit/location_normalizer.py
1
"""Location normalization module for comparing file locations.
2

3
This module provides simplified location normalization that replaces
4
path separators, hyphens and basic forbidden characters (/, -, :, ?) with underscores, without complex
5
path flattening/unflattening logic.
6
"""
7
import re
1✔
8
from pathlib import Path
1✔
9
from typing import Dict
1✔
10

11
from .compare import (
1✔
12
    MIME_NORMALIZATION_GROUPS,
13
    normalize_file_name,
14
    normalize_unicode,
15
    remove_duplicate_suffix,
16
)
17

18

19
class LocationNormalizer:
    """Flatten a file location into a string suitable for equality comparison.

    The normalizer deliberately loses information: path separators, hyphens,
    forbidden characters and file extensions are all collapsed so that two
    locations describing the same file compare equal.
    """

    # Upper bound on how many stacked extensions (".csv.xlsx...") are removed
    # from a single location; guards against a non-terminating loop.
    _MAX_EXTENSION_PASSES = 10

    def __init__(
        self,
        normalize_file_names: bool = False,
        ignore_duplicate_suffixes: bool = False,
    ):
        """
        Initialize the normalizer.

        Args:
            normalize_file_names: If True, run ``normalize_file_name`` over the
                whole location and collapse spaces around underscores.
                If False, only ``?`` and ``:`` are replaced with ``_``.
            ignore_duplicate_suffixes: If True, pass the last ``_``-separated
                part of the location through ``remove_duplicate_suffix``
                (suffixes like (1), (2)) when that part contains a dot.
        """
        self.normalize_file_names = normalize_file_names
        self.ignore_duplicate_suffixes = ignore_duplicate_suffixes

    def normalize(self, row: Dict[str, str]) -> str:
        """
        Normalize the ``location`` field of a row for comparison.

        Steps:
        1. Normalize Unicode.
        2. Replace every ``/`` and ``-`` with ``_`` (their position is
           irrelevant for comparison).
        3. Replace forbidden characters with ``_`` (via
           ``normalize_file_name`` when enabled, otherwise just ``?``/``:``).
        4. Remove a duplicate suffix from the last part (if requested).
        5. Strip all trailing file extensions, e.g. "file.csv.xlsx" -> "file".
        6. Drop an underscore that directly precedes a remaining extension.
        7. Strip trailing spaces, underscores and dots.

        Args:
            row: Mapping with a ``location`` entry. Only ``location`` is read;
                a missing, ``None`` or blank value yields an empty string.

        Returns:
            Normalized location string.
        """
        location = (row.get("location") or "").strip()
        if not location:
            return location

        # Step 1: Normalize Unicode
        location = normalize_unicode(location)

        # Step 2: "/" and "-" become "_" whether they were path separators
        # or part of a file name.
        location = location.replace("/", "_").replace("-", "_")

        # Step 3: Replace the remaining forbidden characters.
        if self.normalize_file_names:
            location = normalize_file_name(location, replace_with="_")
            # Collapse spaces around underscores: " _ ", " _", "_ " -> "_"
            location = re.sub(r"\s+_\s+", "_", location)
            location = location.replace(" _", "_").replace("_ ", "_")
        else:
            # No "/" is left after step 2, so URL-style "://" needs no special
            # handling here: the ":" is replaced like any other.
            location = location.replace("?", "_").replace(":", "_")

        # Step 4: Remove a duplicate suffix from the last "_"-separated part.
        # rpartition yields ("", "", location) when there is no underscore,
        # so the whole location is then treated as the file name.
        if self.ignore_duplicate_suffixes:
            head, sep, tail = location.rpartition("_")
            if "." in tail:
                location = head + sep + remove_duplicate_suffix(tail)

        # Step 5: Remove all extensions, regardless of MIME type.
        location = self._strip_extensions(location)

        # Step 6: Remove an underscore sitting right before an extension.
        location = re.sub(r"_(\.[^.]+)$", r"\1", location)

        # Step 7: "file." -> "file", "file_." -> "file"
        return location.rstrip(" _.")

    @classmethod
    def _strip_extensions(cls, location: str) -> str:
        """Repeatedly remove the trailing extension (and underscores around it)."""
        for _ in range(cls._MAX_EXTENSION_PASSES):
            trimmed = location.rstrip("_")
            # Keep the suffix in its original case: its length is used for
            # slicing, and lowercasing can change the length of some
            # Unicode strings.
            suffix = Path(trimmed).suffix
            if len(suffix) <= 1:
                break  # No extension (empty or a bare ".")
            if not trimmed.endswith(suffix):
                # The suffix is not the literal tail of the string, so there
                # is nothing that can safely be cut off.
                location = trimmed
                break
            location = trimmed[: -len(suffix)].rstrip("_")
        return location
1✔
134

135

136
def normalize_location(
    row: Dict[str, str],
    normalize_file_names: bool = False,
    ignore_duplicate_suffixes: bool = False,
) -> str:
    """
    Normalize a row's location for comparison.

    Convenience wrapper: builds a ``LocationNormalizer`` with the given
    options and returns the result of its ``normalize`` method for ``row``.

    Args:
        row: CSV row containing a ``location`` field
        normalize_file_names: If True, normalize file/folder names in path
        ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names

    Returns:
        Normalized location string
    """
    return LocationNormalizer(
        normalize_file_names=normalize_file_names,
        ignore_duplicate_suffixes=ignore_duplicate_suffixes,
    ).normalize(row)
1✔
161

STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc