• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

popstas / google-drive-access / 19876179924

02 Dec 2025 10:54PM UTC coverage: 59.51% (+2.1%) from 57.374%
19876179924

push

github

web-flow
refactor: modularize drive audit commands (#24)

212 of 736 new or added lines in 13 files covered. (28.8%)

4 existing lines in 2 files now uncovered.

1408 of 2366 relevant lines covered (59.51%)

0.6 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

73.58
/src/drive_audit/location_normalizer.py
1
"""Location normalization module for comparing file locations.
2

3
This module provides simplified location normalization that replaces
4
basic forbidden characters (/, :, ?) with underscores, without complex
5
path flattening/unflattening logic.
6
"""
7

8
import re
1✔
9
from pathlib import Path
1✔
10
from typing import Dict
1✔
11

12
from .compare import (
1✔
13
    MIME_NORMALIZATION_GROUPS,
14
    normalize_file_name,
15
    normalize_unicode,
16
    remove_duplicate_suffix,
17
)
18

19

20
class LocationNormalizer:
    """Flatten file locations into comparable strings.

    Path separators, dashes and a few forbidden characters are replaced with
    underscores, optional "(1)"-style duplicate suffixes are removed, and all
    trailing file extensions are stripped.
    """

    # Safety cap on how many trailing extensions are stripped from one location.
    _MAX_EXTENSION_PASSES = 10

    # "/" and "-" are treated alike: for comparison it does not matter whether
    # they were path separators or part of a file name.
    _SEPARATOR_TABLE = str.maketrans({"/": "_", "-": "_"})

    # Characters replaced when full file-name normalization is disabled.
    _FORBIDDEN_TABLE = str.maketrans({"?": "_", ":": "_"})

    def __init__(
        self,
        normalize_file_names: bool = False,
        ignore_duplicate_suffixes: bool = False,
    ):
        """
        Initialize the normalizer.

        Args:
            normalize_file_names: If True, run ``normalize_file_name`` over the
                whole location; otherwise only "?" and ":" are replaced.
            ignore_duplicate_suffixes: If True, remove duplicate suffixes like
                (1), (2) from the last "_"-separated part of the location.
        """
        self.normalize_file_names = normalize_file_names
        self.ignore_duplicate_suffixes = ignore_duplicate_suffixes

    def normalize(self, row: Dict[str, str]) -> str:
        """
        Normalize location for comparison.

        Algorithm:
        1. Normalize Unicode
        2. Replace all / and - with _
        3. Replace other forbidden characters (``normalize_file_name`` when
           ``normalize_file_names`` is set, otherwise just : and ?)
        4. Remove duplicate suffixes (if requested)
        5. Strip all trailing file extensions (e.g. "file.csv.xlsx" -> "file")
        6. Drop an underscore that directly precedes a remaining extension
        7. Strip trailing spaces, underscores and dots

        Args:
            row: CSV row; only the "location" field is read. The MIME type is
                not consulted because every extension is stripped regardless.

        Returns:
            Normalized location string ("" when the location is missing/blank)
        """
        location = (row.get("location") or "").strip()
        if not location:
            return location

        location = normalize_unicode(location)
        location = location.translate(self._SEPARATOR_TABLE)
        location = self._replace_forbidden_characters(location)
        if self.ignore_duplicate_suffixes:
            location = self._drop_duplicate_suffix(location)
        location = self._strip_extensions(location)

        # "name_.ext" -> "name.ext" for any extension that survived stripping.
        location = re.sub(r"_(\.[^.]+)$", r"\1", location)

        # Handles leftovers such as "file." -> "file" and "file_." -> "file".
        return location.rstrip(" _.")

    def _replace_forbidden_characters(self, location: str) -> str:
        """Replace characters other than / and - that must not survive."""
        if self.normalize_file_names:
            location = normalize_file_name(location, replace_with="_")
            # Normalize spaces around underscores: " _ " -> "_", " _" -> "_", "_ " -> "_"
            location = re.sub(r"\s+_\s+", "_", location)
            return location.replace(" _", "_").replace("_ ", "_")
        # NOTE: no "://" handling is needed here: every "/" has already been
        # turned into "_", so only the bare ":" (and "?") remain to replace.
        return location.translate(self._FORBIDDEN_TABLE)

    @staticmethod
    def _drop_duplicate_suffix(location: str) -> str:
        """Remove a duplicate suffix from the part after the last underscore."""
        # rpartition yields ("", "", location) when there is no underscore, so
        # the whole location is then treated as the file name.
        prefix, underscore, last_part = location.rpartition("_")
        if "." not in last_part:
            return location
        return prefix + underscore + remove_duplicate_suffix(last_part)

    @classmethod
    def _strip_extensions(cls, location: str) -> str:
        """Strip trailing extensions (and underscores around them) repeatedly."""
        for _ in range(cls._MAX_EXTENSION_PASSES):
            trimmed = location.rstrip("_")
            # The suffix is taken verbatim (no case folding): lower-casing can
            # change a string's length, which would make the slice below cut
            # the wrong number of characters.
            suffix = Path(trimmed).suffix
            if len(suffix) <= 1 or not trimmed.endswith(suffix):
                break  # No extension found
            location = trimmed[: -len(suffix)].rstrip("_")
        return location
135

136

137
def normalize_location(
    row: Dict[str, str],
    normalize_file_names: bool = False,
    ignore_duplicate_suffixes: bool = False,
) -> str:
    """
    Normalize a row's location for comparison (functional shortcut).

    Builds a one-off ``LocationNormalizer`` with the given options and returns
    the result of its ``normalize`` method for ``row``.

    Args:
        row: CSV row with location, mimeType fields
        normalize_file_names: If True, normalize file/folder names in path
        ignore_duplicate_suffixes: If True, remove duplicate suffixes like (1), (2) from file names

    Returns:
        Normalized location string
    """
    return LocationNormalizer(
        normalize_file_names,
        ignore_duplicate_suffixes,
    ).normalize(row)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc