• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pkiraly / metadata-qa-marc / #1527

22 Aug 2025 02:21PM UTC coverage: 90.345%. Remained the same
#1527

push

pkiraly
Improve timeline handling

5191 of 6416 new or added lines in 219 files covered. (80.91%)

886 existing lines in 78 files now uncovered.

36717 of 40641 relevant lines covered (90.34%)

0.9 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

92.11
/src/main/java/de/gwdg/metadataqa/marc/cli/utils/placename/PlaceNameNormaliser.java
1
package de.gwdg.metadataqa.marc.cli.utils.placename;
2

3
import com.opencsv.CSVReader;
4
import com.opencsv.CSVReaderBuilder;
5
import com.opencsv.exceptions.CsvException;
6
import org.apache.commons.io.FileUtils;
7

8
import java.io.File;
9
import java.io.FileReader;
10
import java.io.IOException;
11
import java.nio.charset.StandardCharsets;
12
import java.text.Normalizer;
13
import java.util.ArrayList;
14
import java.util.HashMap;
15
import java.util.List;
16
import java.util.Map;
17
import java.util.logging.Logger;
18
import java.util.stream.Collectors;
19

20
import static de.gwdg.metadataqa.marc.cli.utils.placename.PlaceNameNormaliser.status.RESOLVED;
21
import static de.gwdg.metadataqa.marc.cli.utils.placename.PlaceNameNormaliser.status.UNRESOLVED;
22

23
public class PlaceNameNormaliser {
24
  private static final Logger logger = Logger.getLogger(PlaceNameNormaliser.class.getCanonicalName());
1✔
25

26
  /** Outcome of a single place name lookup; used as the key of the per-outcome counters. */
  enum status {RESOLVED, UNRESOLVED;}
1✔
27
  /**
   * Name of the report file, created inside the output directory, that lists the place names
   * which could not be resolved together with how often each one occurred.
   */
  public static final String UNRESOLVED_PLACE_NAMES_FILE = "translations-unresolved-place-names.txt";
28

29
  /*
30
  private Set<String> knownMultiwordCities = Set.of(
31
    "New York", "Ithaca, NY", "Stamford, CN", "Tel Aviv", "Buenos Aires", "Middletown, CN",
32
    "Santiago de Chile", "Princeton, NJ", "Mount Vernon, NY", "Los Angeles", "Pôrto Alegre",
33
    "Berkeley, CA", "Philadelphia, PA", "'s-Gravenhage", "São Paulo", "Kentfield, CA", "Avon, CN"
34
  );
35
   */
36
  /**
   * Values that {@link #clean(String)} replaces with the literal {@code "UNKNOWN"} when the
   * cleaned input equals one of them exactly.
   * NOTE(review): these look like cataloguing placeholders for an unknown place or publisher
   * (e.g. "S.l.", "s.n.") -- confirm with the data owners before extending the list.
   */
  public static final List<String> UNKNOWN_PLACE_NAMES = List.of("Miejsce nieznane", "S.l.", "S.I.", "s.l.", "s.l", "S. l.", "S. I.", "n.p.", "s.n.");
1✔
37

38
  private final String translationPlaceNameDictionaryDir;
39
  private final String outputDir;
40

41
  private Map<String, PlaceName> coords;
42
  private Map<String, List<String>> synonyms;
43

44
  private Map<String, Integer> unresolvedPlaceNames = new HashMap<>();
1✔
45
  private Map<status, Integer> statistics = new HashMap<>();
1✔
46

47
  public PlaceNameNormaliser(String translationPlaceNameDictionaryDir,
48
                             String outputDir) {
1✔
49
    this.translationPlaceNameDictionaryDir = translationPlaceNameDictionaryDir;
1✔
50
    this.outputDir = outputDir;
1✔
51
    coords = new HashMap<>();
1✔
52
    synonyms = new HashMap<>();
1✔
53

54
    File errorFile = new File(outputDir, UNRESOLVED_PLACE_NAMES_FILE);
1✔
55
    if (errorFile.exists())
1✔
NEW
56
      errorFile.delete();
×
57

58
    try {
59
      processCoords(readCsvFile("coord.csv"));
1✔
60
      processSynonyms(readCsvFile("place-synonyms-normalized.csv"));
1✔
NEW
61
    } catch (IOException e) {
×
NEW
62
      throw new RuntimeException(e);
×
NEW
63
    } catch (CsvException e) {
×
NEW
64
      throw new RuntimeException(e);
×
65
    }
1✔
66
  }
1✔
67

68
  public Map<String, PlaceName> getCoords() {
69
    return coords;
1✔
70
  }
71

72
  public Map<String, List<String>> getSynonyms() {
73
    return synonyms;
1✔
74
  }
75

76
  private void processSynonyms(List<String[]> rows) {
77
    for (String[] row : rows) {
1✔
78
      synonyms.computeIfAbsent(row[0], k -> new ArrayList<>()).add(row[1]);
1✔
79
    }
1✔
80
  }
1✔
81

82
  public void reportUnresolvedPlaceNames() {
83
    if (unresolvedPlaceNames.isEmpty())
1✔
NEW
84
      return;
×
85
    String summary = String.format("resolved: %d, unresolved: %d\n", statistics.get(RESOLVED), statistics.get(UNRESOLVED));
1✔
86
    String content = unresolvedPlaceNames.entrySet().stream()
1✔
87
      .sorted((o1, o2) -> o2.getValue().compareTo(o1.getValue()))
1✔
88
      .map(e -> e.getKey() + ": " + e.getValue())
1✔
89
      .collect(Collectors.joining("\n"));
1✔
90

91
    try {
92

93
      FileUtils.writeStringToFile(new File(outputDir, UNRESOLVED_PLACE_NAMES_FILE), summary, StandardCharsets.UTF_8, false);
1✔
94
      FileUtils.writeStringToFile(new File(outputDir, UNRESOLVED_PLACE_NAMES_FILE), content, StandardCharsets.UTF_8, true);
1✔
NEW
95
    } catch (IOException e) {
×
NEW
96
      throw new RuntimeException(e);
×
97
    }
1✔
98
  }
1✔
99

100
  private void processCoords(List<String[]> rows) {
101
    for (String[] row : rows) {
1✔
102
      coords.put(row[0], new PlaceName(row));
1✔
103
    }
1✔
104
  }
1✔
105

106
  private List<String[]> readCsvFile(String csvFile) throws IOException, CsvException {
107
    FileReader filereader = new FileReader(new File(translationPlaceNameDictionaryDir, csvFile));
1✔
108
    CSVReader csvReader = new CSVReaderBuilder(filereader).withSkipLines(1).build();
1✔
109
    return csvReader.readAll();
1✔
110
  }
111

112
  public List<PlaceName> normalise(String placeName) {
NEW
113
    return resolve(clean(placeName));
×
114
  }
115

116
  public List<PlaceName> normalise(List<String> placeNames) {
117
    List<PlaceName> normalised = new ArrayList<>();
1✔
118
    for (String placeName : placeNames)
1✔
119
      normalised.addAll(resolve(clean(placeName)));
1✔
120
    return normalised;
1✔
121
  }
122

123
  public String clean(String input) {
124
    String output = input;
1✔
125
    // UTF-8 normalizer
126
    output = Normalizer.normalize(output, Normalizer.Form.NFKC);
1✔
127
    output = output.trim();
1✔
128

129
    // generic
130
    output = output.replaceAll("\\s*[,:;]+\\s*$", "");
1✔
131
    output = output.replaceAll("^\\[(.+)\\]$", "$1");
1✔
132
    output = output.replaceAll("^\\[(.+)$", "$1");
1✔
133
    output = output.replaceAll("^(.+)\\]$", "$1");
1✔
134
    output = output.replaceAll(",$", "");
1✔
135
    output = output.replaceAll("\\?$", "");
1✔
136
    output = output.replaceAll(" \\.\\.\\.$", "");
1✔
137
    output = output.replaceAll(" \\[etc\\.$", "");
1✔
138
    output = output.replaceAll(", cop\\.$", "");
1✔
139

140
    // States
141
    output = output.replaceAll(", (\\[?Ark\\.)$", ", AR");
1✔
142
    output = output.replaceAll(", Arizona$", ", AZ");
1✔
143
    output = output.replaceAll(", (\\[?D\\.C\\.)$", ", DC");
1✔
144
    output = output.replaceAll(", (California|\\[?Calif\\.|Calif)$", ", CA");
1✔
145
    output = output.replaceAll(", (\\[?Colo\\.)$", ", CO");         // Colorado
1✔
146
    output = output.replaceAll(", (Conn\\.|Connecticut|CT, USA)$", ", CT"); // Connecticut
1✔
147
    output = output.replaceAll(", (\\[?Fla\\.)$", ", FL");          // Florida
1✔
148
    output = output.replaceAll(", (Ga\\.)$", ", GA");               // Georgia
1✔
149
    output = output.replaceAll(", \\[?(Illinois|Ill\\.\\.?)$", ", IL");
1✔
150
    output = output.replaceAll(", (Ind\\.|Indiana)$", ", IN"); // Indiana
1✔
151
    output = output.replaceAll(", (\\[?Kans\\.)$", ", KS");
1✔
152
    output = output.replaceAll(", (La\\.)$", ", LA");               // Louisiana
1✔
153
    output = output.replaceAll(", (Massachusetts|\\[?Mass\\.|Ma\\.)$", ", MA"); // Massachusetts
1✔
154
    output = output.replaceAll(", (Michigan|Mich\\.)$", ", MI");
1✔
155
    output = output.replaceAll(", (Maryland|Md\\.|M\\.d\\.)$", ", MD");
1✔
156
    output = output.replaceAll(", (Minn\\.|Minnesota)$", ", MN");
1✔
157
    output = output.replaceAll(", (Miss\\.|Missouri)$", ", MO");            // Missouri
1✔
158
    output = output.replaceAll(", (\\[?Mt\\.|Mont\\.|Mo\\.)$", ", MT");
1✔
159
    output = output.replaceAll(", (\\[?Nebr\\.)$", ", NE");
1✔
160
    output = output.replaceAll(", (N\\. ?J\\.|New Jersey)$", ", NJ");
1✔
161
    output = output.replaceAll(", (\\[?N\\. ?Y\\.|N\\.Y|New York, USA)$", ", NY");
1✔
162
    output = output.replaceAll(", (Ohio)$", ", OH");
1✔
163
    output = output.replaceAll(", (Oklahoma)$", ", OK");
1✔
164
    output = output.replaceAll(", (Oreg\\.|Oregon)$", ", OR");     // Oregon
1✔
165
    output = output.replaceAll(", (\\[?Pa\\.|Pa|Pennsylvania)$", ", PA"); // Pennsylvania
1✔
166
    output = output.replaceAll(", (S\\.C\\.|SC\\.)$", ", SC");
1✔
167
    output = output.replaceAll(", (Tenn\\.)$", ", TN");            // Tennessee
1✔
168
    output = output.replaceAll(", (Texas|Tex\\.)$", ", TX");
1✔
169
    output = output.replaceAll(", (Virginia|\\[?Va\\.)$", ", VA"); // Virginia
1✔
170
    output = output.replaceAll(", (Vermont|Vt\\.)$", ", VT");      // Vermont
1✔
171
    output = output.replaceAll(", (Washington|Wash\\.)$", ", WA"); // Washington
1✔
172
    output = output.replaceAll(", (Wis\\.|Wisc\\.)$", ", WI");
1✔
173
    output = output.replaceAll(", \\[Vic\\.$", ", Vic.");   // Victoria
1✔
174

175
    if (UNKNOWN_PLACE_NAMES.contains(output))
1✔
176
      output = "UNKNOWN";
1✔
177

178
    return output;
1✔
179
  }
180

181
  public List<PlaceName> resolve(String originalNameForm) {
182
    if (coords.containsKey(originalNameForm)) {
1✔
183
      statistics.put(RESOLVED, statistics.computeIfAbsent(RESOLVED, k -> 0) + 1);
1✔
184
      return List.of(coords.get(originalNameForm));
1✔
185
    }
186
    List<PlaceName> placeNames = new ArrayList<>();
1✔
187
    boolean resolved = false;
1✔
188
    if (synonyms.containsKey(originalNameForm)) {
1✔
189
      statistics.put(RESOLVED, statistics.computeIfAbsent(RESOLVED, k -> 0) + 1);
1✔
190
      for (String synonym : synonyms.get(originalNameForm)) {
1✔
191
        placeNames.add(coords.get(synonym));
1✔
192
      }
1✔
193
    } else {
194
      statistics.put(UNRESOLVED, statistics.computeIfAbsent(UNRESOLVED, k -> 0) + 1);
1✔
195
      unresolvedPlaceNames.computeIfAbsent(originalNameForm, k -> 0);
1✔
196
      unresolvedPlaceNames.put(originalNameForm, unresolvedPlaceNames.get(originalNameForm)+1);
1✔
197
      // logger.info("Unresolved place name: " + originalNameForm);
198
    }
199
    return placeNames;
1✔
200
  }
201
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc