14540577447

Committed 18 Apr 2025 07:23PM UTC coverage: 51.439% (-6.2%) from 57.611%

Build # 14540577447

Build Type

push

github

Committed by

ljacqu

Commit Message

Remove EvaluationResult type param from all Evaluator interfaces

Run Details

239 of 546 branches covered (43.77%)

16 of 17 new or added lines in 16 files covered. (94.12%)

193 existing lines in 17 files now uncovered.

679 of 1320 relevant lines covered (51.44%)

3.0 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.37

/src/main/java/ch/jalu/wordeval/evaluators/impl/RepeatedSegment.java

package ch.jalu.wordeval.evaluators.impl;

import ch.jalu.wordeval.dictionary.Word;
import ch.jalu.wordeval.evaluators.WordEvaluator;
import ch.jalu.wordeval.evaluators.result.WordWithKeyAndScore;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;
import lombok.Getter;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Finds segments in words that are repeated multiple times,
 * e.g. 3x "est" in af. "geestestoestand".
 * <p>
 * Only segments of at least two characters are considered, and occurrences of a segment
 * are counted without overlap.
 */
public class RepeatedSegment implements WordEvaluator {

  @Getter
  private final List<WordWithKeyAndScore> results = new ArrayList<>();

  /**
   * Collects all repeated segments of the given word and stores one result entry per segment,
   * using the segment as key and its number of occurrences as score.
   *
   * @param wordObject the word to process
   */
  @Override
  public void evaluate(Word wordObject) {
    String word = wordObject.getLowercase();
    // Named differently from the field on purpose, so the lambda below refers to the field
    Map<String, Integer> repeatedNgrams = new NgramGenerator(word).getResults();
    removeNgramSubsets(repeatedNgrams);
    repeatedNgrams.forEach((ngram, count) -> results.add(new WordWithKeyAndScore(wordObject, ngram, count)));
  }

  /**
   * Removes "subset" results that are covered by larger results. For example, processing the word
   * "geestestoestand" will yield the pairs (3, est), (3, es), (3, st). The last two are "contained"
   * in the first and so are removed.
   *
   * @param results the result to trim (modified in place)
   */
  private static void removeNgramSubsets(Map<String, Integer> results) {
    // Collect first and remove afterwards: the map must not be modified while iterating over it
    Set<String> subsets = new HashSet<>();
    for (Map.Entry<String, Integer> entry : results.entrySet()) {
      Integer count = entry.getValue();
      for (String subset : createNgrams(entry.getKey())) {
        if (Objects.equals(count, results.get(subset))) {
          subsets.add(subset);
        }
      }
    }
    subsets.forEach(results::remove);
  }

  /**
   * Creates all possible n-grams for the given word, i.e. every non-empty substring
   * except the word itself.
   *
   * @param word the word to create n-grams for
   * @return all n-grams of the word, excluding the entire word
   */
  private static List<String> createNgrams(String word) {
    List<String> ngrams = new ArrayList<>();
    for (int start = 0; start < word.length(); ++start) {
      // need to adjust end if start == 0 or else we will also include the entire word
      int end = start == 0 ? word.length() - 1 : word.length();
      for ( ; end > start; --end) {
        ngrams.add(word.substring(start, end));
      }
    }
    return ngrams;
  }

  /**
   * Returns the collected results with the highest scores, grouped by score. Each value has the
   * format {@code rawWord (segment)}.
   * <p>
   * Results are visited from highest to lowest score. Collection stops once a result would
   * introduce more than {@code topScores} distinct scores, or as soon as the number of collected
   * entries reaches {@code maxLimit} (the limit is checked after an entry has been added).
   *
   * @param topScores the maximum number of distinct scores to include
   * @param maxLimit the number of entries after which collection stops
   * @return multimap of score to the formatted entries having that score
   */
  @Override
  public ListMultimap<Object, Object> getTopResults(int topScores, int maxLimit) {
    // todo: Sort better, considering the key length.
    List<WordWithKeyAndScore> sortedResult = results.stream()
        .sorted(Comparator.comparing(WordWithKeyAndScore::getScore).reversed())
        .toList();

    Set<Integer> uniqueValues = new HashSet<>();
    ListMultimap<Object, Object> filteredResults = ArrayListMultimap.create();
    for (WordWithKeyAndScore word : sortedResult) {
      // add() only returns true for a score not seen before; stop when that score is one too many
      if (uniqueValues.add(word.getScore()) && uniqueValues.size() > topScores) {
        break;
      }
      filteredResults.put(word.getScore(), word.getWord().getRaw() + " (" + word.getKey() + ")");
      if (filteredResults.size() >= maxLimit) {
        break;
      }
    }

    return filteredResults;
  }

  /**
   * Counts all n-grams of a word that are between two characters and half the word's length long.
   */
  private static final class NgramGenerator {

    private final String word;
    /** Longest n-gram worth looking at: anything longer than half the word cannot occur twice. */
    private final int maxNgramSize;
    /** Number of occurrences by n-gram, counting overlapping occurrences as well. */
    private final Map<String, Integer> ngramCount;

    public NgramGenerator(String word) {
      this.word = word;
      this.maxNgramSize = word.length() / 2;
      this.ngramCount = new HashMap<>();
      countNgrams();
    }

    /**
     * Returns all n-grams with multiple non-overlapping occurrences. Every call returns a new,
     * modifiable map and leaves the internal counts untouched.
     *
     * @return collection of n-grams occurring multiple times (ngram -> count)
     */
    public Map<String, Integer> getResults() {
      Map<String, Integer> repeatedNgrams = new HashMap<>();
      for (Map.Entry<String, Integer> entry : ngramCount.entrySet()) {
        // The raw count is an upper bound, so n-grams seen only once can be skipped right away
        if (entry.getValue() > 1) {
          int count = countNonOverlappingOccurrences(entry.getKey());
          if (count > 1) {
            repeatedNgrams.put(entry.getKey(), count);
          }
        }
      }
      return repeatedNgrams;
    }

    private void countNgrams() {
      for (int i = 0; i <= word.length() - 2; ++i) {
        createNGramsAtOffset(i);
      }
    }

    /**
     * Registers every n-gram starting at the given offset, from the longest permitted length
     * down to a length of two.
     *
     * @param start the index in the word at which the n-grams start
     */
    private void createNGramsAtOffset(int start) {
      int end = Math.min(word.length(), start + maxNgramSize);
      while (end - start >= 2) {
        ngramCount.merge(word.substring(start, end), 1, Integer::sum);
        --end;
      }
    }

    /**
     * Counts how many times the n-gram occurs in the word without overlapping itself.
     * For instance, "issi" is found at two positions in "mississippi" but the two occurrences
     * overlap, so the result is 1. Likewise, "aa" occurs twice (not three times) in "aaaa".
     * <p>
     * The n-gram is searched for literally: characters that have a special meaning in regular
     * expressions are treated like any other character.
     *
     * @param ngram the n-gram to count
     * @return number of non-overlapping occurrences, scanning the word from left to right
     */
    private int countNonOverlappingOccurrences(String ngram) {
      int count = 0;
      int index = word.indexOf(ngram);
      while (index >= 0) {
        ++count;
        // Resume the search behind the current match so that matches never share characters
        index = word.indexOf(ngram, index + ngram.length());
      }
      return count;
    }
  }
}

1	package ch.jalu.wordeval.evaluators.impl;
2
3	import ch.jalu.wordeval.dictionary.Word;
4	import ch.jalu.wordeval.evaluators.WordEvaluator;
5	import ch.jalu.wordeval.evaluators.result.WordWithKeyAndScore;
6	import com.google.common.collect.ArrayListMultimap;
7	import com.google.common.collect.ListMultimap;
8	import lombok.Getter;
9
10	import java.util.ArrayList;
11	import java.util.Comparator;
12	import java.util.HashMap;
13	import java.util.HashSet;
14	import java.util.List;
15	import java.util.Map;
16	import java.util.Objects;
17	import java.util.Set;
18	import java.util.stream.Collectors;
19
20	/**
21	* Finds segments in words that are repeated multiple times,
22	* e.g. 3x "est" in af. "geestestoestand".
23	*/
24	public class RepeatedSegment implements WordEvaluator {	2✔
25
26	@Getter	6✔
27	private final List<WordWithKeyAndScore> results = new ArrayList<>();
28
29	@Override
30	public void evaluate(Word wordObject) {
31	String word = wordObject.getLowercase();	3✔
32	Map<String, Integer> results = new NgramGenerator(word).getResults();	6✔
33	removeNgramSubsets(results);	2✔
34	results.forEach((ngram, count) -> this.results.add(new WordWithKeyAndScore(wordObject, ngram, count)));	17✔
35	}	1✔
36
37	/**
38	* Removes "subset" results that are covered by larger results. For example, processing the word
39	* "geestestoestand" will yield the pairs (3, est), (3, es), (3, st). The last two are "contained"
40	* in the first and so are removed.
41	*
42	* @param results the result to trim
43	*/
44	private static void removeNgramSubsets(Map<String, Integer> results) {
45	Set<String> subsets = new HashSet<>();	4✔
46	for (Map.Entry<String, Integer> entry : results.entrySet()) {	11✔
47	Integer count = entry.getValue();	4✔
48	createNgrams(entry.getKey()).stream()	8✔
49	.filter(subset -> Objects.equals(count, results.get(subset)))	9✔
50	.forEach(subsets::add);	4✔
51	}	1✔
52	subsets.forEach(results::remove);	7✔
53	}	1✔
54
55	/**
56	* Creates all possible n-grams for the given word.
57	*
58	* @param word the word to create n-grams for
59	* @return constructed of n-grams
60	*/
61	private static List<String> createNgrams(String word) {
62	List<String> ngrams = new ArrayList<>();	4✔
63	for (int start = 0; start < word.length(); ++start) {	8✔
64	// need to adjust end if start == 0 or else we will also include the entire word
65	int end = start == 0 ? word.length() - 1 : word.length();	10✔
66	for ( ; end > start; --end) {	5✔
67	ngrams.add(word.substring(start, end));	7✔
68	}
69	}
70	return ngrams;	2✔
71	}
72
73	@Override
74	public ListMultimap<Object, Object> getTopResults(int topScores, int maxLimit) {
75	// todo: Sort better, considering the key length.
UNCOV 76	List<WordWithKeyAndScore> sortedResult = results.stream()	×
UNCOV 77	.sorted(Comparator.comparing(WordWithKeyAndScore::getScore).reversed())	×
UNCOV 78	.toList();	×
79
UNCOV 80	Set<Integer> uniqueValues = new HashSet<>();	×
UNCOV 81	ListMultimap<Object, Object> filteredResults = ArrayListMultimap.create();	×
UNCOV 82	for (WordWithKeyAndScore word : sortedResult) {	×
UNCOV 83	if (uniqueValues.add(word.getScore()) && uniqueValues.size() > topScores) {	×
UNCOV 84	break;	×
85	}
UNCOV 86	filteredResults.put(word.getScore(), word.getWord().getRaw() + " (" + word.getKey() + ")");	×
UNCOV 87	if (filteredResults.size() >= maxLimit) {	×
UNCOV 88	break;	×
89	}
UNCOV 90	}	×
91
UNCOV 92	return filteredResults;	×
93	}
94
95	/**
96	* Counts all n-grams of a word.
97	*/
98	private static final class NgramGenerator {
99
100	private final String word;
101	private final int maxNgramSize;
102	private final Map<String, Integer> ngramCount;
103
104	public NgramGenerator(String word) {	2✔
105	this.word = word;	3✔
106	this.maxNgramSize = word.length() / 2;	6✔
107	this.ngramCount = new HashMap<>();	5✔
108	countNgrams();	2✔
109	}	1✔
110
111	/**
112	* Returns all n-grams with multiple occurrences.
113	*
114	* @return collection of n-grams occurring multiple times (ngram -> count)
115	*/
116	public Map<String, Integer> getResults() {
117	return ngramCount.entrySet().stream()	7✔
118	.peek(this::adjustCount)	2✔
119	.filter(entry -> entry.getValue() > 1)	13✔
120	.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));	3✔
121	}
122
123	private void countNgrams() {
124	for (int i = 0; i <= word.length() - 2; ++i) {	11✔
125	createNGramsAtOffset(i);	3✔
126	}
127	}	1✔
128
129	private void createNGramsAtOffset(int start) {
130	int end = Math.min(word.length(), start + maxNgramSize);	9✔
131	while (end - start >= 2) {	5✔
132	String ngram = word.substring(start, end);	6✔
133	int count = nullToZero(ngramCount.get(ngram));	7✔
134	ngramCount.put(ngram, ++count);	8✔
135	--end;	1✔
136	}	1✔
137	}	1✔
138
139	/**
140	* Adjusts the count of an n-gram to ensure that it really occurs as many times as counted.
141	* For instance, in "Mississippi" the initial count of "issi" is 2 but they overlap, so it
142	* needs to be corrected to 1.
143	*
144	* @param entry the entry to adjust
145	*/
146	private void adjustCount(Map.Entry<String, Integer> entry) {
147	if (entry.getValue() > 1) {	6✔
148	// int division result -> gets ceil'd automatically
149	int lengthDiff = (word.length() - word.replaceAll(entry.getKey(), "").length())	13✔
150	/ entry.getKey().length();	5✔
151	if (lengthDiff != entry.getValue()) {	6✔
152	// May still not be correct...
153	entry.setValue(entry.getValue() - 1);	10✔
154	}
155	}
156	}	1✔
157
158	private static int nullToZero(Integer i) {
159	return i == null ? 0 : i;	7✔
160	}
161	}
162	}

ljacqu / wordeval / 14540577447

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous