• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

99.57
/src/main/java/org/apache/datasketches/frequencies/LongsSketch.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.frequencies;
21

22
import static org.apache.datasketches.common.Util.LS;
23
import static org.apache.datasketches.common.Util.checkBounds;
24
import static org.apache.datasketches.common.Util.exactLog2OfInt;
25
import static org.apache.datasketches.common.Util.isPowerOf2;
26
import static org.apache.datasketches.frequencies.PreambleUtil.EMPTY_FLAG_MASK;
27
import static org.apache.datasketches.frequencies.PreambleUtil.SER_VER;
28
import static org.apache.datasketches.frequencies.PreambleUtil.extractActiveItems;
29
import static org.apache.datasketches.frequencies.PreambleUtil.extractFamilyID;
30
import static org.apache.datasketches.frequencies.PreambleUtil.extractFlags;
31
import static org.apache.datasketches.frequencies.PreambleUtil.extractLgCurMapSize;
32
import static org.apache.datasketches.frequencies.PreambleUtil.extractLgMaxMapSize;
33
import static org.apache.datasketches.frequencies.PreambleUtil.extractPreLongs;
34
import static org.apache.datasketches.frequencies.PreambleUtil.extractSerVer;
35
import static org.apache.datasketches.frequencies.PreambleUtil.insertActiveItems;
36
import static org.apache.datasketches.frequencies.PreambleUtil.insertFamilyID;
37
import static org.apache.datasketches.frequencies.PreambleUtil.insertFlags;
38
import static org.apache.datasketches.frequencies.PreambleUtil.insertLgCurMapSize;
39
import static org.apache.datasketches.frequencies.PreambleUtil.insertLgMaxMapSize;
40
import static org.apache.datasketches.frequencies.PreambleUtil.insertPreLongs;
41
import static org.apache.datasketches.frequencies.PreambleUtil.insertSerVer;
42
import static org.apache.datasketches.frequencies.Util.LG_MIN_MAP_SIZE;
43
import static org.apache.datasketches.frequencies.Util.SAMPLE_SIZE;
44

45
import java.util.ArrayList;
46
import java.util.Comparator;
47
import java.util.Objects;
48

49
import org.apache.datasketches.common.Family;
50
import org.apache.datasketches.common.SketchesArgumentException;
51
import org.apache.datasketches.common.SketchesStateException;
52
import org.apache.datasketches.common.SuppressFBWarnings;
53
import org.apache.datasketches.memory.Memory;
54
import org.apache.datasketches.memory.WritableMemory;
55

56
/**
57
 * <p>This sketch is useful for tracking approximate frequencies of <i>long</i> items with optional
58
 * associated counts (<i>long</i> item, <i>long</i> count) that are members of a multiset of
59
 * such items. The true frequency of an item is defined to be the sum of associated counts.</p>
60
 *
61
 * <p>This implementation provides the following capabilities:</p>
62
 * <ul>
63
 * <li>Estimate the frequency of an item.</li>
64
 * <li>Return upper and lower bounds of any item, such that the true frequency is always
65
 * between the upper and lower bounds.</li>
66
 * <li>Return a global maximum error that holds for all items in the stream.</li>
67
 * <li>Return an array of frequent items that qualify either a NO_FALSE_POSITIVES or a
68
 * NO_FALSE_NEGATIVES error type.</li>
69
 * <li>Merge itself with another sketch object created from this class.</li>
70
 * <li>Serialize/Deserialize to/from a String or byte array.</li>
71
 * </ul>
72
 *
73
 * <p><b>Space Usage</b></p>
74
 *
75
 * <p>The sketch is initialized with a <i>maxMapSize</i> that specifies the maximum physical
76
 * length of the internal hash map of the form (<i>long</i> item, <i>long</i> count).
77
 * The <i>maxMapSize</i> must be a power of 2.</p>
78
 *
79
 * <p>The hash map starts at a very small size (8 entries), and grows as needed up to the
80
 * specified <i>maxMapSize</i>.</p>
81
 *
82
 * <p>At any moment the internal memory space usage of this sketch is 18 * <i>mapSize</i> bytes,
83
 * plus a small constant number of additional bytes. The maximum internal memory space usage of
84
 * this sketch will never exceed 18 * <i>maxMapSize</i> bytes, plus a small constant number of
85
 * additional bytes.</p>
86
 *
87
 * <p><b>Maximum Capacity of the Sketch</b></p>
88
 *
89
 * <p>The LOAD_FACTOR for the hash map is internally set at 75%,
90
 * which means at any time the map capacity of (item, count) pairs is <i>mapCap</i> =
91
 * 0.75 * <i>mapSize</i>.
92
 * The maximum capacity of (item, count) pairs of the sketch is <i>maxMapCap</i> =
93
 * 0.75 * <i>maxMapSize</i>.</p>
94
 *
95
 * <p><b>Updating the sketch with (item, count) pairs</b></p>
96
 *
97
 * <p>If the item is found in the hash map, the mapped count field (the "counter") is
98
 * incremented by the incoming count, otherwise, a new counter "(item, count) pair" is
99
 * created. If the number of tracked counters reaches the maximum capacity of the hash map
100
 * the sketch decrements all of the counters (by an approximately computed median), and
101
 * removes any non-positive counters.</p>
102
 *
103
 * <p><b>Accuracy</b></p>
104
 *
105
 * <p>If fewer than 0.75 * <i>maxMapSize</i> different items are inserted into the sketch the
106
 * estimated frequencies returned by the sketch will be exact.</p>
107
 *
108
 * <p>The logic of the frequent items sketch is such that the stored counts and true counts are
109
 * never too different.
110
 * More specifically, for any <i>item</i>, the sketch can return an estimate of the
111
 * true frequency of <i>item</i>, along with upper and lower bounds on the frequency
112
 * (that hold deterministically).</p>
113
 *
114
 * <p>For this implementation and for a specific active <i>item</i>, it is guaranteed that
115
 * the true frequency will be between the Upper Bound (UB) and the Lower Bound (LB) computed for
116
 * that <i>item</i>.  Specifically, <i>(UB- LB) &le; W * epsilon</i>, where <i>W</i> denotes the
117
 * sum of all item counts, and <i>epsilon = 3.5/M</i>, where <i>M</i> is the <i>maxMapSize</i>.</p>
118
 *
119
 * <p>This is a worst case guarantee that applies to arbitrary inputs.<sup>1</sup>
120
 * For inputs typically seen in practice <i>(UB-LB)</i> is usually much smaller.
121
 * </p>
122
 *
123
 * <p><b>Background</b></p>
124
 *
125
 * <p>This code implements a variant of what is commonly known as the "Misra-Gries
126
 * algorithm". Variants of it were discovered and rediscovered and redesigned several times
127
 * over the years:</p>
128
 * <ul><li>"Finding repeated elements", Misra, Gries, 1982</li>
129
 * <li>"Frequency estimation of Internet packet streams with limited space" Demaine,
130
 * Lopez-Ortiz, Munro, 2002</li>
131
 * <li>"A simple algorithm for finding frequent elements in streams and bags" Karp, Shenker,
132
 * Papadimitriou, 2003</li>
133
 * <li>"Efficient Computation of Frequent and Top-k Elements in Data Streams" Metwally,
134
 * Agrawal, Abbadi, 2006</li>
135
 * </ul>
136
 *
137
 * <sup>1</sup> For speed we do employ some randomization that introduces a small probability that
138
 * our proof of the worst-case bound might not apply to a given run.  However, we have ensured
139
 * that this probability is extremely small. For example, if the stream causes one table purge
140
 * (rebuild), our proof of the worst case bound applies with probability at least 1 - 1E-14.
141
 * If the stream causes 1E9 purges, our proof applies with probability at least 1 - 1E-5.
142
 *
143
 * @author Justin Thaler
144
 * @author Lee Rhodes
145
 */
146
@SuppressFBWarnings(value = "SIC_INNER_SHOULD_BE_STATIC_ANON", justification = "Harmless, fix later")
147
public class LongsSketch {
148

149
  private static final int STR_PREAMBLE_TOKENS = 6;
150

151
  /**
152
   * Log2 Maximum length of the arrays internal to the hash map supported by the data
153
   * structure.
154
   */
155
  private int lgMaxMapSize;
156

157
  /**
158
   * The current number of counters supported by the hash map.
159
   */
160
  private int curMapCap; //the threshold to purge
161

162
  /**
163
   * Tracks the total of decremented counts.
164
   */
165
  private long offset;
166

167
  /**
168
   * The sum of all frequencies of the stream so far.
169
   */
170
  private long streamWeight = 0;
1✔
171

172
  /**
173
   * The maximum number of samples used to compute approximate median of counters when doing
174
   * decrement
175
   */
176
  private int sampleSize;
177

178
  /**
179
   * Hash map mapping stored items to approximate counts
180
   */
181
  private ReversePurgeLongHashMap hashMap;
182

183
  /**
184
   * Construct this sketch with the parameter maxMapSize and the default initialMapSize (8).
185
   *
186
   * @param maxMapSize Determines the physical size of the internal hash map managed by this
187
   * sketch and must be a power of 2.  The maximum capacity of this internal hash map is
188
   * 0.75 times * maxMapSize. Both the ultimate accuracy and size of this sketch are a
189
   * function of maxMapSize.
190
   */
191
  public LongsSketch(final int maxMapSize) {
192
    this(exactLog2OfInt(maxMapSize, "maxMapSize"), LG_MIN_MAP_SIZE);
1✔
193
  }
1✔
194

195
  /**
196
   * Construct this sketch with parameter lgMapMapSize and lgCurMapSize. This internal
197
   * constructor is used when deserializing the sketch.
198
   *
199
   * @param lgMaxMapSize Log2 of the physical size of the internal hash map managed by this
200
   * sketch. The maximum capacity of this internal hash map is 0.75 times 2^lgMaxMapSize.
201
   * Both the ultimate accuracy and size of this sketch are a function of lgMaxMapSize.
202
   *
203
   * @param lgCurMapSize Log2 of the starting (current) physical size of the internal hash
204
   * map managed by this sketch.
205
   */
206
  LongsSketch(final int lgMaxMapSize, final int lgCurMapSize) {
1✔
207
    //set initial size of hash map
208
    this.lgMaxMapSize = Math.max(lgMaxMapSize, LG_MIN_MAP_SIZE);
1✔
209
    final int lgCurMapSz = Math.max(lgCurMapSize, LG_MIN_MAP_SIZE);
1✔
210
    hashMap = new ReversePurgeLongHashMap(1 << lgCurMapSz);
1✔
211
    curMapCap = hashMap.getCapacity();
1✔
212
    final int maxMapCap =
1✔
213
        (int) ((1 << lgMaxMapSize) * ReversePurgeLongHashMap.getLoadFactor());
1✔
214
    offset = 0;
1✔
215
    sampleSize = Math.min(SAMPLE_SIZE, maxMapCap);
1✔
216
  }
1✔
217

218
  /**
219
   * Returns a sketch instance of this class from the given srcMem,
220
   * which must be a Memory representation of this sketch class.
221
   *
222
   * @param srcMem a Memory representation of a sketch of this class.
223
   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
224
   * @return a sketch instance of this class.
225
   */
226
  public static LongsSketch getInstance(final Memory srcMem) {
227
    Objects.requireNonNull(srcMem, "Source Memory must not be null.");
1✔
228
    final long pre0 = PreambleUtil.checkPreambleSize(srcMem); //check Memory capacity
1✔
229
    final int maxPreLongs = Family.FREQUENCY.getMaxPreLongs();
1✔
230

231
    final int preLongs = extractPreLongs(pre0);         //Byte 0
1✔
232
    final int serVer = extractSerVer(pre0);             //Byte 1
1✔
233
    final int familyID = extractFamilyID(pre0);         //Byte 2
1✔
234
    final int lgMaxMapSize = extractLgMaxMapSize(pre0); //Byte 3
1✔
235
    final int lgCurMapSize = extractLgCurMapSize(pre0); //Byte 4
1✔
236
    final boolean empty = (extractFlags(pre0) & EMPTY_FLAG_MASK) != 0; //Byte 5
1✔
237

238
    // Checks
239
    final boolean preLongsEq1 = (preLongs == 1);        //Byte 0
1✔
240
    final boolean preLongsEqMax = (preLongs == maxPreLongs);
1✔
241
    if (!preLongsEq1 && !preLongsEqMax) {
1✔
242
      throw new SketchesArgumentException(
1✔
243
          "Possible Corruption: PreLongs must be 1 or " + maxPreLongs + ": " + preLongs);
244
    }
245
    if (serVer != SER_VER) {                            //Byte 1
1✔
246
      throw new SketchesArgumentException(
1✔
247
          "Possible Corruption: Ser Ver must be " + SER_VER + ": " + serVer);
248
    }
249
    final int actFamID = Family.FREQUENCY.getID();      //Byte 2
1✔
250
    if (familyID != actFamID) {
1✔
251
      throw new SketchesArgumentException(
1✔
252
          "Possible Corruption: FamilyID must be " + actFamID + ": " + familyID);
253
    }
254
    if (empty ^ preLongsEq1) {                          //Byte 5 and Byte 0
1✔
255
      throw new SketchesArgumentException(
1✔
256
          "Possible Corruption: (PreLongs == 1) ^ Empty == True.");
257
    }
258

259
    if (empty) {
1✔
260
      return new LongsSketch(lgMaxMapSize, LG_MIN_MAP_SIZE);
1✔
261
    }
262
    //get full preamble
263
    final long[] preArr = new long[preLongs];
1✔
264
    srcMem.getLongArray(0, preArr, 0, preLongs);
1✔
265

266
    final LongsSketch fls = new LongsSketch(lgMaxMapSize, lgCurMapSize);
1✔
267
    fls.streamWeight = 0; //update after
1✔
268
    fls.offset = preArr[3];
1✔
269

270
    final int preBytes = preLongs << 3;
1✔
271
    final int activeItems = extractActiveItems(preArr[1]);
1✔
272

273
    //Get countArray
274
    final long[] countArray = new long[activeItems];
1✔
275
    final int reqBytes = preBytes + 2 * activeItems * Long.BYTES; //count Arr + Items Arr
1✔
276
    checkBounds(0, reqBytes, srcMem.getCapacity()); //check Memory capacity
1✔
277
    srcMem.getLongArray(preBytes, countArray, 0, activeItems);
1✔
278

279
    //Get itemArray
280
    final int itemsOffset = preBytes + (Long.BYTES * activeItems);
1✔
281
    final long[] itemArray = new long[activeItems];
1✔
282
    srcMem.getLongArray(itemsOffset, itemArray, 0, activeItems);
1✔
283
    //update the sketch
284
    for (int i = 0; i < activeItems; i++) {
1✔
285
      fls.update(itemArray[i], countArray[i]);
1✔
286
    }
287
    fls.streamWeight = preArr[2]; //override streamWeight due to updating
1✔
288
    return fls;
1✔
289
  }
290

291
  /**
292
   * Returns a sketch instance of this class from the given String,
293
   * which must be a String representation of this sketch class.
294
   *
295
   * @param string a String representation of a sketch of this class.
296
   * @return a sketch instance of this class.
297
   */
298
  public static LongsSketch getInstance(final String string) {
299
    Objects.requireNonNull(string, "string must not be null.");
1✔
300
    final String[] tokens = string.split(",");
1✔
301
    if (tokens.length < (STR_PREAMBLE_TOKENS + 2)) {
1✔
302
      throw new SketchesArgumentException(
1✔
303
          "String not long enough: " + tokens.length);
304
    }
305
    final int serVer  = Integer.parseInt(tokens[0]);
1✔
306
    final int famID   = Integer.parseInt(tokens[1]);
1✔
307
    final int lgMax   = Integer.parseInt(tokens[2]);
1✔
308
    final int flags   = Integer.parseInt(tokens[3]);
1✔
309
    final long streamWt = Long.parseLong(tokens[4]);
1✔
310
    final long offset       = Long.parseLong(tokens[5]); //error offset
1✔
311
    //should always get at least the next 2 from the map
312
    final int numActive = Integer.parseInt(tokens[6]);
1✔
313
    final int lgCur = Integer.numberOfTrailingZeros(Integer.parseInt(tokens[7]));
1✔
314

315
    //checks
316
    if (serVer != SER_VER) {
1✔
317
      throw new SketchesArgumentException("Possible Corruption: Bad SerVer: " + serVer);
1✔
318
    }
319
    Family.FREQUENCY.checkFamilyID(famID);
1✔
320
    final boolean empty = flags > 0;
1✔
321
    if (!empty && (numActive == 0)) {
1✔
322
      throw new SketchesArgumentException(
1✔
323
          "Possible Corruption: !Empty && NumActive=0;  strLen: " + numActive);
324
    }
325
    final int numTokens = tokens.length;
1✔
326
    if ((2 * numActive) != (numTokens - STR_PREAMBLE_TOKENS - 2)) {
1✔
327
      throw new SketchesArgumentException(
1✔
328
          "Possible Corruption: Incorrect # of tokens: " + numTokens
329
            + ", numActive: " + numActive);
330
    }
331

332
    final LongsSketch sketch = new LongsSketch(lgMax, lgCur);
1✔
333
    sketch.streamWeight = streamWt;
1✔
334
    sketch.offset = offset;
1✔
335
    sketch.hashMap = deserializeFromStringArray(tokens);
1✔
336
    return sketch;
1✔
337
  }
338

339
  /**
340
   * Returns the estimated <i>a priori</i> error given the maxMapSize for the sketch and the
341
   * estimatedTotalStreamWeight.
342
   * @param maxMapSize the planned map size to be used when constructing this sketch.
343
   * @param estimatedTotalStreamWeight the estimated total stream weight.
344
   * @return the estimated <i>a priori</i> error.
345
   */
346
  public static double getAprioriError(final int maxMapSize, final long estimatedTotalStreamWeight) {
347
    return getEpsilon(maxMapSize) * estimatedTotalStreamWeight;
1✔
348
  }
349

350
  /**
351
   * Returns the current number of counters the sketch is configured to support.
352
   *
353
   * @return the current number of counters the sketch is configured to support.
354
   */
355
  public int getCurrentMapCapacity() {
356
    return curMapCap;
1✔
357
  }
358

359
  /**
360
   * Returns epsilon used to compute <i>a priori</i> error.
361
   * This is just the value <i>3.5 / maxMapSize</i>.
362
   * @param maxMapSize the planned map size to be used when constructing this sketch.
363
   * @return epsilon used to compute <i>a priori</i> error.
364
   */
365
  public static double getEpsilon(final int maxMapSize) {
366
    if (!isPowerOf2(maxMapSize)) {
1✔
367
      throw new SketchesArgumentException("maxMapSize is not a power of 2.");
1✔
368
    }
369
    return 3.5 / maxMapSize;
1✔
370
  }
371

372
  /**
373
   * Gets the estimate of the frequency of the given item.
374
   * Note: The true frequency of a item would be the sum of the counts as a result of the
375
   * two update functions.
376
   *
377
   * @param item the given item
378
   * @return the estimate of the frequency of the given item
379
   */
380
  public long getEstimate(final long item) {
381
    // If item is tracked:
382
    // Estimate = itemCount + offset; Otherwise it is 0.
383
    final long itemCount = hashMap.get(item);
1✔
384
    return (itemCount > 0) ? itemCount + offset : 0;
1✔
385
  }
386

387
  /**
388
   * Gets the guaranteed lower bound frequency of the given item, which can never be
389
   * negative.
390
   *
391
   * @param item the given item.
392
   * @return the guaranteed lower bound frequency of the given item. That is, a number which
393
   * is guaranteed to be no larger than the real frequency.
394
   */
395
  public long getLowerBound(final long item) {
396
    //LB = itemCount or 0
397
    return hashMap.get(item);
1✔
398
  }
399

400
  /**
401
   * Returns an array of Rows that include frequent items, estimates, upper and lower bounds
402
   * given a threshold and an ErrorCondition. If the threshold is lower than getMaximumError(),
403
   * then getMaximumError() will be used instead.
404
   *
405
   * <p>The method first examines all active items in the sketch (items that have a counter).
406
   *
407
   * <p>If <i>ErrorType = NO_FALSE_NEGATIVES</i>, this will include an item in the result
408
   * list if getUpperBound(item) &gt; threshold.
409
   * There will be no false negatives, i.e., no Type II error.
410
   * There may be items in the set with true frequencies less than the threshold
411
   * (false positives).</p>
412
   *
413
   * <p>If <i>ErrorType = NO_FALSE_POSITIVES</i>, this will include an item in the result
414
   * list if getLowerBound(item) &gt; threshold.
415
   * There will be no false positives, i.e., no Type I error.
416
   * There may be items omitted from the set with true frequencies greater than the
417
   * threshold (false negatives). This is a subset of the NO_FALSE_NEGATIVES case.</p>
418
   *
419
   * @param threshold to include items in the result list
420
   * @param errorType determines whether no false positives or no false negatives are
421
   * desired.
422
   * @return an array of frequent items
423
   */
424
  public Row[] getFrequentItems(final long threshold, final ErrorType errorType) {
425
    return sortItems(threshold > getMaximumError() ? threshold : getMaximumError(), errorType);
1✔
426
  }
427

428
  /**
429
   * Returns an array of Rows that include frequent items, estimates, upper and lower bounds
430
   * given an ErrorCondition and the default threshold.
431
   * This is the same as getFrequentItems(getMaximumError(), errorType)
432
   *
433
   * @param errorType determines whether no false positives or no false negatives are
434
   * desired.
435
   * @return an array of frequent items
436
   */
437
  public Row[] getFrequentItems(final ErrorType errorType) {
438
    return sortItems(getMaximumError(), errorType);
1✔
439
  }
440

441
  /**
442
   * @return An upper bound on the maximum error of getEstimate(item) for any item.
443
   * This is equivalent to the maximum distance between the upper bound and the lower bound
444
   * for any item.
445
   */
446
  public long getMaximumError() {
447
    return offset;
1✔
448
  }
449

450
  /**
451
   * Returns the maximum number of counters the sketch is configured to support.
452
   *
453
   * @return the maximum number of counters the sketch is configured to support.
454
   */
455
  public int getMaximumMapCapacity() {
456
    return (int) ((1 << lgMaxMapSize) * ReversePurgeLongHashMap.getLoadFactor());
1✔
457
  }
458

459
  /**
460
   * @return the number of active items in the sketch.
461
   */
462
  public int getNumActiveItems() {
463
    return hashMap.getNumActive();
1✔
464
  }
465

466
  /**
467
   * Returns the number of bytes required to store this sketch as an array of bytes.
468
   *
469
   * @return the number of bytes required to store this sketch as an array of bytes.
470
   */
471
  public int getStorageBytes() {
472
    if (isEmpty()) { return 8; }
1✔
473
    return (4 * 8) + (16 * getNumActiveItems());
1✔
474
  }
475

476
  /**
477
   * Returns the sum of the frequencies (weights or counts) in the stream seen so far by the sketch
478
   *
479
   * @return the sum of the frequencies in the stream seen so far by the sketch
480
   */
481
  public long getStreamLength() {
482
    return streamWeight;
1✔
483
  }
484

485
  /**
486
   * Gets the guaranteed upper bound frequency of the given item.
487
   *
488
   * @param item the given item
489
   * @return the guaranteed upper bound frequency of the given item. That is, a number which
490
   * is guaranteed to be no smaller than the real frequency.
491
   */
492
  public long getUpperBound(final long item) {
493
    // UB = itemCount + offset
494
    return hashMap.get(item) + offset;
1✔
495
  }
496

497
  /**
498
   * Returns true if this sketch is empty
499
   *
500
   * @return true if this sketch is empty
501
   */
502
  public boolean isEmpty() {
503
    return getNumActiveItems() == 0;
1✔
504
  }
505

506
  /**
507
   * This function merges the other sketch into this one.
508
   * The other sketch may be of a different size.
509
   *
510
   * @param other sketch of this class
511
   * @return a sketch whose estimates are within the guarantees of the
512
   * largest error tolerance of the two merged sketches.
513
   */
514
  public LongsSketch merge(final LongsSketch other) {
515
    if (other == null) { return this; }
1✔
516
    if (other.isEmpty()) { return this; }
1✔
517

518
    final long streamWt = streamWeight + other.streamWeight; //capture before merge
1✔
519

520
    final ReversePurgeLongHashMap.Iterator iter = other.hashMap.iterator();
1✔
521
    while (iter.next()) { //this may add to offset during rebuilds
1✔
522
      this.update(iter.getKey(), iter.getValue());
1✔
523
    }
524
    offset += other.offset;
1✔
525
    streamWeight = streamWt; //corrected streamWeight
1✔
526
    return this;
1✔
527
  }
528

529
  /**
530
   * Resets this sketch to a virgin state.
531
   */
532
  public void reset() {
533
    hashMap = new ReversePurgeLongHashMap(1 << LG_MIN_MAP_SIZE);
1✔
534
    curMapCap = hashMap.getCapacity();
1✔
535
    offset = 0;
1✔
536
    streamWeight = 0;
1✔
537
  }
1✔
538

539
  //Serialization
540

541
  /**
542
   * Returns a String representation of this sketch
543
   *
544
   * @return a String representation of this sketch
545
   */
546
  public String serializeToString() {
547
    final StringBuilder sb = new StringBuilder();
1✔
548
    //start the string with parameters of the sketch
549
    final int serVer = SER_VER;                 //0
1✔
550
    final int famID = Family.FREQUENCY.getID(); //1
1✔
551
    final int lgMaxMapSz = lgMaxMapSize;        //2
1✔
552
    final int flags = (hashMap.getNumActive() == 0) ? EMPTY_FLAG_MASK : 0; //3
1✔
553
    final String fmt = "%d,%d,%d,%d,%d,%d,";
1✔
554
    final String s =
1✔
555
        String.format(fmt, serVer, famID, lgMaxMapSz, flags, streamWeight, offset);
1✔
556
    sb.append(s);
1✔
557
    sb.append(hashMap.serializeToString()); //numActive, curMaplen, key[i], value[i], ...
1✔
558
    // maxMapCap, samplesize are deterministic functions of maxMapSize,
559
    //  so we don't need them in the serialization
560
    return sb.toString();
1✔
561
  }
562

563
  /**
564
   * Returns a byte array representation of this sketch
565
   * @return a byte array representation of this sketch
566
   */
567
  public byte[] toByteArray() {
568
    final int preLongs, outBytes;
569
    final boolean empty = isEmpty();
1✔
570
    final int activeItems = getNumActiveItems();
1✔
571
    if (empty) {
1✔
572
      preLongs = 1;
1✔
573
      outBytes = 8;
1✔
574
    } else {
575
      preLongs = Family.FREQUENCY.getMaxPreLongs(); //4
1✔
576
      outBytes = (preLongs + (2 * activeItems)) << 3; //2 because both keys and values are longs
1✔
577
    }
578
    final byte[] outArr = new byte[outBytes];
1✔
579
    final WritableMemory mem = WritableMemory.writableWrap(outArr);
1✔
580

581
    // build first preLong empty or not
582
    long pre0 = 0L;
1✔
583
    pre0 = insertPreLongs(preLongs, pre0);                  //Byte 0
1✔
584
    pre0 = insertSerVer(SER_VER, pre0);                     //Byte 1
1✔
585
    pre0 = insertFamilyID(Family.FREQUENCY.getID(), pre0);  //Byte 2
1✔
586
    pre0 = insertLgMaxMapSize(lgMaxMapSize, pre0);          //Byte 3
1✔
587
    pre0 = insertLgCurMapSize(hashMap.getLgLength(), pre0); //Byte 4
1✔
588
    pre0 = (empty) ? insertFlags(EMPTY_FLAG_MASK, pre0) : insertFlags(0, pre0); //Byte 5
1✔
589

590
    if (empty) {
1✔
591
      mem.putLong(0, pre0);
1✔
592
    } else {
593
      final long pre = 0;
1✔
594
      final long[] preArr = new long[preLongs];
1✔
595
      preArr[0] = pre0;
1✔
596
      preArr[1] = insertActiveItems(activeItems, pre);
1✔
597
      preArr[2] = streamWeight;
1✔
598
      preArr[3] = offset;
1✔
599
      mem.putLongArray(0, preArr, 0, preLongs);
1✔
600
      final int preBytes = preLongs << 3;
1✔
601
      mem.putLongArray(preBytes, hashMap.getActiveValues(), 0, activeItems);
1✔
602

603
      mem.putLongArray(preBytes + (activeItems << 3), hashMap.getActiveKeys(), 0,
1✔
604
          activeItems);
605
    }
606
    return outArr;
1✔
607
  }
608

609
  /**
610
   * Returns a human readable summary of this sketch.
611
   * @return a human readable summary of this sketch.
612
   */
613
  @Override
614
  public String toString() {
615
    final StringBuilder sb = new StringBuilder();
1✔
616
    sb.append("FrequentLongsSketch:").append(LS);
1✔
617
    sb.append("  Stream Length    : " + streamWeight).append(LS);
1✔
618
    sb.append("  Max Error Offset : " + offset).append(LS);
1✔
619
    sb.append(hashMap.toString());
1✔
620
    return sb.toString();
1✔
621
  }
622

623
  /**
624
   * Returns a human readable string of the preamble of a byte array image of a LongsSketch.
625
   * @param byteArr the given byte array
626
   * @return a human readable string of the preamble of a byte array image of a LongsSketch.
627
   */
628
  public static String toString(final byte[] byteArr) {
629
    return toString(Memory.wrap(byteArr));
1✔
630
  }
631

632
  /**
633
   * Returns a human readable string of the preamble of a Memory image of a LongsSketch.
634
   * @param mem the given Memory object
635
   * @return  a human readable string of the preamble of a Memory image of a LongsSketch.
636
   */
637
  public static String toString(final Memory mem) {
638
    return PreambleUtil.preambleToString(mem);
1✔
639
  }
640

641
  /**
642
   * Update this sketch with an item and a frequency count of one.
643
   * @param item for which the frequency should be increased.
644
   */
645
  public void update(final long item) {
646
    update(item, 1);
1✔
647
  }
1✔
648

649
  /**
650
   * Update this sketch with a item and a positive frequency count (or weight).
651
   * @param item for which the frequency should be increased. The item can be any long value
652
   * and is only used by the sketch to determine uniqueness.
653
   * @param count the amount by which the frequency of the item should be increased.
654
   * An count of zero is a no-op, and a negative count will throw an exception.
655
   */
656
  public void update(final long item, final long count) {
657
    if (count == 0) { return; }
1✔
658
    if (count < 0) {
1✔
659
      throw new SketchesArgumentException("Count may not be negative");
1✔
660
    }
661
    streamWeight += count;
1✔
662
    hashMap.adjustOrPutValue(item, count);
1✔
663

664
    if (getNumActiveItems() > curMapCap) { //over the threshold, we need to do something
1✔
665
      if (hashMap.getLgLength() < lgMaxMapSize) { //below tgt size, we can grow
1✔
666
        hashMap.resize(2 * hashMap.getLength());
1✔
667
        curMapCap = hashMap.getCapacity();
1✔
668
      } else { //At tgt size, must purge
669
        offset += hashMap.purge(sampleSize);
1✔
670
        if (getNumActiveItems() > getMaximumMapCapacity()) {
1✔
671
          throw new SketchesStateException("Purge did not reduce active items.");
×
672
        }
673
      }
674
    }
675
  }
1✔
676

677
  /**
678
   * Row class that defines the return values from a getFrequentItems query.
679
   */
680
  public static class Row implements Comparable<Row> {
681
    final long item;
682
    final long est;
683
    final long ub;
684
    final long lb;
685
    private static final String fmt =  ("  %20d%20d%20d %d");
686
    private static final String hfmt = ("  %20s%20s%20s %s");
687

688
    Row(final long item, final long estimate, final long ub, final long lb) {
1✔
689
      this.item = item;
1✔
690
      est = estimate;
1✔
691
      this.ub = ub;
1✔
692
      this.lb = lb;
1✔
693
    }
1✔
694

695
    /**
696
     * @return item of type T
697
     */
698
    public long getItem() { return item; }
1✔
699

700
    /**
701
     * @return the estimate
702
     */
703
    public long getEstimate() { return est; }
1✔
704

705
    /**
706
     * @return the upper bound
707
     */
708
    public long getUpperBound() { return ub; }
1✔
709

710
    /**
711
     * @return return the lower bound
712
     */
713
    public long getLowerBound() { return lb; }
1✔
714

715
    /**
716
     * @return the descriptive row header
717
     */
718
    public static String getRowHeader() {
719
      return String.format(hfmt,"Est", "UB", "LB", "Item");
1✔
720
    }
721

722
    @Override
723
    public String toString() {
724
      return String.format(fmt, est, ub, lb, item);
1✔
725
    }
726

727
    /**
728
     * This compareTo is strictly limited to the Row.getEstimate() value and does not imply any
729
     * ordering whatsoever to the other elements of the row: item and upper and lower bounds.
730
     * Defined this way, this compareTo will be consistent with hashCode() and equals(Object).
731
     * @param that the other row to compare to.
732
     * @return a negative integer, zero, or a positive integer as this.getEstimate() is less than,
733
     * equal to, or greater than that.getEstimate().
734
     */
735
    @Override
736
    public int compareTo(final Row that) {
737
      return (est < that.est) ? -1 : (est > that.est) ? 1 : 0;
1✔
738
    }
739

740
    /**
741
     * This hashCode is computed only from the Row.getEstimate() value.
742
     * Defined this way, this hashCode will be consistent with equals(Object):<br>
743
     * If (x.equals(y)) implies: x.hashCode() == y.hashCode().<br>
744
     * If (!x.equals(y)) does NOT imply: x.hashCode() != y.hashCode().
745
     * @return the hashCode computed from getEstimate().
746
     */
747
    @Override
748
    public int hashCode() {
749
      final int prime = 31;
1✔
750
      int result = 1;
1✔
751
      result = (prime * result) + (int) (est ^ (est >>> 32));
1✔
752
      return result;
1✔
753
    }
754

755
    /**
756
     * This equals is computed only from the Row.getEstimate() value and does not imply equality
757
     * of the other items within the row: item and upper and lower bounds.
758
     * Defined this way, this equals will be consistent with compareTo(Row).
759
     * @param obj the other row to determine equality with.
760
     * @return true if this.getEstimate() equals ((Row)obj).getEstimate().
761
     */
762
    @Override
763
    public boolean equals(final Object obj) {
764
      if (this == obj) { return true; }
1✔
765
      if (obj == null) { return false; }
1✔
766
      if ( !(obj instanceof Row)) { return false; }
1✔
767
      final Row that = (Row) obj;
1✔
768
      if (est != that.est) { return false; }
1✔
769
      return true;
1✔
770
    }
771

772
  } // End of class Row
773

774
  Row[] sortItems(final long threshold, final ErrorType errorType) {
775
    final ArrayList<Row> rowList = new ArrayList<>();
1✔
776
    final ReversePurgeLongHashMap.Iterator iter = hashMap.iterator();
1✔
777
    if (errorType == ErrorType.NO_FALSE_NEGATIVES) {
1✔
778
      while (iter.next()) {
1✔
779
        final long est = getEstimate(iter.getKey());
1✔
780
        final long ub = getUpperBound(iter.getKey());
1✔
781
        final long lb = getLowerBound(iter.getKey());
1✔
782
        if (ub >= threshold) {
1✔
783
          final Row row = new Row(iter.getKey(), est, ub, lb);
1✔
784
          rowList.add(row);
1✔
785
        }
786
      }
1✔
787
    } else { //NO_FALSE_POSITIVES
788
      while (iter.next()) {
1✔
789
        final long est = getEstimate(iter.getKey());
1✔
790
        final long ub = getUpperBound(iter.getKey());
1✔
791
        final long lb = getLowerBound(iter.getKey());
1✔
792
        if (lb >= threshold) {
1✔
793
          final Row row = new Row(iter.getKey(), est, ub, lb);
1✔
794
          rowList.add(row);
1✔
795
        }
796
      }
1✔
797
    }
798

799
    // descending order
800
    rowList.sort(new Comparator<Row>() {
1✔
801
      @Override
802
      public int compare(final Row r1, final Row r2) {
803
        return r2.compareTo(r1);
1✔
804
      }
805
    });
806

807
    final Row[] rowsArr = rowList.toArray(new Row[rowList.size()]);
1✔
808
    return rowsArr;
1✔
809
  }
810

811
  /**
812
   * Deserializes an array of String tokens into a hash map object of this class.
813
   *
814
   * @param tokens the given array of Strings tokens.
815
   * @return a hash map object of this class
816
   */
817
  static ReversePurgeLongHashMap deserializeFromStringArray(final String[] tokens) {
818
    final int ignore = STR_PREAMBLE_TOKENS;
1✔
819
    final int numActive = Integer.parseInt(tokens[ignore]);
1✔
820
    final int length = Integer.parseInt(tokens[ignore + 1]);
1✔
821
    final ReversePurgeLongHashMap hashMap = new ReversePurgeLongHashMap(length);
1✔
822
    int j = 2 + ignore;
1✔
823
    for (int i = 0; i < numActive; i++) {
1✔
824
      final long key = Long.parseLong(tokens[j++]);
1✔
825
      final long value = Long.parseLong(tokens[j++]);
1✔
826
      hashMap.adjustOrPutValue(key, value);
1✔
827
    }
828
    return hashMap;
1✔
829
  }
830

831
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc