• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

98.69
/src/main/java/org/apache/datasketches/cpc/CpcSketch.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.cpc;
21

22
import static java.lang.Math.log;
23
import static java.lang.Math.sqrt;
24
import static java.nio.charset.StandardCharsets.UTF_8;
25
import static org.apache.datasketches.common.Util.LS;
26
import static org.apache.datasketches.common.Util.invPow2;
27
import static org.apache.datasketches.common.Util.zeroPad;
28
import static org.apache.datasketches.cpc.CpcUtil.bitMatrixOfSketch;
29
import static org.apache.datasketches.cpc.CpcUtil.checkLgK;
30
import static org.apache.datasketches.cpc.CpcUtil.countBitsSetInMatrix;
31
import static org.apache.datasketches.hash.MurmurHash3.hash;
32

33
import java.nio.ByteBuffer;
34
import java.util.Arrays;
35

36
import org.apache.datasketches.common.Family;
37
import org.apache.datasketches.memory.Memory;
38
import org.apache.datasketches.memory.WritableMemory;
39
import org.apache.datasketches.thetacommon.ThetaUtil;
40

41
/**
42
 * This is a unique-counting sketch that implements the
43
 * <i>Compressed Probabilistic Counting (CPC, a.k.a FM85)</i> algorithms developed by Kevin Lang in
44
 * his paper
45
 * <a href="https://arxiv.org/abs/1708.06839">Back to the Future: an Even More Nearly
46
 * Optimal Cardinality Estimation Algorithm</a>.
47
 *
48
 * <p>This sketch is extremely space-efficient when serialized. In an apples-to-apples empirical
49
 * comparison against compressed HyperLogLog sketches, this new algorithm simultaneously wins on
50
 * the two dimensions of the space/accuracy tradeoff and produces sketches that are
51
 * smaller than the entropy of HLL, so no possible implementation of compressed HLL can match its
52
 * space efficiency for a given accuracy. As described in the paper this sketch implements a newly
53
 * developed ICON estimator algorithm that survives unioning operations, another
54
 * well-known estimator, the
55
 * <a href="https://arxiv.org/abs/1306.3284">Historical Inverse Probability (HIP)</a> estimator
56
 * does not.
57
 * The update speed performance of this sketch is quite fast and is comparable to the speed of HLL.
58
 * The unioning (merging) capability of this sketch also allows for merging of sketches with
59
 * different configurations of K.
60
 *
61
 * <p>For additional security this sketch can be configured with a user-specified hash seed.
62
 *
63
 * @author Lee Rhodes
64
 * @author Kevin Lang
65
 */
66
public final class CpcSketch {
67
  private static final double[] kxpByteLookup = new double[256];
1✔
68

69
  /**
70
   * The default Log_base2 of K
71
   */
72
  public static final int DEFAULT_LG_K = 11;
73
  final long seed;
74
  //common variables
75
  final int lgK;
76
  long numCoupons;      // The number of coupons collected so far.
77
  boolean mergeFlag;    // Is the sketch the result of merging?
78
  int fiCol; // First Interesting Column. This is part of a speed optimization.
79

80
  int windowOffset;
81
  byte[] slidingWindow; //either null or size K bytes
82
  PairTable pairTable; //for sparse and surprising values, either null or variable size
83

84
  //The following variables are only valid in HIP varients
85
  double kxp;                  //used with HIP
86
  double hipEstAccum;          //used with HIP
87

88
  /**
89
   * Constructor with default log_base2 of k
90
   */
91
  public CpcSketch() {
92
    this(DEFAULT_LG_K, ThetaUtil.DEFAULT_UPDATE_SEED);
×
93
  }
×
94

95
  /**
96
   * Constructor with log_base2 of k.
97
   * @param lgK the given log_base2 of k
98
   */
99
  public CpcSketch(final int lgK) {
100
    this(lgK, ThetaUtil.DEFAULT_UPDATE_SEED);
1✔
101
  }
1✔
102

103
  /**
104
   * Constructor with log_base2 of k and seed.
105
   * @param lgK the given log_base2 of k
106
   * @param seed the given seed
107
   */
108
  public CpcSketch(final int lgK, final long seed) {
1✔
109
    checkLgK(lgK);
1✔
110
    this.lgK = (byte) lgK;
1✔
111
    this.seed = seed;
1✔
112
    kxp = 1 << lgK;
1✔
113
    reset();
1✔
114
  }
1✔
115

116
  /**
117
   * Returns a copy of this sketch
118
   * @return a copy of this sketch
119
   */
120
  CpcSketch copy() {
121
    final CpcSketch copy = new CpcSketch(lgK, seed);
1✔
122
    copy.numCoupons = numCoupons;
1✔
123
    copy.mergeFlag = mergeFlag;
1✔
124
    copy.fiCol = fiCol;
1✔
125

126
    copy.windowOffset = windowOffset;
1✔
127
    copy.slidingWindow = (slidingWindow == null) ? null : slidingWindow.clone();
1✔
128
    copy.pairTable = (pairTable == null) ? null : pairTable.copy();
1✔
129

130
    copy.kxp = kxp;
1✔
131
    copy.hipEstAccum = hipEstAccum;
1✔
132
    return copy;
1✔
133
  }
134

135
  /**
136
   * Returns the best estimate of the cardinality of the sketch.
137
   * @return the best estimate of the cardinality of the sketch.
138
   */
139
  public double getEstimate() {
140
    if (mergeFlag) { return IconEstimator.getIconEstimate(lgK, numCoupons); }
1✔
141
    return hipEstAccum;
1✔
142
  }
143

144
  /**
145
   * Return the DataSketches identifier for this CPC family of sketches.
146
   * @return the DataSketches identifier for this CPC family of sketches.
147
   */
148
  public static Family getFamily() {
149
    return Family.CPC;
1✔
150
  }
151

152
  /**
153
   * Return the parameter LgK.
154
   * @return the parameter LgK.
155
   */
156
  public int getLgK() {
157
    return lgK;
1✔
158
  }
159

160
  /**
161
   * Returns the best estimate of the lower bound of the confidence interval given <i>kappa</i>,
162
   * the number of standard deviations from the mean.
163
   * @param kappa the given number of standard deviations from the mean: 1, 2 or 3.
164
   * @return the best estimate of the lower bound of the confidence interval given <i>kappa</i>.
165
   */
166
  public double getLowerBound(final int kappa) {
167
    if (mergeFlag) {
1✔
168
      return CpcConfidence.getIconConfidenceLB(lgK, numCoupons, kappa);
1✔
169
    }
170
    return CpcConfidence.getHipConfidenceLB(lgK, numCoupons, hipEstAccum, kappa);
1✔
171
  }
172

173
  /*
174
   * These empirical values for the 99.9th percentile of size in bytes were measured using 100,000
175
   * trials. The value for each trial is the maximum of 5*16=80 measurements that were equally
176
   * spaced over values of the quantity C/K between 3.0 and 8.0.  This table does not include the
177
   * worst-case space for the preamble, which is added by the function.
178
   */
179
  private static final int empiricalSizeMaxLgK = 19;
180
  private static final int[] empiricalMaxBytes  = {
1✔
181
      24,     // lgK = 4
182
      36,     // lgK = 5
183
      56,     // lgK = 6
184
      100,    // lgK = 7
185
      180,    // lgK = 8
186
      344,    // lgK = 9
187
      660,    // lgK = 10
188
      1292,   // lgK = 11
189
      2540,   // lgK = 12
190
      5020,   // lgK = 13
191
      9968,   // lgK = 14
192
      19836,  // lgK = 15
193
      39532,  // lgK = 16
194
      78880,  // lgK = 17
195
      157516, // lgK = 18
196
      314656  // lgK = 19
197
  };
198
  private static final double empiricalMaxSizeFactor = 0.6; // 0.6 = 4.8 / 8.0
199
  private static final int maxPreambleSizeBytes = 40;
200

201
  /**
202
   * The actual size of a compressed CPC sketch has a small random variance, but the following
203
   * empirically measured size should be large enough for at least 99.9 percent of sketches.
204
   *
205
   * <p>For small values of <i>n</i> the size can be much smaller.
206
   *
207
   * @param lgK the given value of lgK.
208
   * @return the estimated maximum compressed serialized size of a sketch.
209
   */
210
  public static int getMaxSerializedBytes(final int lgK) {
211
    checkLgK(lgK);
1✔
212
    if (lgK <= empiricalSizeMaxLgK) { return empiricalMaxBytes[lgK - CpcUtil.minLgK] + maxPreambleSizeBytes; }
1✔
213
    final int k = 1 << lgK;
1✔
214
    return (int) (empiricalMaxSizeFactor * k) + maxPreambleSizeBytes;
1✔
215
  }
216

217
  /**
218
   * Returns the best estimate of the upper bound of the confidence interval given <i>kappa</i>,
219
   * the number of standard deviations from the mean.
220
   * @param kappa the given number of standard deviations from the mean: 1, 2 or 3.
221
   * @return the best estimate of the upper bound of the confidence interval given <i>kappa</i>.
222
   */
223
  public double getUpperBound(final int kappa) {
224
    if (mergeFlag) {
1✔
225
      return CpcConfidence.getIconConfidenceUB(lgK, numCoupons, kappa);
1✔
226
    }
227
    return CpcConfidence.getHipConfidenceUB(lgK, numCoupons, hipEstAccum, kappa);
1✔
228
  }
229

230
  /**
231
   * Return the given Memory as a CpcSketch on the Java heap using the DEFAULT_UPDATE_SEED.
232
   * @param mem the given Memory
233
   * @return the given Memory as a CpcSketch on the Java heap.
234
   */
235
  public static CpcSketch heapify(final Memory mem) {
236
    return heapify(mem, ThetaUtil.DEFAULT_UPDATE_SEED);
1✔
237
  }
238

239
  /**
240
   * Return the given byte array as a CpcSketch on the Java heap using the DEFAULT_UPDATE_SEED.
241
   * @param byteArray the given byte array
242
   * @return the given byte array as a CpcSketch on the Java heap.
243
   */
244
  public static CpcSketch heapify(final byte[] byteArray) {
245
    return heapify(byteArray, ThetaUtil.DEFAULT_UPDATE_SEED);
×
246
  }
247

248
  /**
249
   * Return the given Memory as a CpcSketch on the Java heap.
250
   * @param mem the given Memory
251
   * @param seed the seed used to create the original sketch from which the Memory was derived.
252
   * @return the given Memory as a CpcSketch on the Java heap.
253
   */
254
  public static CpcSketch heapify(final Memory mem, final long seed) {
255
    final CompressedState state = CompressedState.importFromMemory(mem);
1✔
256
    return uncompress(state, seed);
1✔
257
  }
258

259
  /**
260
   * Return the given byte array as a CpcSketch on the Java heap.
261
   * @param byteArray the given byte array
262
   * @param seed the seed used to create the original sketch from which the byte array was derived.
263
   * @return the given byte array as a CpcSketch on the Java heap.
264
   */
265
  public static CpcSketch heapify(final byte[] byteArray, final long seed) {
266
    final Memory mem = Memory.wrap(byteArray);
1✔
267
    return heapify(mem, seed);
1✔
268
  }
269

270
  /**
271
   * Return true if this sketch is empty
272
   * @return true if this sketch is empty
273
   */
274
  public boolean isEmpty() {
275
    return numCoupons == 0;
1✔
276
  }
277

278
  /**
279
   * Resets this sketch to empty but retains the original LgK and Seed.
280
   */
281
  public final void reset() {
282
    numCoupons = 0;
1✔
283
    mergeFlag = false;
1✔
284
    fiCol = 0;
1✔
285

286
    windowOffset = 0;
1✔
287
    slidingWindow = null;
1✔
288
    pairTable = null;
1✔
289

290
    kxp = 1 << lgK;
1✔
291
    hipEstAccum = 0;
1✔
292
  }
1✔
293

294
  /**
295
   * Return this sketch as a compressed byte array.
296
   * @return this sketch as a compressed byte array.
297
   */
298
  public byte[] toByteArray() {
299
    final CompressedState state = CompressedState.compress(this);
1✔
300
    final long cap = state.getRequiredSerializedBytes();
1✔
301
    final WritableMemory wmem = WritableMemory.allocate((int) cap);
1✔
302
    state.exportToMemory(wmem);
1✔
303
    return (byte[]) wmem.getArray();
1✔
304
  }
305

306
  /**
307
   * Present the given long as a potential unique item.
308
   *
309
   * @param datum The given long datum.
310
   */
311
  public void update(final long datum) {
312
    final long[] data = { datum };
1✔
313
    final long[] arr = hash(data, seed);
1✔
314
    hashUpdate(arr[0], arr[1]);
1✔
315
  }
1✔
316

317
  /**
318
   * Present the given double (or float) datum as a potential unique item.
319
   * The double will be converted to a long using Double.doubleToLongBits(datum),
320
   * which normalizes all NaN values to a single NaN representation.
321
   * Plus and minus zero will be normalized to plus zero.
322
   * The special floating-point values NaN and +/- Infinity are treated as distinct.
323
   *
324
   * @param datum The given double datum.
325
   */
326
  public void update(final double datum) {
327
    final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0
1✔
328
    final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN forms
1✔
329
    final long[] arr = hash(data, seed);
1✔
330
    hashUpdate(arr[0], arr[1]);
1✔
331
  }
1✔
332

333
  /**
334
   * Present the given String as a potential unique item.
335
   * The string is converted to a byte array using UTF8 encoding.
336
   * If the string is null or empty no update attempt is made and the method returns.
337
   *
338
   * <p>Note: About 2X faster performance can be obtained by first converting the String to a
339
   * char[] and updating the sketch with that. This bypasses the complexity of the Java UTF_8
340
   * encoding. This, of course, will not produce the same internal hash values as updating directly
341
   * with a String. So be consistent!  Unioning two sketches, one fed with strings and the other
342
   * fed with char[] will be meaningless.
343
   * </p>
344
   *
345
   * @param datum The given String.
346
   */
347
  public void update(final String datum) {
348
    if ((datum == null) || datum.isEmpty()) { return; }
1✔
349
    final byte[] data = datum.getBytes(UTF_8);
1✔
350
    final long[] arr = hash(data, seed);
1✔
351
    hashUpdate(arr[0], arr[1]);
1✔
352
  }
1✔
353

354
  /**
355
   * Present the given byte array as a potential unique item.
356
   * If the byte array is null or empty no update attempt is made and the method returns.
357
   *
358
   * @param data The given byte array.
359
   */
360
  public void update(final byte[] data) {
361
    if ((data == null) || (data.length == 0)) { return; }
1✔
362
    final long[] arr = hash(data, seed);
1✔
363
    hashUpdate(arr[0], arr[1]);
1✔
364
  }
1✔
365

366
  /**
367
   * Present the given ByteBuffer as a potential unique item
368
   * If the ByteBuffer is null or empty no update attempt is made and the method returns
369
   *
370
   * @param data The given ByteBuffer
371
   */
372
  public void update(final ByteBuffer data) {
373
    if ((data == null) || data.hasRemaining() == false) { return; }
1✔
374
    final long[] arr = hash(data, seed);
1✔
375
    hashUpdate(arr[0], arr[1]);
1✔
376
  }
1✔
377

378
  /**
379
   * Present the given char array as a potential unique item.
380
   * If the char array is null or empty no update attempt is made and the method returns.
381
   *
382
   * <p>Note: this will not produce the same output hash values as the {@link #update(String)}
383
   * method but will be a little faster as it avoids the complexity of the UTF8 encoding.</p>
384
   *
385
   * @param data The given char array.
386
   */
387
  public void update(final char[] data) {
388
    if ((data == null) || (data.length == 0)) { return; }
1✔
389
    final long[] arr = hash(data, seed);
1✔
390
    hashUpdate(arr[0], arr[1]);
1✔
391
  }
1✔
392

393
  /**
394
   * Present the given integer array as a potential unique item.
395
   * If the integer array is null or empty no update attempt is made and the method returns.
396
   *
397
   * @param data The given int array.
398
   */
399
  public void update(final int[] data) {
400
    if ((data == null) || (data.length == 0)) { return; }
1✔
401
    final long[] arr = hash(data, seed);
1✔
402
    hashUpdate(arr[0], arr[1]);
1✔
403
  }
1✔
404

405
  /**
406
   * Present the given long array as a potential unique item.
407
   * If the long array is null or empty no update attempt is made and the method returns.
408
   *
409
   * @param data The given long array.
410
   */
411
  public void update(final long[] data) {
412
    if ((data == null) || (data.length == 0)) { return; }
1✔
413
    final long[] arr = hash(data, seed);
1✔
414
    hashUpdate(arr[0], arr[1]);
1✔
415
  }
1✔
416

417
  /**
418
   * Convience function that this Sketch is valid. This is a troubleshooting tool
419
   * for sketches that have been heapified from serialized images.
420
   *
421
   * <p>If you are starting with a serialized image as a byte array, first heapify
422
   * the byte array to a sketch, which performs a number of checks. Then use this
423
   * function as one additional check on the sketch.</p>
424
   *
425
   * @return true if this sketch is validated.
426
   */
427
  public boolean validate() {
428
    final long[] bitMatrix = bitMatrixOfSketch(this);
1✔
429
    final long matrixCoupons = countBitsSetInMatrix(bitMatrix);
1✔
430
    return matrixCoupons == numCoupons;
1✔
431
  }
432

433
  /**
434
   * Returns the current Flavor of this sketch.
435
   * @return the current Flavor of this sketch.
436
   */
437
  Flavor getFlavor() {
438
    return CpcUtil.determineFlavor(lgK, numCoupons);
1✔
439
  }
440

441
  /**
442
   * Returns the Format of the serialized form of this sketch.
443
   * @return the Format of the serialized form of this sketch.
444
   */
445
  Format getFormat() {
446
    final int ordinal;
447
    final Flavor f = getFlavor();
1✔
448
    if ((f == Flavor.HYBRID) || (f == Flavor.SPARSE)) {
1✔
449
      ordinal = 2 | ( mergeFlag ? 0 : 1 ); //Hybrid is serialized as SPARSE
1✔
450
    } else {
451
      ordinal = ((slidingWindow != null) ? 4 : 0)
1✔
452
               | (((pairTable != null) && (pairTable.getNumPairs() > 0)) ? 2 : 0)
1✔
453
               | ( mergeFlag ? 0 : 1 );
454
    }
455
    return Format.ordinalToFormat(ordinal);
1✔
456
  }
457

458
  private static void promoteEmptyToSparse(final CpcSketch sketch) {
459
    assert sketch.numCoupons == 0;
1✔
460
    assert sketch.pairTable == null;
1✔
461
    sketch.pairTable = new PairTable(2, 6 + sketch.lgK);
1✔
462
  }
1✔
463

464
  //In terms of flavor, this promotes SPARSE to HYBRID.
465
  private static void promoteSparseToWindowed(final CpcSketch sketch) {
466
    final int lgK = sketch.lgK;
1✔
467
    final int k = (1 << lgK);
1✔
468
    final long c32 = sketch.numCoupons << 5;
1✔
469
    assert ((c32 == (3 * k)) || ((lgK == 4) && (c32 > (3 * k))));
1✔
470

471
    final byte[] window = new byte[k];
1✔
472

473
    final PairTable newTable = new PairTable(2, 6 + lgK);
1✔
474
    final PairTable oldTable = sketch.pairTable;
1✔
475

476
    final int[] oldSlots = oldTable.getSlotsArr();
1✔
477
    final int oldNumSlots = (1 << oldTable.getLgSizeInts());
1✔
478

479
    assert (sketch.windowOffset == 0);
1✔
480

481
    for (int i = 0; i < oldNumSlots; i++) {
1✔
482
      final int rowCol = oldSlots[i];
1✔
483
      if (rowCol != -1) {
1✔
484
        final int col = rowCol & 63;
1✔
485
        if (col < 8) {
1✔
486
          final int  row = rowCol >>> 6;
1✔
487
          window[row] |= (1 << col);
1✔
488
        }
1✔
489
        else {
490
          // cannot use Table.mustInsert(), because it doesn't provide for growth
491
          final boolean isNovel = PairTable.maybeInsert(newTable, rowCol);
1✔
492
          assert (isNovel == true);
1✔
493
        }
494
      }
495
    }
496

497
    assert (sketch.slidingWindow == null);
1✔
498
    sketch.slidingWindow = window;
1✔
499
    sketch.pairTable = newTable;
1✔
500
  }
1✔
501

502
  /**
503
   * The KXP register is a double with roughly 50 bits of precision, but
504
   * it might need roughly 90 bits to track the value with perfect accuracy.
505
   * Therefore we recalculate KXP occasionally from the sketch's full bitMatrix
506
   * so that it will reflect changes that were previously outside the mantissa.
507
   * @param sketch the given sketch
508
   * @param bitMatrix the given bit Matrix
509
   */
510
  //Also used in test
511
  static void refreshKXP(final CpcSketch sketch, final long[] bitMatrix) {
512
    final int k = (1 << sketch.lgK);
1✔
513

514
    // for improved numerical accuracy, we separately sum the bytes of the U64's
515
    final double[] byteSums = new double[8];
1✔
516

517
    Arrays.fill(byteSums, 0.0);
1✔
518

519
    for (int i = 0; i < k; i++) {
1✔
520
      long row = bitMatrix[i];
1✔
521
      for (int j = 0; j < 8; j++) {
1✔
522
        final int byteIdx = (int) (row & 0XFFL);
1✔
523
        byteSums[j] += kxpByteLookup[byteIdx];
1✔
524
        row >>>= 8;
1✔
525
      }
526
    }
527

528
    double total = 0.0;
1✔
529

530
    for (int j = 7; j-- > 0; ) { // the reverse order is important
1✔
531
      final double factor = invPow2(8 * j); // pow(256, -j) == pow(2, -8 * j);
1✔
532
      total += factor * byteSums[j];
1✔
533
    }
1✔
534
    sketch.kxp = total;
1✔
535
  }
1✔
536

537
  /**
538
   * This moves the sliding window
539
   * @param sketch the given sketch
540
   * @param newOffset the new offset, which must be oldOffset + 1
541
   */
542
  private static void modifyOffset(final CpcSketch sketch, final int newOffset) {
543
    assert ((newOffset >= 0) && (newOffset <= 56));
1✔
544
    assert (newOffset == (sketch.windowOffset + 1));
1✔
545
    assert (newOffset == CpcUtil.determineCorrectOffset(sketch.lgK, sketch.numCoupons));
1✔
546

547
    assert (sketch.slidingWindow != null);
1✔
548
    assert (sketch.pairTable != null);
1✔
549
    final int k = 1 << sketch.lgK;
1✔
550

551
    // Construct the full-sized bit matrix that corresponds to the sketch
552
    final long[] bitMatrix = CpcUtil.bitMatrixOfSketch(sketch);
1✔
553

554
    // refresh the KXP register on every 8th window shift.
555
    if ((newOffset & 0x7) == 0) { refreshKXP(sketch, bitMatrix); }
1✔
556

557
    sketch.pairTable.clear();
1✔
558

559
    final PairTable table = sketch.pairTable;
1✔
560
    final byte[] window = sketch.slidingWindow;
1✔
561
    final long maskForClearingWindow = (0XFFL << newOffset) ^ -1L;
1✔
562
    final long maskForFlippingEarlyZone = (1L << newOffset) - 1L;
1✔
563
    long allSurprisesORed = 0;
1✔
564

565
    for (int i = 0; i < k; i++) {
1✔
566
      long pattern = bitMatrix[i];
1✔
567
      window[i] = (byte) ((pattern >>> newOffset) & 0XFFL);
1✔
568
      pattern &= maskForClearingWindow;
1✔
569
      // The following line converts surprising 0's to 1's in the "early zone",
570
      // (and vice versa, which is essential for this procedure's O(k) time cost).
571
      pattern ^= maskForFlippingEarlyZone;
1✔
572
      allSurprisesORed |= pattern; // a cheap way to recalculate fiCol
1✔
573
      while (pattern != 0) {
1✔
574
        final int col = Long.numberOfTrailingZeros(pattern);
1✔
575
        pattern = pattern ^ (1L << col); // erase the 1.
1✔
576
        final int rowCol = (i << 6) | col;
1✔
577
        final boolean isNovel = PairTable.maybeInsert(table, rowCol);
1✔
578
        assert isNovel == true;
1✔
579
      }
1✔
580
    }
581
    sketch.windowOffset = newOffset;
1✔
582
    sketch.fiCol = Long.numberOfTrailingZeros(allSurprisesORed);
1✔
583
    if (sketch.fiCol > newOffset) {
1✔
584
      sketch.fiCol = newOffset; // corner case
×
585
    }
586
  }
1✔
587

588
  /**
589
   * Call this whenever a new coupon has been collected.
590
   * @param sketch the given sketch
591
   * @param rowCol the given row / column
592
   */
593
  private static void updateHIP(final CpcSketch sketch, final int rowCol) {
594
    final int k = 1 << sketch.lgK;
1✔
595
    final int col = rowCol & 63;
1✔
596
    final double oneOverP = k / sketch.kxp;
1✔
597
    sketch.hipEstAccum += oneOverP;
1✔
598
    sketch.kxp -= invPow2(col + 1); // notice the "+1"
1✔
599
  }
1✔
600

601
  private static void updateSparse(final CpcSketch sketch, final int rowCol) {
602
    final int k = 1 << sketch.lgK;
1✔
603
    final long c32pre = sketch.numCoupons << 5;
1✔
604
    assert (c32pre < (3L * k)); // C < 3K/32, in other words, flavor == SPARSE
1✔
605
    assert (sketch.pairTable != null);
1✔
606
    final boolean isNovel = PairTable.maybeInsert(sketch.pairTable, rowCol);
1✔
607
    if (isNovel) {
1✔
608
      sketch.numCoupons += 1;
1✔
609
      updateHIP(sketch, rowCol);
1✔
610
      final long c32post = sketch.numCoupons << 5;
1✔
611
      if (c32post >= (3L * k)) { promoteSparseToWindowed(sketch); } // C >= 3K/32
1✔
612
    }
613
  }
1✔
614

615
  /**
616
   * The flavor is HYBRID, PINNED, or SLIDING.
617
   * @param sketch the given sketch
618
   * @param rowCol the given rowCol
619
   */
620
  private static void updateWindowed(final CpcSketch sketch, final int rowCol) {
621
    assert ((sketch.windowOffset >= 0) && (sketch.windowOffset <= 56));
1✔
622
    final int k = 1 << sketch.lgK;
1✔
623
    final long c32pre = sketch.numCoupons << 5;
1✔
624
    assert c32pre >= (3L * k); // C < 3K/32, in other words flavor >= HYBRID
1✔
625
    final long c8pre = sketch.numCoupons << 3;
1✔
626
    final int w8pre = sketch.windowOffset << 3;
1✔
627
    assert c8pre < ((27L + w8pre) * k); // C < (K * 27/8) + (K * windowOffset)
1✔
628

629
    boolean isNovel = false; //novel if new coupon
1✔
630
    final int col = rowCol & 63;
1✔
631

632
    if (col < sketch.windowOffset) { // track the surprising 0's "before" the window
1✔
633
      isNovel = PairTable.maybeDelete(sketch.pairTable, rowCol); // inverted logic
1✔
634
    }
635
    else if (col < (sketch.windowOffset + 8)) { // track the 8 bits inside the window
1✔
636
      assert (col >= sketch.windowOffset);
1✔
637
      final int row = rowCol >>> 6;
1✔
638
      final byte oldBits = sketch.slidingWindow[row];
1✔
639
      final byte newBits = (byte) (oldBits | (1 << (col - sketch.windowOffset)));
1✔
640
      if (newBits != oldBits) {
1✔
641
        sketch.slidingWindow[row] = newBits;
1✔
642
        isNovel = true;
1✔
643
      }
644
    }
1✔
645
    else { // track the surprising 1's "after" the window
646
      assert col >= (sketch.windowOffset + 8);
1✔
647
      isNovel = PairTable.maybeInsert(sketch.pairTable, rowCol); // normal logic
1✔
648
    }
649

650
    if (isNovel) {
1✔
651
      sketch.numCoupons += 1;
1✔
652
      updateHIP(sketch, rowCol);
1✔
653
      final long c8post = sketch.numCoupons << 3;
1✔
654
      if (c8post >= ((27L + w8pre) * k)) {
1✔
655
        modifyOffset(sketch, sketch.windowOffset + 1);
1✔
656
        assert (sketch.windowOffset >= 1) && (sketch.windowOffset <= 56);
1✔
657
        final int w8post = sketch.windowOffset << 3;
1✔
658
        assert c8post < ((27L + w8post) * k); // C < (K * 27/8) + (K * windowOffset)
1✔
659
      }
660
    }
661
  }
1✔
662

663
  //also used in test
664
  static CpcSketch uncompress(final CompressedState source, final long seed) {
665
    ThetaUtil.checkSeedHashes(ThetaUtil.computeSeedHash(seed), source.seedHash);
1✔
666
    final CpcSketch sketch = new CpcSketch(source.lgK, seed);
1✔
667
    sketch.numCoupons = source.numCoupons;
1✔
668
    sketch.windowOffset = source.getWindowOffset();
1✔
669
    sketch.fiCol = source.fiCol;
1✔
670
    sketch.mergeFlag = source.mergeFlag;
1✔
671
    sketch.kxp = source.kxp;
1✔
672
    sketch.hipEstAccum = source.hipEstAccum;
1✔
673
    sketch.slidingWindow = null;
1✔
674
    sketch.pairTable = null;
1✔
675
    CpcCompression.uncompress(source, sketch);
1✔
676
    return sketch;
1✔
677
  }
678

679
  //Used here and for testing
680
  void hashUpdate(final long hash0, final long hash1) {
681
    int col = Long.numberOfLeadingZeros(hash1);
1✔
682
    if (col < fiCol) { return; } // important speed optimization
1✔
683
    if (col > 63) { col = 63; } // clip so that 0 <= col <= 63
1✔
684
    final long c = numCoupons;
1✔
685
    if (c == 0) { promoteEmptyToSparse(this); }
1✔
686
    final long k = 1L << lgK;
1✔
687
    final int row = (int) (hash0 & (k - 1L));
1✔
688
    int rowCol = (row << 6) | col;
1✔
689

690
    // Avoid the hash table's "empty" value which is (2^26 -1, 63) (all ones) by changing it
691
    // to the pair (2^26 - 2, 63), which effectively merges the two cells.
692
    // This case is *extremely* unlikely, but we might as well handle it.
693
    // It can't happen at all if lgK (or maxLgK) < 26.
694
    if (rowCol == -1) { rowCol ^= (1 << 6); } //set the LSB of row to 0
1✔
695

696
    if ((c << 5) < (3L * k)) { updateSparse(this, rowCol); }
1✔
697
    else { updateWindowed(this, rowCol); }
1✔
698
  }
1✔
699

700
  //Used by union and in testing
701
  void rowColUpdate(final int rowCol) {
702
    final int col = rowCol & 63;
1✔
703
    if (col < fiCol) { return; } // important speed optimization
1✔
704
    final long c = numCoupons;
1✔
705
    if (c == 0) { promoteEmptyToSparse(this); }
1✔
706
    final long k = 1L << lgK;
1✔
707
    if ((c << 5) < (3L * k)) { updateSparse(this, rowCol); }
1✔
708
    else { updateWindowed(this, rowCol); }
1✔
709
  }
1✔
710

711
  /**
712
   * Return a human-readable string summary of this sketch
713
   */
714
  @Override
715
  public String toString() {
716
    return toString(false);
1✔
717
  }
718

719
  /**
720
   * Return a human-readable string summary of this sketch
721
   * @param detail include data detail
722
   * @return a human-readable string summary of this sketch
723
   */
724
  public String toString(final boolean detail) {
725
    final int numPairs = (pairTable == null) ? 0 : pairTable.getNumPairs();
1✔
726
    final int seedHash = Short.toUnsignedInt(ThetaUtil.computeSeedHash(seed));
1✔
727
    final double errConst = mergeFlag ? log(2) : sqrt(log(2) / 2.0);
1✔
728
    final double rse = errConst / Math.sqrt(1 << lgK);
1✔
729
    final StringBuilder sb = new StringBuilder();
1✔
730
    sb.append("### CPD SKETCH - PREAMBLE:").append(LS);
1✔
731
    sb.append("  Flavor         : ").append(getFlavor()).append(LS);
1✔
732
    sb.append("  LgK            : ").append(lgK).append(LS);
1✔
733
    sb.append("  Merge Flag     : ").append(mergeFlag).append(LS);
1✔
734
    sb.append("  Error Const    : ").append(errConst).append(LS);
1✔
735
    sb.append("  RSE            : ").append(rse).append(LS);
1✔
736
    sb.append("  Seed Hash      : ").append(Integer.toHexString(seedHash))
1✔
737
      .append(" | ").append(seedHash).append(LS);
1✔
738
    sb.append("  Num Coupons    : ").append(numCoupons).append(LS);
1✔
739
    sb.append("  Num Pairs (SV) : ").append(numPairs).append(LS);
1✔
740
    sb.append("  First Inter Col: ").append(fiCol).append(LS);
1✔
741
    sb.append("  Valid Window   : ").append(slidingWindow != null).append(LS);
1✔
742
    sb.append("  Valid PairTable: ").append(pairTable != null).append(LS);
1✔
743
    sb.append("  Window Offset  : ").append(windowOffset).append(LS);
1✔
744
    sb.append("  KxP            : ").append(kxp).append(LS);
1✔
745
    sb.append("  HIP Accum      : ").append(hipEstAccum).append(LS);
1✔
746
    if (detail) {
1✔
747
      sb.append(LS).append("### CPC SKETCH - DATA").append(LS);
1✔
748
      if (pairTable != null) {
1✔
749
        sb.append(pairTable.toString(true));
1✔
750
      }
751
      if (slidingWindow != null) {
1✔
752
        sb.append("SlidingWindow  : ").append(LS);
1✔
753
        sb.append("    Index Bits (lsb ->)").append(LS);
1✔
754
        for (int i = 0; i < slidingWindow.length; i++) {
1✔
755

756
          final String bits = zeroPad(Integer.toBinaryString(slidingWindow[i] & 0XFF), 8);
1✔
757
          sb.append(String.format("%9d %8s" + LS, i, bits));
1✔
758
        }
759
      }
760
    }
761
    sb.append("### END CPC SKETCH");
1✔
762
    return sb.toString();
1✔
763
  }
764

765
  /**
766
   * Returns a human readable string of the preamble of a byte array image of a CpcSketch.
767
   * @param byteArr the given byte array
768
   * @param detail if true, a dump of the compressed window and surprising value streams will be
769
   * included.
770
   * @return a human readable string of the preamble of a byte array image of a CpcSketch.
771
   */
772
  public static String toString(final byte[] byteArr, final boolean detail) {
773
    return PreambleUtil.toString(byteArr, detail);
1✔
774
  }
775

776
  /**
777
   * Returns a human readable string of the preamble of a Memory image of a CpcSketch.
778
   * @param mem the given Memory
779
   * @param detail if true, a dump of the compressed window and surprising value streams will be
780
   * included.
781
   * @return a human readable string of the preamble of a Memory image of a CpcSketch.
782
   */
783
  public static String toString(final Memory mem, final boolean detail) {
784
    return PreambleUtil.toString(mem, detail);
1✔
785
  }
786

787
  private static void fillKxpByteLookup() { //called from static initializer
788
    for (int b = 0; b < 256; b++) {
1✔
789
      double sum = 0;
1✔
790
      for (int col = 0; col < 8; col++) {
1✔
791
        final int bit = (b >>> col) & 1;
1✔
792
        if (bit == 0) { // note the inverted logic
1✔
793
          sum += invPow2(col + 1); //note the "+1"
1✔
794
        }
795
      }
796
      kxpByteLookup[b] = sum;
1✔
797
    }
798
  }
1✔
799

800
  static {
801
    fillKxpByteLookup();
1✔
802
  }
1✔
803

804
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc