• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.78
/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.theta;
21

22
import static java.nio.charset.StandardCharsets.UTF_8;
23
import static org.apache.datasketches.common.Util.LONG_MAX_VALUE_AS_DOUBLE;
24
import static org.apache.datasketches.common.Util.checkBounds;
25
import static org.apache.datasketches.hash.MurmurHash3.hash;
26
import static org.apache.datasketches.theta.CompactOperations.componentsToCompact;
27
import static org.apache.datasketches.theta.PreambleUtil.BIG_ENDIAN_FLAG_MASK;
28
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
29
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
30
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
31
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
32
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
33
import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
34
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
35
import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
36
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
37
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
38
import static org.apache.datasketches.theta.PreambleUtil.extractLgResizeFactor;
39
import static org.apache.datasketches.theta.PreambleUtil.extractP;
40
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
41
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
42
import static org.apache.datasketches.theta.PreambleUtil.getMemBytes;
43
import static org.apache.datasketches.theta.UpdateReturnState.RejectedNullOrEmpty;
44

45
import java.nio.ByteBuffer;
46
import java.util.Objects;
47

48
import org.apache.datasketches.common.Family;
49
import org.apache.datasketches.common.ResizeFactor;
50
import org.apache.datasketches.common.SketchesArgumentException;
51
import org.apache.datasketches.memory.Memory;
52
import org.apache.datasketches.memory.WritableMemory;
53
import org.apache.datasketches.thetacommon.ThetaUtil;
54

55
/**
56
 * The parent class for the  Update Sketch families, such as QuickSelect and Alpha.
57
 * The primary task of an Update Sketch is to consider datums presented via the update() methods
58
 * for inclusion in its internal cache. This is the sketch building process.
59
 *
60
 * @author Lee Rhodes
61
 */
62
public abstract class UpdateSketch extends Sketch {
63

64
  UpdateSketch() {}
1✔
65

66
  /**
67
  * Wrap takes the sketch image in Memory and refers to it directly. There is no data copying onto
68
  * the java heap. Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
69
  * been explicitly stored as direct objects can be wrapped. This method assumes the
70
  * {@link org.apache.datasketches.thetacommon.ThetaUtil#DEFAULT_UPDATE_SEED}.
71
  * <a href="{@docRoot}/resources/dictionary.html#defaultUpdateSeed">Default Update Seed</a>.
72
  * @param srcMem an image of a Sketch where the image seed hash matches the default seed hash.
73
  * It must have a size of at least 24 bytes.
74
  * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
75
  * @return a Sketch backed by the given Memory
76
  */
77
  public static UpdateSketch wrap(final WritableMemory srcMem) {
78
    return wrap(srcMem, ThetaUtil.DEFAULT_UPDATE_SEED);
1✔
79
  }
80

81
  /**
82
  * Wrap takes the sketch image in Memory and refers to it directly. There is no data copying onto
83
  * the java heap. Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
84
  * been explicitly stored as direct objects can be wrapped.
85
  * An attempt to "wrap" earlier version sketches will result in a "heapified", normal
86
  * Java Heap version of the sketch where all data will be copied to the heap.
87
  * @param srcMem an image of a Sketch where the image seed hash matches the given seed hash.
88
  * It must have a size of at least 24 bytes.
89
  * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
90
  * @param expectedSeed the seed used to validate the given Memory image.
91
  * <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
92
  * Compact sketches store a 16-bit hash of the seed, but not the seed itself.
93
  * @return a UpdateSketch backed by the given Memory
94
  */
95
  public static UpdateSketch wrap(final WritableMemory srcMem, final long expectedSeed) {
96
    Objects.requireNonNull(srcMem, "Source Memory must not be null");
1✔
97
    checkBounds(0, 24, srcMem.getCapacity()); //need min 24 bytes
1✔
98
    final int  preLongs = srcMem.getByte(PREAMBLE_LONGS_BYTE) & 0X3F;
1✔
99
    final int serVer = srcMem.getByte(SER_VER_BYTE) & 0XFF;
1✔
100
    final int familyID = srcMem.getByte(FAMILY_BYTE) & 0XFF;
1✔
101
    final Family family = Family.idToFamily(familyID);
1✔
102
    if (family != Family.QUICKSELECT) {
1✔
103
      throw new SketchesArgumentException(
×
104
        "A " + family + " sketch cannot be wrapped as an UpdateSketch.");
105
    }
106
    if ((serVer == 3) && (preLongs == 3)) {
1✔
107
      return DirectQuickSelectSketch.writableWrap(srcMem, expectedSeed);
1✔
108
    } else {
109
      throw new SketchesArgumentException(
1✔
110
        "Corrupted: An UpdateSketch image must have SerVer = 3 and preLongs = 3");
111
    }
112
  }
113

114
  /**
115
   * Instantiates an on-heap UpdateSketch from Memory. This method assumes the
116
   * {@link org.apache.datasketches.thetacommon.ThetaUtil#DEFAULT_UPDATE_SEED}.
117
   * @param srcMem <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
118
   * It must have a size of at least 24 bytes.
119
   * @return an UpdateSketch
120
   */
121
  public static UpdateSketch heapify(final Memory srcMem) {
122
    return heapify(srcMem, ThetaUtil.DEFAULT_UPDATE_SEED);
1✔
123
  }
124

125
  /**
126
   * Instantiates an on-heap UpdateSketch from Memory.
127
   * @param srcMem <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
128
   * It must have a size of at least 24 bytes.
129
   * @param expectedSeed the seed used to validate the given Memory image.
130
   * <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
131
   * @return an UpdateSketch
132
   */
133
  public static UpdateSketch heapify(final Memory srcMem, final long expectedSeed) {
134
    Objects.requireNonNull(srcMem, "Source Memory must not be null");
1✔
135
    checkBounds(0, 24, srcMem.getCapacity()); //need min 24 bytes
1✔
136
    final Family family = Family.idToFamily(srcMem.getByte(FAMILY_BYTE));
1✔
137
    if (family.equals(Family.ALPHA)) {
1✔
138
      return HeapAlphaSketch.heapifyInstance(srcMem, expectedSeed);
×
139
    }
140
    return HeapQuickSelectSketch.heapifyInstance(srcMem, expectedSeed);
1✔
141
  }
142

143
  //Sketch interface
144

145
  @Override
146
  public CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem) {
147
    return componentsToCompact(getThetaLong(), getRetainedEntries(true), getSeedHash(), isEmpty(),
1✔
148
        false, false, dstOrdered, dstMem, getCache());
1✔
149
  }
150

151
  @Override
152
  public int getCompactBytes() {
153
    final int preLongs = getCompactPreambleLongs();
1✔
154
    final int dataLongs = getRetainedEntries(true);
1✔
155
    return (preLongs + dataLongs) << 3;
1✔
156
  }
157

158
  @Override
159
  int getCurrentDataLongs() {
160
    return 1 << getLgArrLongs();
1✔
161
  }
162

163
  @Override
164
  public boolean isCompact() {
165
    return false;
1✔
166
  }
167

168
  @Override
169
  public boolean isOrdered() {
170
    return false;
1✔
171
  }
172

173
  //UpdateSketch interface
174

175
  /**
176
   * Returns a new builder
177
   * @return a new builder
178
   */
179
  public static final UpdateSketchBuilder builder() {
180
    return new UpdateSketchBuilder();
1✔
181
  }
182

183
  /**
184
   * Returns the configured ResizeFactor
185
   * @return the configured ResizeFactor
186
   */
187
  public abstract ResizeFactor getResizeFactor();
188

189
  /**
190
   * Gets the configured sampling probability, <i>p</i>.
191
   * <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability, <i>p</i></a>
192
   * @return the sampling probability, <i>p</i>
193
   */
194
  abstract float getP();
195

196
  /**
197
   * Gets the configured seed
198
   * @return the configured seed
199
   */
200
  abstract long getSeed();
201

202
  /**
203
   * Resets this sketch back to a virgin empty state.
204
   */
205
  public abstract void reset();
206

207
  /**
208
   * Rebuilds the hash table to remove dirty values or to reduce the size
209
   * to nominal entries.
210
   * @return this sketch
211
   */
212
  public abstract UpdateSketch rebuild();
213

214
  /**
215
   * Present this sketch with a long.
216
   *
217
   * @param datum The given long datum.
218
   * @return
219
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
220
   */
221
  public UpdateReturnState update(final long datum) {
222
    final long[] data = { datum };
1✔
223
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
224
  }
225

226
  /**
227
   * Present this sketch with the given double (or float) datum.
228
   * The double will be converted to a long using Double.doubleToLongBits(datum),
229
   * which normalizes all NaN values to a single NaN representation.
230
   * Plus and minus zero will be normalized to plus zero.
231
   * The special floating-point values NaN and +/- Infinity are treated as distinct.
232
   *
233
   * @param datum The given double datum.
234
   * @return
235
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
236
   */
237
  public UpdateReturnState update(final double datum) {
238
    final double d = (datum == 0.0) ? 0.0 : datum; // canonicalize -0.0, 0.0
1✔
239
    final long[] data = { Double.doubleToLongBits(d) };// canonicalize all NaN & +/- infinity forms
1✔
240
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
241
  }
242

243
  /**
244
   * Present this sketch with the given String.
245
   * The string is converted to a byte array using UTF8 encoding.
246
   * If the string is null or empty no update attempt is made and the method returns.
247
   *
248
   * <p>Note: this will not produce the same output hash values as the {@link #update(char[])}
249
   * method and will generally be a little slower depending on the complexity of the UTF8 encoding.
250
   * </p>
251
   *
252
   * @param datum The given String.
253
   * @return
254
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
255
   */
256
  public UpdateReturnState update(final String datum) {
257
    if ((datum == null) || datum.isEmpty()) {
1✔
258
      return RejectedNullOrEmpty;
1✔
259
    }
260
    final byte[] data = datum.getBytes(UTF_8);
1✔
261
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
262
  }
263

264
  /**
265
   * Present this sketch with the given byte array.
266
   * If the byte array is null or empty no update attempt is made and the method returns.
267
   *
268
   * @param data The given byte array.
269
   * @return
270
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
271
   */
272
  public UpdateReturnState update(final byte[] data) {
273
    if ((data == null) || (data.length == 0)) {
1✔
274
      return RejectedNullOrEmpty;
1✔
275
    }
276
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
277
  }
278

279
  /**
280
   * Present this sketch with the given ByteBuffer
281
   * If the ByteBuffer is null or empty, no update attempt is made and the method returns.
282
   *
283
   * @param buffer the input ByteBuffer
284
   * @return
285
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
286
   */
287
  public UpdateReturnState update(final ByteBuffer buffer) {
288
    if (buffer == null || buffer.hasRemaining() == false) {
1✔
289
      return RejectedNullOrEmpty;
1✔
290
    }
291
    return hashUpdate(hash(buffer, getSeed())[0] >>> 1);
1✔
292
  }
293

294
  /**
295
   * Present this sketch with the given char array.
296
   * If the char array is null or empty no update attempt is made and the method returns.
297
   *
298
   * <p>Note: this will not produce the same output hash values as the {@link #update(String)}
299
   * method but will be a little faster as it avoids the complexity of the UTF8 encoding.</p>
300
   *
301
   * @param data The given char array.
302
   * @return
303
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
304
   */
305
  public UpdateReturnState update(final char[] data) {
306
    if ((data == null) || (data.length == 0)) {
1✔
307
      return RejectedNullOrEmpty;
1✔
308
    }
309
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
310
  }
311

312
  /**
313
   * Present this sketch with the given integer array.
314
   * If the integer array is null or empty no update attempt is made and the method returns.
315
   *
316
   * @param data The given int array.
317
   * @return
318
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
319
   */
320
  public UpdateReturnState update(final int[] data) {
321
    if ((data == null) || (data.length == 0)) {
1✔
322
      return RejectedNullOrEmpty;
1✔
323
    }
324
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
325
  }
326

327
  /**
328
   * Present this sketch with the given long array.
329
   * If the long array is null or empty no update attempt is made and the method returns.
330
   *
331
   * @param data The given long array.
332
   * @return
333
   * <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
334
   */
335
  public UpdateReturnState update(final long[] data) {
336
    if ((data == null) || (data.length == 0)) {
1✔
337
      return RejectedNullOrEmpty;
1✔
338
    }
339
    return hashUpdate(hash(data, getSeed())[0] >>> 1);
1✔
340
  }
341

342
  //restricted methods
343

344
  /**
345
   * All potential updates converge here.
346
   * <p>Don't ever call this unless you really know what you are doing!</p>
347
   *
348
   * @param hash the given input hash value.  A hash of zero or Long.MAX_VALUE is ignored.
349
   * A negative hash value will throw an exception.
350
   * @return <a href="{@docRoot}/resources/dictionary.html#updateReturnState">See Update Return State</a>
351
   */
352
  abstract UpdateReturnState hashUpdate(long hash);
353

354
  /**
355
   * Gets the Log base 2 of the current size of the internal cache
356
   * @return the Log base 2 of the current size of the internal cache
357
   */
358
  abstract int getLgArrLongs();
359

360
  /**
361
   * Gets the Log base 2 of the configured nominal entries
362
   * @return the Log base 2 of the configured nominal entries
363
   */
364
  public abstract int getLgNomLongs();
365

366
  /**
367
   * Returns true if the internal cache contains "dirty" values that are greater than or equal
368
   * to thetaLong.
369
   * @return true if the internal cache is dirty.
370
   */
371
  abstract boolean isDirty();
372

373
  /**
374
   * Returns true if numEntries (curCount) is greater than the hashTableThreshold.
375
   * @param numEntries the given number of entries (or current count).
376
   * @return true if numEntries (curCount) is greater than the hashTableThreshold.
377
   */
378
  abstract boolean isOutOfSpace(int numEntries);
379

380
  static void checkUnionQuickSelectFamily(final Memory mem, final int preambleLongs,
381
      final int lgNomLongs) {
382
    //Check Family
383
    final int familyID = extractFamilyID(mem);                       //byte 2
1✔
384
    final Family family = Family.idToFamily(familyID);
1✔
385
    if (family.equals(Family.UNION)) {
1✔
386
      if (preambleLongs != Family.UNION.getMinPreLongs()) {
1✔
387
        throw new SketchesArgumentException(
1✔
388
            "Possible corruption: Invalid PreambleLongs value for UNION: " + preambleLongs);
389
      }
390
    }
391
    else if (family.equals(Family.QUICKSELECT)) {
1✔
392
      if (preambleLongs != Family.QUICKSELECT.getMinPreLongs()) {
1✔
393
        throw new SketchesArgumentException(
1✔
394
            "Possible corruption: Invalid PreambleLongs value for QUICKSELECT: " + preambleLongs);
395
      }
396
    } else {
397
      throw new SketchesArgumentException(
1✔
398
          "Possible corruption: Invalid Family: " + family.toString());
1✔
399
    }
400

401
    //Check lgNomLongs
402
    if (lgNomLongs < ThetaUtil.MIN_LG_NOM_LONGS) {
1✔
403
      throw new SketchesArgumentException(
1✔
404
          "Possible corruption: Current Memory lgNomLongs < min required size: "
405
              + lgNomLongs + " < " + ThetaUtil.MIN_LG_NOM_LONGS);
406
    }
407
  }
1✔
408

409
  static void checkMemIntegrity(final Memory srcMem, final long expectedSeed, final int preambleLongs,
410
      final int lgNomLongs, final int lgArrLongs) {
411

412
    //Check SerVer
413
    final int serVer = extractSerVer(srcMem);                           //byte 1
1✔
414
    if (serVer != SER_VER) {
1✔
415
      throw new SketchesArgumentException(
1✔
416
          "Possible corruption: Invalid Serialization Version: " + serVer);
417
    }
418

419
    //Check flags
420
    final int flags = extractFlags(srcMem);                             //byte 5
1✔
421
    final int flagsMask =
1✔
422
        ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | BIG_ENDIAN_FLAG_MASK;
423
    if ((flags & flagsMask) > 0) {
1✔
424
      throw new SketchesArgumentException(
1✔
425
        "Possible corruption: Input srcMem cannot be: big-endian, compact, ordered, or read-only");
426
    }
427

428
    //Check seed hashes
429
    final short seedHash = checkMemorySeedHash(srcMem, expectedSeed);              //byte 6,7
1✔
430
    ThetaUtil.checkSeedHashes(seedHash, ThetaUtil.computeSeedHash(expectedSeed));
1✔
431

432
    //Check mem capacity, lgArrLongs
433
    final long curCapBytes = srcMem.getCapacity();
1✔
434
    final int minReqBytes = getMemBytes(lgArrLongs, preambleLongs);
1✔
435
    if (curCapBytes < minReqBytes) {
1✔
436
      throw new SketchesArgumentException(
1✔
437
          "Possible corruption: Current Memory size < min required size: "
438
              + curCapBytes + " < " + minReqBytes);
439
    }
440
    //check Theta, p
441
    final float p = extractP(srcMem);                                   //bytes 12-15
1✔
442
    final long thetaLong = extractThetaLong(srcMem);                    //bytes 16-23
1✔
443
    final double theta = thetaLong / LONG_MAX_VALUE_AS_DOUBLE;
1✔
444
    //if (lgArrLongs <= lgNomLongs) the sketch is still resizing, thus theta cannot be < p.
445
    if ((lgArrLongs <= lgNomLongs) && (theta < p) ) {
1✔
446
      throw new SketchesArgumentException(
1✔
447
        "Possible corruption: Theta cannot be < p and lgArrLongs <= lgNomLongs. "
448
            + lgArrLongs + " <= " + lgNomLongs + ", Theta: " + theta + ", p: " + p);
449
    }
450
  }
1✔
451

452
  /**
453
   * This checks to see if the memory RF factor was set correctly as early versions may not
454
   * have set it.
455
   * @param srcMem the source memory
456
   * @param lgNomLongs the current lgNomLongs
457
   * @param lgArrLongs the current lgArrLongs
458
   * @return true if the the memory RF factor is incorrect and the caller can either
459
   * correct it or throw an error.
460
   */
461
  static boolean isResizeFactorIncorrect(final Memory srcMem, final int lgNomLongs,
462
      final int lgArrLongs) {
463
    final int lgT = lgNomLongs + 1;
1✔
464
    final int lgA = lgArrLongs;
1✔
465
    final int lgR = extractLgResizeFactor(srcMem);
1✔
466
    if (lgR == 0) { return lgA != lgT; }
1✔
467
    return !(((lgT - lgA) % lgR) == 0);
1✔
468
  }
469

470
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc