• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

97.26
/src/main/java/org/apache/datasketches/theta/CompactSketch.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.theta;
21

22
import static org.apache.datasketches.common.Family.idToFamily;
23
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
24
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
25
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
26
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
27
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
28
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
29
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
30
import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
31
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
32
import static org.apache.datasketches.theta.PreambleUtil.extractEntryBitsV4;
33
import static org.apache.datasketches.theta.PreambleUtil.extractNumEntriesBytesV4;
34
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLongV4;
35
import static org.apache.datasketches.theta.SingleItemSketch.otherCheckForSingleItem;
36

37
import org.apache.datasketches.common.Family;
38
import org.apache.datasketches.common.SketchesArgumentException;
39
import org.apache.datasketches.memory.Memory;
40
import org.apache.datasketches.memory.WritableMemory;
41
import org.apache.datasketches.thetacommon.ThetaUtil;
42

43
/**
44
 * The parent class of all the CompactSketches. CompactSketches are never created directly.
45
 * They are created as a result of the compact() method of an UpdateSketch, a result of a
46
 * getResult() of a SetOperation, or from a heapify method.
47
 *
48
 * <p>A CompactSketch is the simplest form of a Theta Sketch. It consists of a compact list
49
 * (i.e., no intervening spaces) of hash values, which may be ordered or not, a value for theta
50
 * and a seed hash.  A CompactSketch is immutable (read-only),
51
 * and the space required when stored is only the space required for the hash values and 8 to 24
52
 * bytes of preamble. An empty CompactSketch consumes only 8 bytes.</p>
53
 *
54
 * @author Lee Rhodes
55
 */
56
public abstract class CompactSketch extends Sketch {
1✔
57

58
  /**
59
   * Heapify takes a CompactSketch image in Memory and instantiates an on-heap CompactSketch.
60
   *
61
   * <p>The resulting sketch will not retain any link to the source Memory and all of its data will be
62
   * copied to the heap CompactSketch.</p>
63
   *
64
   * <p>This method assumes that the sketch image was created with the correct hash seed, so it is not checked.
65
   * The resulting on-heap CompactSketch will be given the seedHash derived from the given sketch image.
66
   * However, Serial Version 1 sketch images do not have a seedHash field,
67
   * so the resulting heapified CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.</p>
68
   *
69
   * @param srcMem an image of a CompactSketch.
70
   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
71
   * @return a CompactSketch on the heap.
72
   */
73
  public static CompactSketch heapify(final Memory srcMem) {
74
    return heapify(srcMem, ThetaUtil.DEFAULT_UPDATE_SEED, false);
1✔
75
  }
76

77
  /**
78
   * Heapify takes a CompactSketch image in Memory and instantiates an on-heap CompactSketch.
79
   *
80
   * <p>The resulting sketch will not retain any link to the source Memory and all of its data will be
81
   * copied to the heap CompactSketch.</p>
82
   *
83
   * <p>This method checks if the given expectedSeed was used to create the source Memory image.
84
   * However, SerialVersion 1 sketch images cannot be checked as they don't have a seedHash field,
85
   * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.</p>
86
   *
87
   * @param srcMem an image of a CompactSketch that was created using the given expectedSeed.
88
   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
89
   * @param expectedSeed the seed used to validate the given Memory image.
90
   * <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
91
   * @return a CompactSketch on the heap.
92
   */
93
  public static CompactSketch heapify(final Memory srcMem, final long expectedSeed) {
94
    return heapify(srcMem, expectedSeed, true);
1✔
95
  }
96

97
  private static CompactSketch heapify(final Memory srcMem, final long seed, final boolean enforceSeed) {
98
    final int serVer = extractSerVer(srcMem);
1✔
99
    final int familyID = extractFamilyID(srcMem);
1✔
100
    final Family family = idToFamily(familyID);
1✔
101
    if (family != Family.COMPACT) {
1✔
102
      throw new IllegalArgumentException("Corrupted: " + family + " is not Compact!");
×
103
    }
104
    if (serVer == 4) {
1✔
105
       return heapifyV4(srcMem, seed, enforceSeed);
1✔
106
    }
107
    if (serVer == 3) {
1✔
108
      final int flags = extractFlags(srcMem);
1✔
109
      final boolean srcOrdered = (flags & ORDERED_FLAG_MASK) != 0;
1✔
110
      final boolean empty = (flags & EMPTY_FLAG_MASK) != 0;
1✔
111
      if (enforceSeed && !empty) { PreambleUtil.checkMemorySeedHash(srcMem, seed); }
1✔
112
      return CompactOperations.memoryToCompact(srcMem, srcOrdered, null);
1✔
113
    }
114
    //not SerVer 3, assume compact stored form
115
    final short seedHash = ThetaUtil.computeSeedHash(seed);
1✔
116
    if (serVer == 1) {
1✔
117
      return ForwardCompatibility.heapify1to3(srcMem, seedHash);
1✔
118
    }
119
    if (serVer == 2) {
1✔
120
      return ForwardCompatibility.heapify2to3(srcMem,
1✔
121
          enforceSeed ? seedHash : (short) extractSeedHash(srcMem));
1✔
122
    }
123
    throw new SketchesArgumentException("Unknown Serialization Version: " + serVer);
×
124
  }
125

126
  /**
127
   * Wrap takes the CompactSketch image in given Memory and refers to it directly.
128
   * There is no data copying onto the java heap.
129
   * The wrap operation enables fast read-only merging and access to all the public read-only API.
130
   *
131
   * <p>Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
132
   * been explicitly stored as direct sketches can be wrapped.
133
   * Wrapping earlier serial version sketches will result in a heapify operation.
134
   * These early versions were never designed to "wrap".</p>
135
   *
136
   * <p>Wrapping any subclass of this class that is empty or contains only a single item will
137
   * result in heapified forms of empty and single item sketch respectively.
138
   * This is actually faster and consumes less overall memory.</p>
139
   *
140
   * <p>This method assumes that the sketch image was created with the correct hash seed, so it is not checked.
141
   * However, Serial Version 1 sketch images do not have a seedHash field,
142
   * so the resulting on-heap CompactSketch will be given the hash of the DEFAULT_UPDATE_SEED.</p>
143
   *
144
   * @param srcMem an image of a Sketch.
145
   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>.
146
   * @return a CompactSketch backed by the given Memory except as above.
147
   */
148
  public static CompactSketch wrap(final Memory srcMem) {
149
    return wrap(srcMem, ThetaUtil.DEFAULT_UPDATE_SEED, false);
1✔
150
  }
151

152
  /**
153
   * Wrap takes the sketch image in the given Memory and refers to it directly.
154
   * There is no data copying onto the java heap.
155
   * The wrap operation enables fast read-only merging and access to all the public read-only API.
156
   *
157
   * <p>Only "Direct" Serialization Version 3 (i.e, OpenSource) sketches that have
158
   * been explicitly stored as direct sketches can be wrapped.
159
   * Wrapping earlier serial version sketches will result in a heapify operation.
160
   * These early versions were never designed to "wrap".</p>
161
   *
162
   * <p>Wrapping any subclass of this class that is empty or contains only a single item will
163
   * result in heapified forms of empty and single item sketch respectively.
164
   * This is actually faster and consumes less overall memory.</p>
165
   *
166
   * <p>This method checks if the given expectedSeed was used to create the source Memory image.
167
   * However, SerialVersion 1 sketches cannot be checked as they don't have a seedHash field,
168
   * so the resulting heapified CompactSketch will be given the hash of the expectedSeed.</p>
169
   *
170
   * @param srcMem an image of a Sketch that was created using the given expectedSeed.
171
   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
172
   * @param expectedSeed the seed used to validate the given Memory image.
173
   * <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
174
   * @return a CompactSketch backed by the given Memory except as above.
175
   */
176
  public static CompactSketch wrap(final Memory srcMem, final long expectedSeed) {
177
    return wrap(srcMem, expectedSeed, true);
1✔
178
  }
179

180
  private static CompactSketch wrap(final Memory srcMem, final long seed, final boolean enforceSeed) {
181
    final int serVer = extractSerVer(srcMem);
1✔
182
    final int familyID = extractFamilyID(srcMem);
1✔
183
    final Family family = Family.idToFamily(familyID);
1✔
184
    if (family != Family.COMPACT) {
1✔
185
      throw new IllegalArgumentException("Corrupted: " + family + " is not Compact!");
×
186
    }
187
    final short seedHash = ThetaUtil.computeSeedHash(seed);
1✔
188

189
    if (serVer == 4) {
1✔
190
      // not wrapping the compressed format since currently we cannot take advantage of
191
      // decompression during iteration because set operations reach into memory directly
192
      return heapifyV4(srcMem, seed, enforceSeed);
1✔
193
    }
194
    else if (serVer == 3) {
1✔
195
      if (PreambleUtil.isEmptyFlag(srcMem)) {
1✔
196
        return EmptyCompactSketch.getHeapInstance(srcMem);
1✔
197
      }
198
      if (otherCheckForSingleItem(srcMem)) {
1✔
199
        return SingleItemSketch.heapify(srcMem, enforceSeed ? seedHash : (short) extractSeedHash(srcMem));
1✔
200
      }
201
      //not empty & not singleItem
202
      final int flags = extractFlags(srcMem);
1✔
203
      final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0;
1✔
204
      if (!compactFlag) {
1✔
205
        throw new SketchesArgumentException(
1✔
206
            "Corrupted: COMPACT family sketch image must have compact flag set");
207
      }
208
      final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0;
1✔
209
      if (!readOnly) {
1✔
210
        throw new SketchesArgumentException(
1✔
211
            "Corrupted: COMPACT family sketch image must have Read-Only flag set");
212
      }
213
      return DirectCompactSketch.wrapInstance(srcMem,
1✔
214
          enforceSeed ? seedHash : (short) extractSeedHash(srcMem));
1✔
215
    } //end of serVer 3
216
    else if (serVer == 1) {
1✔
217
      return ForwardCompatibility.heapify1to3(srcMem, seedHash);
1✔
218
    }
219
    else if (serVer == 2) {
1✔
220
      return ForwardCompatibility.heapify2to3(srcMem,
1✔
221
          enforceSeed ? seedHash : (short) extractSeedHash(srcMem));
1✔
222
    }
223
    throw new SketchesArgumentException(
1✔
224
        "Corrupted: Serialization Version " + serVer + " not recognized.");
225
  }
226

227
  //Sketch Overrides
228

229
  @Override
230
  public abstract CompactSketch compact(final boolean dstOrdered, final WritableMemory dstMem);
231

232
  @Override
233
  public int getCompactBytes() {
234
    return getCurrentBytes();
1✔
235
  }
236

237
  @Override
238
  int getCurrentDataLongs() {
239
    return getRetainedEntries(true);
1✔
240
  }
241

242
  @Override
243
  public Family getFamily() {
244
    return Family.COMPACT;
1✔
245
  }
246

247
  @Override
248
  public boolean isCompact() {
249
    return true;
1✔
250
  }
251

252
  /**
253
   * gets the sketch as a compressed byte array
254
   * @return the sketch as a compressed byte array
255
   */
256
  public byte[] toByteArrayCompressed() {
257
    if (!isOrdered() || getRetainedEntries() == 0 || (getRetainedEntries() == 1 && !isEstimationMode())) {
1✔
258
      return toByteArray();
×
259
    }
260
    return toByteArrayV4();
1✔
261
  }
262

263
  private int computeMinLeadingZeros() {
264
    // compression is based on leading zeros in deltas between ordered hash values
265
    // assumes ordered sketch
266
    long previous = 0;
1✔
267
    long ored = 0;
1✔
268
    final HashIterator it = iterator();
1✔
269
    while (it.next()) {
1✔
270
      final long delta = it.get() - previous;
1✔
271
      ored |= delta;
1✔
272
      previous = it.get();
1✔
273
    }
1✔
274
    return Long.numberOfLeadingZeros(ored);
1✔
275
  }
276

277
  private static int wholeBytesToHoldBits(final int bits) {
278
    return (bits >>> 3) + ((bits & 7) > 0 ? 1 : 0);
1✔
279
  }
280

281
  private byte[] toByteArrayV4() {
282
    final int preambleLongs = isEstimationMode() ? 2 : 1;
1✔
283
    final int entryBits = 64 - computeMinLeadingZeros();
1✔
284
    final int compressedBits = entryBits * getRetainedEntries();
1✔
285

286
    // store num_entries as whole bytes since whole-byte blocks will follow (most probably)
287
    final int numEntriesBytes = wholeBytesToHoldBits(32 - Integer.numberOfLeadingZeros(getRetainedEntries()));
1✔
288

289
    final int size = preambleLongs * Long.BYTES + numEntriesBytes + wholeBytesToHoldBits(compressedBits);
1✔
290
    final byte[] bytes = new byte[size];
1✔
291
    final WritableMemory mem = WritableMemory.writableWrap(bytes);
1✔
292
    int offsetBytes = 0;
1✔
293
    mem.putByte(offsetBytes++, (byte) preambleLongs);
1✔
294
    mem.putByte(offsetBytes++, (byte) 4); // to do: add constant
1✔
295
    mem.putByte(offsetBytes++, (byte) Family.COMPACT.getID());
1✔
296
    mem.putByte(offsetBytes++, (byte) entryBits);
1✔
297
    mem.putByte(offsetBytes++, (byte) numEntriesBytes);
1✔
298
    mem.putByte(offsetBytes++, (byte) (COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK | ORDERED_FLAG_MASK));
1✔
299
    mem.putShort(offsetBytes, getSeedHash());
1✔
300
    offsetBytes += Short.BYTES;
1✔
301
    if (isEstimationMode()) {
1✔
302
      mem.putLong(offsetBytes, getThetaLong());
1✔
303
      offsetBytes += Long.BYTES;
1✔
304
    }
305
    int numEntries = getRetainedEntries();
1✔
306
    for (int i = 0; i < numEntriesBytes; i++) {
1✔
307
      mem.putByte(offsetBytes++, (byte) (numEntries & 0xff));
1✔
308
      numEntries >>>= 8;
1✔
309
    }
310
    long previous = 0;
1✔
311
    final long[] deltas = new long[8];
1✔
312
    final HashIterator it = iterator();
1✔
313
    int i;
314
    for (i = 0; i + 7 < getRetainedEntries(); i += 8) {
1✔
315
      for (int j = 0; j < 8; j++) {
1✔
316
        it.next();
1✔
317
        deltas[j] = it.get() - previous;
1✔
318
        previous = it.get();
1✔
319
      }
320
      BitPacking.packBitsBlock8(deltas, 0, bytes, offsetBytes, entryBits);
1✔
321
      offsetBytes += entryBits;
1✔
322
    }
323
    int offsetBits = 0;
1✔
324
    for (; i < getRetainedEntries(); i++) {
1✔
325
      it.next();
1✔
326
      final long delta = it.get() - previous;
1✔
327
      previous = it.get();
1✔
328
      BitPacking.packBits(delta, entryBits, bytes, offsetBytes, offsetBits);
1✔
329
      offsetBytes += (offsetBits + entryBits) >>> 3;
1✔
330
      offsetBits = (offsetBits + entryBits) & 7;
1✔
331
    }
332
    return bytes;
1✔
333
  }
334

335
  private static CompactSketch heapifyV4(final Memory srcMem, final long seed, final boolean enforceSeed) {
336
    final int preLongs = extractPreLongs(srcMem);
1✔
337
    final int flags = extractFlags(srcMem);
1✔
338
    final int entryBits = extractEntryBitsV4(srcMem);
1✔
339
    final int numEntriesBytes = extractNumEntriesBytesV4(srcMem);
1✔
340
    final short seedHash = (short) extractSeedHash(srcMem);
1✔
341
    final boolean isEmpty = (flags & EMPTY_FLAG_MASK) > 0;
1✔
342
    if (enforceSeed && !isEmpty) { PreambleUtil.checkMemorySeedHash(srcMem, seed); }
1✔
343
    int offsetBytes = 8;
1✔
344
    long theta = Long.MAX_VALUE;
1✔
345
    if (preLongs > 1) {
1✔
346
      theta = extractThetaLongV4(srcMem);
1✔
347
      offsetBytes += Long.BYTES;
1✔
348
    }
349
    int numEntries = 0;
1✔
350
    for (int i = 0; i < numEntriesBytes; i++) {
1✔
351
      numEntries |= Byte.toUnsignedInt(srcMem.getByte(offsetBytes++)) << (i << 3);
1✔
352
    }
353
    final long[] entries = new long[numEntries];
1✔
354
    final byte[] bytes = new byte[entryBits]; // temporary buffer for unpacking
1✔
355
    int i;
356
    for (i = 0; i + 7 < numEntries; i += 8) {
1✔
357
      srcMem.getByteArray(offsetBytes, bytes, 0, entryBits);
1✔
358
      BitPacking.unpackBitsBlock8(entries, i, bytes, 0, entryBits);
1✔
359
      offsetBytes += entryBits;
1✔
360
    }
361
    if (i < numEntries) {
1✔
362
      srcMem.getByteArray(offsetBytes, bytes, 0, wholeBytesToHoldBits((numEntries - i) * entryBits));
1✔
363
      int offsetBits = 0;
1✔
364
      offsetBytes = 0;
1✔
365
      for (; i < numEntries; i++) {
1✔
366
        BitPacking.unpackBits(entries, i, entryBits, bytes, offsetBytes, offsetBits);
1✔
367
        offsetBytes += (offsetBits + entryBits) >>> 3;
1✔
368
        offsetBits = (offsetBits + entryBits) & 7;
1✔
369
      }
370
    }
371
    // undo deltas
372
    long previous = 0;
1✔
373
    for (i = 0; i < numEntries; i++) {
1✔
374
      entries[i] += previous;
1✔
375
      previous = entries[i];
1✔
376
    }
377
    return new HeapCompactSketch(entries, isEmpty, seedHash, numEntries, theta, true);
1✔
378
  }
379

380
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc