• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

96.09
/src/main/java/org/apache/datasketches/hllmap/HllMap.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.hllmap;
21

22
import static java.lang.Math.log;
23
import static java.lang.Math.sqrt;
24
import static org.apache.datasketches.common.Util.invPow2;
25

26
import java.util.Arrays;
27

28
import org.apache.datasketches.common.SketchesArgumentException;
29
import org.apache.datasketches.common.SuppressFBWarnings;
30
import org.apache.datasketches.hash.MurmurHash3;
31

32
/**
33
 * Implements a key-value map where the value is a compact HLL sketch of size k.
34
 * The HLL bins are compacted into 10 bins per long so that a 1024 bins are compacted into
35
 * 824 bytes, which is a 20% reduction in space. Higher density compressions are possible
36
 * (up to 50%), but the required code is much more complex and considerably slower.
37
 *
38
 * <p>Each entry row, associated with a key, also contains 3 double registers for accurately
39
 * tracking the HIP (Historical Inverse Probability) estimator. HLL implementations have multiple
40
 * estimators and the early estimators in this implementation are quite novel and provide superior
41
 * error performance over most other HLL implementations.
42
 *
43
 * @author Lee Rhodes
44
 * @author KevinLang
45
 * @author Alexander Saydakov
46
 */
47
final class HllMap extends Map {
48
  private static final double LOAD_FACTOR = 15.0 / 16.0;
49
  private static final int HLL_INIT_NUM_ENTRIES = 157;
50
  private static final int HLL_INIT_NUM_ENTRIES_ARR_SIZE = (int) Math.ceil(HLL_INIT_NUM_ENTRIES / 8.0);
1✔
51
  private static final float HLL_RESIZE_FACTOR = 2.0F;
52
  private static final double RSE = sqrt(log(2.0)) / 32.0;
1✔
53
  private final int k_;
54
  private final int hllArrLongs_; //# of longs required to store the HLL array
55

56
  private int tableEntries_;      //Full size of the table
57
  private int capacityEntries_;   //max capacity entries defined by Load factor
58
  private int curCountEntries_;   //current count of valid entries
59
  private float growthFactor_;    //e.g., 1.2 to 2.0
60
  private double entrySizeBytes_;
61

62
  //Arrays
63
  private byte[] keysArr_; //keys of zero are allowed
64
  private long[] arrOfHllArr_;
65
  private double[] invPow2SumHiArr_;
66
  private double[] invPow2SumLoArr_;
67
  private double[] hipEstAccumArr_;
68
  private byte[] stateArr_;
69

70
  /**
71
   * Private constructor used to set all finals
72
   * @param keySizeBytes size of key in bytes
73
   * @param k size of HLL sketch
74
   */
75
  private HllMap(final int keySizeBytes, final int k) {
76
    super(keySizeBytes);
1✔
77
    k_ = k;
1✔
78
    hllArrLongs_ = (k / 10) + 1;
1✔
79
  }
1✔
80

81
  static HllMap getInstance(final int keySizeBytes, final int k) {
82

83
    final HllMap map = new HllMap(keySizeBytes, k);
1✔
84
    map.tableEntries_ = HLL_INIT_NUM_ENTRIES;
1✔
85
    map.capacityEntries_ = (int)(HLL_INIT_NUM_ENTRIES * LOAD_FACTOR);
1✔
86
    map.curCountEntries_ = 0;
1✔
87
    map.growthFactor_ = HLL_RESIZE_FACTOR;
1✔
88
    map.entrySizeBytes_ = updateEntrySizeBytes(map.tableEntries_, keySizeBytes, map.hllArrLongs_);
1✔
89

90
    map.keysArr_ = new byte[HLL_INIT_NUM_ENTRIES * map.keySizeBytes_];
1✔
91
    map.arrOfHllArr_ = new long[HLL_INIT_NUM_ENTRIES * map.hllArrLongs_];
1✔
92
    map.invPow2SumHiArr_ = new double[HLL_INIT_NUM_ENTRIES];
1✔
93
    map.invPow2SumLoArr_ = new double[HLL_INIT_NUM_ENTRIES];
1✔
94
    map.hipEstAccumArr_ = new double[HLL_INIT_NUM_ENTRIES];
1✔
95
    map.stateArr_ = new byte[HLL_INIT_NUM_ENTRIES_ARR_SIZE];
1✔
96
    return map;
1✔
97
  }
98

99
  @Override
100
  double update(final byte[] key, final short coupon) {
101
    final int entryIndex = findOrInsertKey(key);
1✔
102
    return update(entryIndex, coupon);
1✔
103
  }
104

105
  @Override
106
  double update(final int entryIndex, final short coupon) {
107
    updateHll(entryIndex, coupon); //update HLL array, updates HIP
1✔
108
    return hipEstAccumArr_[entryIndex];
1✔
109
  }
110

111
  @Override
112
  double getEstimate(final byte[] key) {
113
    if (key == null) { return Double.NaN; }
1✔
114
    final int entryIndex = findKey(key);
1✔
115
    if (entryIndex < 0) {
1✔
116
      return 0;
1✔
117
    }
118
    return hipEstAccumArr_[entryIndex];
1✔
119
  }
120

121
  @Override
122
  double getUpperBound(final byte[] key) {
123
    return getEstimate(key) * (1 + RSE);
1✔
124
  }
125

126
  @Override
127
  double getLowerBound(final byte[] key) {
128
    return getEstimate(key) * (1 - RSE);
1✔
129
  }
130

131
  @Override
132
  void updateEstimate(final int entryIndex, final double estimate) {
133
    hipEstAccumArr_[entryIndex] = estimate;
1✔
134
  }
1✔
135

136
  /**
137
   * Returns the entry index for the given key given the array of keys, if found.
138
   * Otherwise, returns the one's complement of first empty entry found;
139
   * @param key the key to search for
140
   * @return the entry index of the given key, or the one's complement of the index if not found.
141
   */
142
  @Override
143
  final int findKey(final byte[] key) {
144
    final int keyLen = key.length;
1✔
145
    final long[] hash = MurmurHash3.hash(key, SEED);
1✔
146
    int entryIndex  = getIndex(hash[0], tableEntries_);
1✔
147
    final int stride = getStride(hash[1], tableEntries_);
1✔
148
    final int loopIndex = entryIndex;
1✔
149

150
    do {
151
      if (isBitClear(stateArr_, entryIndex)) { //check if slot is empty
1✔
152
        return ~entryIndex;
1✔
153
      }
154
      if (arraysEqual(key, 0, keysArr_, entryIndex * keyLen, keyLen)) { //check for key match
1✔
155
        return entryIndex;
1✔
156
      }
157
      entryIndex = (entryIndex + stride) % tableEntries_;
1✔
158
    } while (entryIndex != loopIndex);
1✔
159
    throw new SketchesArgumentException("Key not found and no empty slots!");
×
160
  }
161

162
  @Override
163
  int findOrInsertKey(final byte[] key) {
164
    int entryIndex = findKey(key);
1✔
165
    if (entryIndex < 0) { //key not found, initialize new row
1✔
166
      entryIndex = ~entryIndex;
1✔
167
      System.arraycopy(key, 0, keysArr_, entryIndex * keySizeBytes_, keySizeBytes_);
1✔
168
      setBit(stateArr_, entryIndex);
1✔
169
      invPow2SumHiArr_[entryIndex] = k_;
1✔
170
      invPow2SumLoArr_[entryIndex] = 0;
1✔
171
      hipEstAccumArr_[entryIndex] = 0;
1✔
172
      curCountEntries_++;
1✔
173
      if (curCountEntries_ > capacityEntries_) {
1✔
174
        resize();
1✔
175
        entryIndex = findKey(key);
1✔
176
        assert entryIndex >= 0;
1✔
177
      }
178
    }
179
    return entryIndex;
1✔
180
  }
181

182
  @Override
183
  double getEntrySizeBytes() {
184
    return entrySizeBytes_;
1✔
185
  }
186

187
  @Override
188
  int getTableEntries() {
189
    return tableEntries_;
1✔
190
  }
191

192
  @Override
193
  int getCapacityEntries() {
194
    return capacityEntries_;
1✔
195
  }
196

197
  @Override
198
  int getCurrentCountEntries() {
199
    return curCountEntries_;
1✔
200
  }
201

202
  @Override
203
  long getMemoryUsageBytes() {
204
    final long arrays = keysArr_.length
1✔
205
        + ((long) arrOfHllArr_.length * Long.BYTES)
206
        + ((long) invPow2SumLoArr_.length * Double.BYTES)
207
        + ((long) invPow2SumHiArr_.length * Double.BYTES)
208
        + ((long) hipEstAccumArr_.length * Double.BYTES)
209
        + stateArr_.length;
210
    final long other = (5L * Integer.BYTES) + Float.BYTES + Double.BYTES;
1✔
211
    return arrays + other;
1✔
212
  }
213

214
  @Override
215
  CouponsIterator getCouponsIterator(final int index) {
216
    // not applicable
217
    return null;
×
218
  }
219

220
  @Override
221
  int getMaxCouponsPerEntry() {
222
    // not applicable
223
    return 0;
1✔
224
  }
225

226
  @Override
227
  int getCapacityCouponsPerEntry() {
228
    // not applicable
229
    return 0;
1✔
230
  }
231

232
  @Override
233
  int getActiveEntries() {
234
    return curCountEntries_;
1✔
235
  }
236

237
  @Override
238
  int getDeletedEntries() {
239
    return 0;
1✔
240
  }
241

242
  /**
243
   * Find the first empty slot for the given key.
244
   * Only used by resize, where it is known that the key does not exist in the table.
245
   * Throws an exception if no empty slots.
246
   * @param key the given key
247
   * @param tableEntries prime size of table
248
   * @param stateArr the valid bit array
249
   * @return the first empty slot for the given key
250
   */
251
  private static final int findEmpty(final byte[] key, final int tableEntries, final byte[] stateArr) {
252
    final long[] hash = MurmurHash3.hash(key, SEED);
1✔
253
    int entryIndex  = getIndex(hash[0], tableEntries);
1✔
254
    final int stride = getStride(hash[1], tableEntries);
1✔
255
    final int loopIndex = entryIndex;
1✔
256

257
    do {
258
      if (isBitClear(stateArr, entryIndex)) { //check if slot is empty
1✔
259
        return entryIndex;
1✔
260
      }
261
      entryIndex = (entryIndex + stride) % tableEntries;
1✔
262
    } while (entryIndex != loopIndex);
1✔
263
    throw new SketchesArgumentException("No empty slots.");
×
264
  }
265

266
  //This method is specifically tied to the HLL array layout
267
  @SuppressFBWarnings(value = "IM_MULTIPLYING_RESULT_OF_IREM", justification = "False Positive")
268
  private final boolean updateHll(final int entryIndex, final int coupon) {
269
    final int newValue = coupon16Value(coupon);
1✔
270

271
    final int hllIdx = coupon & (k_ - 1); //lower lgK bits
1✔
272
    final int longIdx = hllIdx / 10;
1✔
273
    final int shift = ((hllIdx % 10) * 6) & SIX_BIT_MASK;
1✔
274

275
    long hllLong = arrOfHllArr_[(entryIndex * hllArrLongs_) + longIdx];
1✔
276
    final int oldValue = (int)(hllLong >>> shift) & SIX_BIT_MASK;
1✔
277
    if (newValue <= oldValue) { return false; }
1✔
278
    // newValue > oldValue
279

280
    //update hipEstAccum BEFORE updating invPow2Sum
281
    final double invPow2Sum = invPow2SumHiArr_[entryIndex] + invPow2SumLoArr_[entryIndex];
1✔
282
    final double oneOverQ = k_ / invPow2Sum;
1✔
283
    hipEstAccumArr_[entryIndex] += oneOverQ;
1✔
284

285
    //update invPow2Sum
286
    if (oldValue < 32) { invPow2SumHiArr_[entryIndex] -= invPow2(oldValue); }
1✔
287
    else               { invPow2SumLoArr_[entryIndex] -= invPow2(oldValue); }
×
288
    if (newValue < 32) { invPow2SumHiArr_[entryIndex] += invPow2(newValue); }
1✔
289
    else               { invPow2SumLoArr_[entryIndex] += invPow2(newValue); }
×
290

291
    //insert the new value
292
    hllLong &= ~(0X3FL << shift);  //zero out the 6-bit field
1✔
293
    hllLong |=  ((long)newValue) << shift; //insert
1✔
294
    arrOfHllArr_[(entryIndex * hllArrLongs_) + longIdx] = hllLong;
1✔
295
    return true;
1✔
296
  }
297

298
  private final void resize() {
299
    final int newTableEntries = nextPrime((int)(tableEntries_ * growthFactor_));
1✔
300
    final int newCapacityEntries = (int)(newTableEntries * LOAD_FACTOR);
1✔
301

302
    final byte[] newKeysArr = new byte[newTableEntries * keySizeBytes_];
1✔
303
    final long[] newArrOfHllArr = new long[newTableEntries * hllArrLongs_];
1✔
304
    final double[] newInvPow2Sum1 = new double[newTableEntries];
1✔
305
    final double[] newInvPow2Sum2 = new double[newTableEntries];
1✔
306
    final double[] newHipEstAccum = new double[newTableEntries];
1✔
307
    final byte[] newStateArr = new byte[(int) Math.ceil(newTableEntries / 8.0)];
1✔
308

309
    for (int oldIndex = 0; oldIndex < tableEntries_; oldIndex++) {
1✔
310
      if (isBitClear(stateArr_, oldIndex)) { continue; }
1✔
311
      // extract an old key
312
      final byte[] key =
1✔
313
          Arrays.copyOfRange(keysArr_, oldIndex * keySizeBytes_, (oldIndex + 1) * keySizeBytes_);
1✔
314
      final int newIndex = findEmpty(key, newTableEntries, newStateArr);
1✔
315
      System.arraycopy(key, 0, newKeysArr, newIndex * keySizeBytes_, keySizeBytes_); //put key
1✔
316
      //put the rest of the row
317
      System.arraycopy(arrOfHllArr_, oldIndex * hllArrLongs_, newArrOfHllArr,
1✔
318
          newIndex * hllArrLongs_, hllArrLongs_);
319
      newInvPow2Sum1[newIndex] = invPow2SumHiArr_[oldIndex];
1✔
320
      newInvPow2Sum2[newIndex] = invPow2SumLoArr_[oldIndex];
1✔
321
      newHipEstAccum[newIndex] = hipEstAccumArr_[oldIndex];
1✔
322
      setBit(newStateArr, newIndex);
1✔
323
    }
324
    //restore into sketch
325
    tableEntries_ = newTableEntries;
1✔
326
    capacityEntries_ = newCapacityEntries;
1✔
327
    //curCountEntries_, growthFactor_  unchanged
328
    entrySizeBytes_ = updateEntrySizeBytes(tableEntries_, keySizeBytes_, hllArrLongs_);
1✔
329

330
    keysArr_ = newKeysArr;
1✔
331
    arrOfHllArr_ = newArrOfHllArr;
1✔
332
    invPow2SumHiArr_ = newInvPow2Sum1; //init to k
1✔
333
    invPow2SumLoArr_ = newInvPow2Sum2; //init to 0
1✔
334
    hipEstAccumArr_ = newHipEstAccum;  //init to 0
1✔
335
    stateArr_ = newStateArr;
1✔
336
  }
1✔
337

338
  private static final double updateEntrySizeBytes(final int tableEntries, final int keySizeBytes,
339
      final int hllArrLongs) {
340
    final double byteFraction = Math.ceil(tableEntries / 8.0) / tableEntries;
1✔
341
    return keySizeBytes + ((double) hllArrLongs * Long.BYTES) + (3.0 * Double.BYTES) + byteFraction;
1✔
342
  }
343

344
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc