• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

apache / datasketches-java / #306

30 Apr 2024 10:01PM UTC coverage: 97.645% (-0.5%) from 98.139%
#306

push

web-flow
Merge pull request #555 from apache/fix_pom_xml_header

Fix pom xml header

26865 of 27513 relevant lines covered (97.64%)

0.98 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

95.83
/src/main/java/org/apache/datasketches/hll/HllEstimators.java
1
/*
2
 * Licensed to the Apache Software Foundation (ASF) under one
3
 * or more contributor license agreements.  See the NOTICE file
4
 * distributed with this work for additional information
5
 * regarding copyright ownership.  The ASF licenses this file
6
 * to you under the Apache License, Version 2.0 (the
7
 * "License"); you may not use this file except in compliance
8
 * with the License.  You may obtain a copy of the License at
9
 *
10
 *   http://www.apache.org/licenses/LICENSE-2.0
11
 *
12
 * Unless required by applicable law or agreed to in writing,
13
 * software distributed under the License is distributed on an
14
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
 * KIND, either express or implied.  See the License for the
16
 * specific language governing permissions and limitations
17
 * under the License.
18
 */
19

20
package org.apache.datasketches.hll;
21

22
import static org.apache.datasketches.hll.HllUtil.MIN_LOG_K;
23

24
/**
25
 * @author Lee Rhodes
26
 * @author Kevin Lang
27
 */
28
class HllEstimators {
×
29

30
  //HLL UPPER AND LOWER BOUNDS
31

32
  /*
33
   * The upper and lower bounds are not symmetric and thus are treated slightly differently.
34
   * For the lower bound, when the unique count is <= k, LB >= numNonZeros, where
35
   * numNonZeros = k - numAtCurMin AND curMin == 0.
36
   *
37
   * For HLL6 and HLL8, curMin is always 0 and numAtCurMin is initialized to k and is decremented
38
   * down for each valid update until it reaches 0, where it stays. Thus, for these two
39
   * isomorphs, when numAtCurMin = 0, means the true curMin is > 0 and the unique count must be
40
   * greater than k.
41
   *
42
   * HLL4 always maintains both curMin and numAtCurMin dynamically. Nonetheless, the rules for
43
   * the very small values <= k where curMin = 0 still apply.
44
   */
45

46
  static final double hllLowerBound(final AbstractHllArray absHllArr, final int numStdDev) {
47
    final int lgConfigK = absHllArr.lgConfigK;
1✔
48
    final int configK = 1 << lgConfigK;
1✔
49
    final double numNonZeros =
1✔
50
        (absHllArr.getCurMin() == 0) ? configK - absHllArr.getNumAtCurMin() : configK;
1✔
51
    final double estimate = absHllArr.getEstimate();
1✔
52
    final boolean oooFlag = absHllArr.isOutOfOrder();
1✔
53
    final double relErr = BaseHllSketch.getRelErr(false, oooFlag, lgConfigK, numStdDev);
1✔
54
    return Math.max(estimate / (1.0 + relErr), numNonZeros);
1✔
55
  }
56

57
  static final double hllUpperBound(final AbstractHllArray absHllArr, final int numStdDev) {
58
    final int lgConfigK = absHllArr.lgConfigK;
1✔
59
    final double estimate = absHllArr.getEstimate();
1✔
60
    final boolean oooFlag = absHllArr.isOutOfOrder();
1✔
61
    final double relErr = BaseHllSketch.getRelErr(true, oooFlag, lgConfigK, numStdDev);
1✔
62
    return estimate / (1.0 - relErr);
1✔
63
  }
64

65
  //THE HLL COMPOSITE ESTIMATOR
66

67
  /**
68
   * This is the (non-HIP) estimator.
69
   * It is called "composite" because multiple estimators are pasted together.
70
   * @param absHllArr an instance of the AbstractHllArray class.
71
   * @return the composite estimate
72
   */
73
  //In C: again-two-registers.c hhb_get_composite_estimate L1489
74
  static final double hllCompositeEstimate(final AbstractHllArray absHllArr) {
75
    final int lgConfigK = absHllArr.getLgConfigK();
1✔
76
    final double rawEst = getHllRawEstimate(lgConfigK, absHllArr.getKxQ0() + absHllArr.getKxQ1());
1✔
77

78
    final double[] xArr = CompositeInterpolationXTable.xArrs[lgConfigK - MIN_LOG_K];
1✔
79
    final double yStride = CompositeInterpolationXTable.yStrides[lgConfigK - MIN_LOG_K];
1✔
80
    final int xArrLen = xArr.length;
1✔
81

82
    if (rawEst < xArr[0]) { return 0; }
1✔
83

84
    final int xArrLenM1 = xArrLen - 1;
1✔
85

86
    if (rawEst > xArr[xArrLenM1]) {
1✔
87
      final double finalY = yStride * (xArrLenM1);
1✔
88
      final double factor = finalY / xArr[xArrLenM1];
1✔
89
      return rawEst * factor;
1✔
90
    }
91

92
    final double adjEst =
1✔
93
        CubicInterpolation.usingXArrAndYStride(xArr, yStride, rawEst);
1✔
94

95
    // We need to completely avoid the linear_counting estimator if it might have a crazy value.
96
    // Empirical evidence suggests that the threshold 3*k will keep us safe if 2^4 <= k <= 2^21.
97

98
    if (adjEst > (3 << lgConfigK)) { return adjEst; }
1✔
99
    //Alternate call
100
    //if ((adjEst > (3 << lgConfigK)) || ((curMin != 0) || (numAtCurMin == 0)) ) { return adjEst; }
101

102
    final double linEst =
1✔
103
        getHllBitMapEstimate(lgConfigK, absHllArr.getCurMin(), absHllArr.getNumAtCurMin());
1✔
104

105
    // Bias is created when the value of an estimator is compared with a threshold to decide whether
106
    // to use that estimator or a different one.
107
    // We conjecture that less bias is created when the average of the two estimators
108
    // is compared with the threshold. Empirical measurements support this conjecture.
109

110
    final double avgEst = (adjEst + linEst) / 2.0;
1✔
111

112
    // The following constants comes from empirical measurements of the crossover point
113
    // between the average error of the linear estimator and the adjusted HLL estimator
114
    double crossOver = 0.64;
1✔
115
    if (lgConfigK == 4)      { crossOver = 0.718; }
1✔
116
    else if (lgConfigK == 5) { crossOver = 0.672; }
1✔
117

118
    return (avgEst > (crossOver * (1 << lgConfigK))) ? adjEst : linEst;
1✔
119
  }
120

121
  /**
122
   * Estimator when N is small, roughly less than k log(k).
123
   * Refer to Wikipedia: Coupon Collector Problem
124
   * @param lgConfigK the current configured lgK of the sketch
125
   * @param curMin the current minimum value of the HLL window
126
   * @param numAtCurMin the current number of slots with the value curMin
127
   * @return the very low range estimate
128
   */
129
  //In C: again-two-registers.c hhb_get_improved_linear_counting_estimate L1274
130
  private static final double getHllBitMapEstimate(
131
      final int lgConfigK, final int curMin, final int numAtCurMin) {
132
    final int configK = 1 << lgConfigK;
1✔
133
    final int numUnhitBuckets =  (curMin == 0) ? numAtCurMin : 0;
1✔
134

135
    //This will eventually go away.
136
    if (numUnhitBuckets == 0) {
1✔
137
      return configK * Math.log(configK / 0.5);
×
138
    }
139

140
    final int numHitBuckets = configK - numUnhitBuckets;
1✔
141
    return HarmonicNumbers.getBitMapEstimate(configK, numHitBuckets);
1✔
142
  }
143

144
  //In C: again-two-registers.c hhb_get_raw_estimate L1167
145
  //This algorithm is from Flajolet's, et al, 2007 HLL paper, Fig 3.
146
  private static final double getHllRawEstimate(final int lgConfigK, final double kxqSum) {
147
    final int configK = 1 << lgConfigK;
1✔
148
    final double correctionFactor;
149
    if (lgConfigK == 4) { correctionFactor = 0.673; }
1✔
150
    else if (lgConfigK == 5) { correctionFactor = 0.697; }
1✔
151
    else if (lgConfigK == 6) { correctionFactor = 0.709; }
1✔
152
    else { correctionFactor = 0.7213 / (1.0 + (1.079 / configK)); }
1✔
153
    final double hyperEst = (correctionFactor * configK * configK) / kxqSum;
1✔
154
    return hyperEst;
1✔
155
  }
156

157
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc