- 浏览: 125460 次
- 性别:
- 来自: 杭州
文章分类
最新评论
package hllctest
import java.util
import org.scalatest.{FlatSpec}
import org.spark.sqludf.HLLCounter
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
class HllcCrossSetTest extends FlatSpec {
val ramdom = new Random()
val m = 18
// 用于验证hllc 的错误率
def errorRateCal(sampleCount: Int) = {
errorRate(0.01, sampleCount)
errorRate(0.05, sampleCount)
errorRate(0.1, sampleCount)
errorRate(0.2, sampleCount)
errorRate(0.5, sampleCount)
}
"hllc test" should "hllc merge,mix error rate" in {
errorRateCal(1000)
errorRateCal(2000)
errorRateCal(5000)
errorRateCal(10000)
errorRateCal(20000)
errorRateCal(50000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
}
// 不放回抽样 测试集合生成
def getRandomStr(setCollection: mutable.HashSet[String], totalIntArray: Array[Int]): Unit = {
val str = getTestString(totalIntArray)
if (!setCollection.contains(str)) setCollection.add(str)
else
getRandomStr(setCollection, totalIntArray)
}
//5% 100w 级别集合 计算总量
def errorRate(r: Double, testSetLength: Int) = {
println(s" ********************begin r :{$r},testSetLength:{$testSetLength} ,m:{$m} ***********************")
var setA = new mutable.HashSet[String]()
var setB = new mutable.HashSet[String]()
val tatolCount = (testSetLength / r).toInt
var b = System.currentTimeMillis()
val totalIntArray = (tatolCount + "").toCharArray.map(x => x.toString.toInt)
var timeRecord = System.currentTimeMillis()
for (i <- 0 until testSetLength) {
getRandomStr(setA, totalIntArray)
getRandomStr(setB, totalIntArray)
// if (i%5000 == 0) {
// println(s" generate data ${i} ,cost: ${System.currentTimeMillis()-timeRecord}) ")
// timeRecord = System.currentTimeMillis()
// }
}
println(s"tatolCount: ${tatolCount} ,r : ${r} setA size: ${setA.size} , setB size: ${setB.size} ")
var e = System.currentTimeMillis()
println(s" generate data cost time: ${e - b} ")
/*
* realMix 交集
* hllcMix
* realMerge
* hllcMerge
* */
b = System.currentTimeMillis()
val realMixCnt = realMix(setA, setB)
val mixRate = realMixRate(realMixCnt, setA)
println(s" realMixCnt: ${realMixCnt} , mixRate:${mixRate}")
e = System.currentTimeMillis()
println(s" Map collection cost time: ${e - b} ")
b = System.currentTimeMillis()
val hllcMixCnt = hllcMix(setA, setB)
val mixRatehllc = realMixRate(hllcMixCnt, setA)
println(s" hllcMixCnt: ${hllcMixCnt} , mixRatehllc:${mixRatehllc}")
val distinct = mixRatehllc - mixRate
println(f" mixRatehllc - mixRate: $distinct%1.6f ")
println(f" hllcMixCnt - realMixCnt: ${hllcMixCnt - realMixCnt} ")
e = System.currentTimeMillis()
println(s" hllc cost time: ${e - b} ")
}
def realMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
var mixCount = 0
setB.foreach(str => if (hashSet.contains(str)) mixCount += 1)
mixCount
}
def realMixRate(mixCount: Int, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def realMixRate(mixCount: Long, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def hllcMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]): Long = {
val hllc16A = new HLLCounter(m)
setA.foreach(item => hllc16A.add(item))
val hllc16B = new HLLCounter(m)
setA.foreach(item => hllc16B.add(item))
hllc16A.getCountEstimate + hllc16B.getCountEstimate - hllcMerge(setA, setB)
}
def hllcMerge(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hllc16 = new HLLCounter(m)
setA.foreach(item => hllc16.add(item))
setB.foreach(item => hllc16.add(item))
hllc16.getCountEstimate
}
def realMerge(setA: ArrayBuffer[String], setB: ArrayBuffer[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
setB.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
hashSet.size()
}
def getTestString(totalCountArray: Array[Int]) = {
val sbf = new StringBuffer()
//没一位的数字是几, 然后根据这个来生成随机数
totalCountArray.foreach(s => {
if (!0.equals(s))
sbf.append(getRamdomStringS(s))
else sbf.append(getRamdomStringS(10))
})
sbf.toString
}
// n -> 10 ^^n
def getRamdomString(length: Int): String = {
val sbf = new StringBuffer()
for (i <- 0 until length) sbf.append((ramdom.nextInt(10) + 97).toChar)
sbf.toString
}
// n -> 10 ^^n
def getRamdomStringS(l: Int): String = {
(ramdom.nextInt(l) + 97).toChar.toString
}
}
********************begin r :{0.01},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.01 setA size: 1000 , setB size: 1000
generate data cost time: 48
realMixCnt: 12 , mixRate:0.012
Map collection cost time: 6
hllcMixCnt: 8 , mixRatehllc:0.008
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -4
hllc cost time: 131
********************begin r :{0.05},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.05 setA size: 1000 , setB size: 1000
generate data cost time: 4
realMixCnt: 49 , mixRate:0.049
Map collection cost time: 3
hllcMixCnt: 48 , mixRatehllc:0.048
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 19
********************begin r :{0.1},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.1 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 108 , mixRate:0.108
Map collection cost time: 5
hllcMixCnt: 107 , mixRatehllc:0.107
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 15
********************begin r :{0.2},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 5000 ,r : 0.2 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 196 , mixRate:0.196
Map collection cost time: 1
hllcMixCnt: 195 , mixRatehllc:0.195
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 16
********************begin r :{0.5},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 2000 ,r : 0.5 setA size: 1000 , setB size: 1000
generate data cost time: 7
realMixCnt: 489 , mixRate:0.489
Map collection cost time: 1
hllcMixCnt: 490 , mixRatehllc:0.49
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 1
hllc cost time: 8
********************begin r :{0.01},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.01 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 11 , mixRate:0.0055
Map collection cost time: 0
hllcMixCnt: 19 , mixRatehllc:0.0095
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 24
********************begin r :{0.05},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.05 setA size: 2000 , setB size: 2000
generate data cost time: 5
realMixCnt: 102 , mixRate:0.051
Map collection cost time: 1
hllcMixCnt: 110 , mixRatehllc:0.055
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 11
********************begin r :{0.1},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.1 setA size: 2000 , setB size: 2000
generate data cost time: 4
realMixCnt: 192 , mixRate:0.096
Map collection cost time: 0
hllcMixCnt: 192 , mixRatehllc:0.096
mixRatehllc - mixRate: 0.000000
hllcMixCnt - realMixCnt: 0
hllc cost time: 11
********************begin r :{0.2},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.2 setA size: 2000 , setB size: 2000
generate data cost time: 3
realMixCnt: 395 , mixRate:0.1975
Map collection cost time: 1
hllcMixCnt: 387 , mixRatehllc:0.1935
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -8
hllc cost time: 12
********************begin r :{0.5},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 4000 ,r : 0.5 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 986 , mixRate:0.493
Map collection cost time: 1
hllcMixCnt: 981 , mixRatehllc:0.4905
mixRatehllc - mixRate: -0.002500
hllcMixCnt - realMixCnt: -5
hllc cost time: 16
********************begin r :{0.01},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.01 setA size: 5000 , setB size: 5000
generate data cost time: 17
realMixCnt: 48 , mixRate:0.0096
Map collection cost time: 2
hllcMixCnt: 41 , mixRatehllc:0.0082
mixRatehllc - mixRate: -0.001400
hllcMixCnt - realMixCnt: -7
hllc cost time: 13
********************begin r :{0.05},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.05 setA size: 5000 , setB size: 5000
generate data cost time: 7
realMixCnt: 263 , mixRate:0.0526
Map collection cost time: 1
hllcMixCnt: 271 , mixRatehllc:0.0542
mixRatehllc - mixRate: 0.001600
hllcMixCnt - realMixCnt: 8
hllc cost time: 13
********************begin r :{0.1},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.1 setA size: 5000 , setB size: 5000
generate data cost time: 6
realMixCnt: 527 , mixRate:0.1054
Map collection cost time: 0
hllcMixCnt: 526 , mixRatehllc:0.1052
mixRatehllc - mixRate: -0.000200
hllcMixCnt - realMixCnt: -1
hllc cost time: 26
********************begin r :{0.2},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 25000 ,r : 0.2 setA size: 5000 , setB size: 5000
generate data cost time: 16
realMixCnt: 2505 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 2496 , mixRatehllc:0.4992
mixRatehllc - mixRate: -0.001800
hllcMixCnt - realMixCnt: -9
hllc cost time: 16
********************begin r :{0.5},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.5 setA size: 5000 , setB size: 5000
generate data cost time: 8
realMixCnt: 2499 , mixRate:0.4998
Map collection cost time: 1
hllcMixCnt: 2505 , mixRatehllc:0.501
mixRatehllc - mixRate: 0.001200
hllcMixCnt - realMixCnt: 6
hllc cost time: 14
********************begin r :{0.01},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.01 setA size: 10000 , setB size: 10000
generate data cost time: 15
realMixCnt: 103 , mixRate:0.0103
Map collection cost time: 2
hllcMixCnt: 62 , mixRatehllc:0.0062
mixRatehllc - mixRate: -0.004100
hllcMixCnt - realMixCnt: -41
hllc cost time: 24
********************begin r :{0.05},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.05 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 484 , mixRate:0.0484
Map collection cost time: 2
hllcMixCnt: 467 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.001700
hllcMixCnt - realMixCnt: -17
hllc cost time: 18
********************begin r :{0.1},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.1 setA size: 10000 , setB size: 10000
generate data cost time: 11
realMixCnt: 938 , mixRate:0.0938
Map collection cost time: 1
hllcMixCnt: 967 , mixRatehllc:0.0967
mixRatehllc - mixRate: 0.002900
hllcMixCnt - realMixCnt: 29
hllc cost time: 12
********************begin r :{0.2},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.2 setA size: 10000 , setB size: 10000
generate data cost time: 10
realMixCnt: 1997 , mixRate:0.1997
Map collection cost time: 5
hllcMixCnt: 1999 , mixRatehllc:0.1999
mixRatehllc - mixRate: 0.000200
hllcMixCnt - realMixCnt: 2
hllc cost time: 15
********************begin r :{0.5},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.5 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 5010 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 4990 , mixRatehllc:0.499
mixRatehllc - mixRate: -0.002000
hllcMixCnt - realMixCnt: -20
hllc cost time: 23
********************begin r :{0.01},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.01 setA size: 20000 , setB size: 20000
generate data cost time: 41
realMixCnt: 218 , mixRate:0.0109
Map collection cost time: 4
hllcMixCnt: 134 , mixRatehllc:0.0067
mixRatehllc - mixRate: -0.004200
hllcMixCnt - realMixCnt: -84
hllc cost time: 36
********************begin r :{0.05},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.05 setA size: 20000 , setB size: 20000
generate data cost time: 19
realMixCnt: 946 , mixRate:0.0473
Map collection cost time: 3
hllcMixCnt: 949 , mixRatehllc:0.04745
mixRatehllc - mixRate: 0.000150
hllcMixCnt - realMixCnt: 3
hllc cost time: 23
********************begin r :{0.1},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.1 setA size: 20000 , setB size: 20000
generate data cost time: 26
realMixCnt: 2001 , mixRate:0.10005
Map collection cost time: 10
hllcMixCnt: 2080 , mixRatehllc:0.104
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 56
********************begin r :{0.2},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.2 setA size: 20000 , setB size: 20000
generate data cost time: 28
realMixCnt: 4034 , mixRate:0.2017
Map collection cost time: 7
hllcMixCnt: 4113 , mixRatehllc:0.20565
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 25
********************begin r :{0.5},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.5 setA size: 20000 , setB size: 20000
generate data cost time: 24
realMixCnt: 9975 , mixRate:0.49875
Map collection cost time: 8
hllcMixCnt: 9994 , mixRatehllc:0.4997
mixRatehllc - mixRate: 0.000950
hllcMixCnt - realMixCnt: 19
hllc cost time: 23
********************begin r :{0.01},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.01 setA size: 50000 , setB size: 50000
generate data cost time: 88
realMixCnt: 468 , mixRate:0.00936
Map collection cost time: 48
hllcMixCnt: 603 , mixRatehllc:0.01206
mixRatehllc - mixRate: 0.002700
hllcMixCnt - realMixCnt: 135
hllc cost time: 163
********************begin r :{0.05},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.05 setA size: 50000 , setB size: 50000
generate data cost time: 99
realMixCnt: 2381 , mixRate:0.04762
Map collection cost time: 17
hllcMixCnt: 2335 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.000920
hllcMixCnt - realMixCnt: -46
hllc cost time: 59
********************begin r :{0.1},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.1 setA size: 50000 , setB size: 50000
generate data cost time: 52
realMixCnt: 5091 , mixRate:0.10182
Map collection cost time: 19
hllcMixCnt: 5116 , mixRatehllc:0.10232
mixRatehllc - mixRate: 0.000500
hllcMixCnt - realMixCnt: 25
hllc cost time: 88
********************begin r :{0.2},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 250000 ,r : 0.2 setA size: 50000 , setB size: 50000
generate data cost time: 72
realMixCnt: 24889 , mixRate:0.49778
Map collection cost time: 18
hllcMixCnt: 25002 , mixRatehllc:0.50004
mixRatehllc - mixRate: 0.002260
hllcMixCnt - realMixCnt: 113
hllc cost time: 61
********************begin r :{0.5},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.5 setA size: 50000 , setB size: 50000
generate data cost time: 86
realMixCnt: 25140 , mixRate:0.5028
Map collection cost time: 15
hllcMixCnt: 25190 , mixRatehllc:0.5038
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 50
hllc cost time: 71
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 154
realMixCnt: 1051 , mixRate:0.01051
Map collection cost time: 29
hllcMixCnt: 811 , mixRatehllc:0.00811
mixRatehllc - mixRate: -0.002400
hllcMixCnt - realMixCnt: -240
hllc cost time: 232
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 171
realMixCnt: 4903 , mixRate:0.04903
Map collection cost time: 19
hllcMixCnt: 5095 , mixRatehllc:0.05095
mixRatehllc - mixRate: 0.001920
hllcMixCnt - realMixCnt: 192
hllc cost time: 122
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 131
realMixCnt: 9931 , mixRate:0.09931
Map collection cost time: 42
hllcMixCnt: 10136 , mixRatehllc:0.10136
mixRatehllc - mixRate: 0.002050
hllcMixCnt - realMixCnt: 205
hllc cost time: 155
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 117
realMixCnt: 20148 , mixRate:0.20148
Map collection cost time: 35
hllcMixCnt: 20414 , mixRatehllc:0.20414
mixRatehllc - mixRate: 0.002660
hllcMixCnt - realMixCnt: 266
hllc cost time: 111
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 130
realMixCnt: 49964 , mixRate:0.49964
Map collection cost time: 35
hllcMixCnt: 50268 , mixRatehllc:0.50268
mixRatehllc - mixRate: 0.003040
hllcMixCnt - realMixCnt: 304
hllc cost time: 133
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 260
realMixCnt: 2035 , mixRate:0.010175
Map collection cost time: 83
hllcMixCnt: 1247 , mixRatehllc:0.006235
mixRatehllc - mixRate: -0.003940
hllcMixCnt - realMixCnt: -788
hllc cost time: 389
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 311
realMixCnt: 10159 , mixRate:0.050795
Map collection cost time: 94
hllcMixCnt: 10030 , mixRatehllc:0.05015
mixRatehllc - mixRate: -0.000645
hllcMixCnt - realMixCnt: -129
hllc cost time: 308
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 255
realMixCnt: 20009 , mixRate:0.100045
Map collection cost time: 133
hllcMixCnt: 19539 , mixRatehllc:0.097695
mixRatehllc - mixRate: -0.002350
hllcMixCnt - realMixCnt: -470
hllc cost time: 235
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 229
realMixCnt: 39946 , mixRate:0.19973
Map collection cost time: 92
hllcMixCnt: 41310 , mixRatehllc:0.20655
mixRatehllc - mixRate: 0.006820
hllcMixCnt - realMixCnt: 1364
hllc cost time: 271
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 357
realMixCnt: 100095 , mixRate:0.500475
Map collection cost time: 93
hllcMixCnt: 100242 , mixRatehllc:0.50121
mixRatehllc - mixRate: 0.000735
hllcMixCnt - realMixCnt: 147
hllc cost time: 422
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 758
realMixCnt: 5084 , mixRate:0.010168
Map collection cost time: 211
hllcMixCnt: 2978 , mixRatehllc:0.005956
mixRatehllc - mixRate: -0.004212
hllcMixCnt - realMixCnt: -2106
hllc cost time: 844
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 721
realMixCnt: 25296 , mixRate:0.050592
Map collection cost time: 222
hllcMixCnt: 23440 , mixRatehllc:0.04688
mixRatehllc - mixRate: -0.003712
hllcMixCnt - realMixCnt: -1856
hllc cost time: 699
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 688
realMixCnt: 50178 , mixRate:0.100356
Map collection cost time: 200
hllcMixCnt: 45070 , mixRatehllc:0.09014
mixRatehllc - mixRate: -0.010216
hllcMixCnt - realMixCnt: -5108
hllc cost time: 701
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 897
realMixCnt: 249899 , mixRate:0.499798
Map collection cost time: 223
hllcMixCnt: 250263 , mixRatehllc:0.500526
mixRatehllc - mixRate: 0.000728
hllcMixCnt - realMixCnt: 364
hllc cost time: 658
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 868
realMixCnt: 249895 , mixRate:0.49979
Map collection cost time: 245
hllcMixCnt: 249916 , mixRatehllc:0.499832
mixRatehllc - mixRate: 0.000042
hllcMixCnt - realMixCnt: 21
hllc cost time: 724
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 110
realMixCnt: 1026 , mixRate:0.01026
Map collection cost time: 28
hllcMixCnt: 569 , mixRatehllc:0.00569
mixRatehllc - mixRate: -0.004570
hllcMixCnt - realMixCnt: -457
hllc cost time: 95
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 91
realMixCnt: 5024 , mixRate:0.05024
Map collection cost time: 26
hllcMixCnt: 5439 , mixRatehllc:0.05439
mixRatehllc - mixRate: 0.004150
hllcMixCnt - realMixCnt: 415
hllc cost time: 131
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 93
realMixCnt: 9925 , mixRate:0.09925
Map collection cost time: 28
hllcMixCnt: 10201 , mixRatehllc:0.10201
mixRatehllc - mixRate: 0.002760
hllcMixCnt - realMixCnt: 276
hllc cost time: 141
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 90
realMixCnt: 19983 , mixRate:0.19983
Map collection cost time: 32
hllcMixCnt: 19936 , mixRatehllc:0.19936
mixRatehllc - mixRate: -0.000470
hllcMixCnt - realMixCnt: -47
hllc cost time: 128
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 121
realMixCnt: 50027 , mixRate:0.50027
Map collection cost time: 35
hllcMixCnt: 49726 , mixRatehllc:0.49726
mixRatehllc - mixRate: -0.003010
hllcMixCnt - realMixCnt: -301
hllc cost time: 137
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 247
realMixCnt: 1991 , mixRate:0.009955
Map collection cost time: 38
hllcMixCnt: 2118 , mixRatehllc:0.01059
mixRatehllc - mixRate: 0.000635
hllcMixCnt - realMixCnt: 127
hllc cost time: 200
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 225
realMixCnt: 10000 , mixRate:0.05
Map collection cost time: 71
hllcMixCnt: 9751 , mixRatehllc:0.048755
mixRatehllc - mixRate: -0.001245
hllcMixCnt - realMixCnt: -249
hllc cost time: 273
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 224
realMixCnt: 19974 , mixRate:0.09987
Map collection cost time: 71
hllcMixCnt: 19810 , mixRatehllc:0.09905
mixRatehllc - mixRate: -0.000820
hllcMixCnt - realMixCnt: -164
hllc cost time: 300
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 243
realMixCnt: 40093 , mixRate:0.200465
Map collection cost time: 82
hllcMixCnt: 40549 , mixRatehllc:0.202745
mixRatehllc - mixRate: 0.002280
hllcMixCnt - realMixCnt: 456
hllc cost time: 297
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 283
realMixCnt: 99874 , mixRate:0.49937
Map collection cost time: 88
hllcMixCnt: 99730 , mixRatehllc:0.49865
mixRatehllc - mixRate: -0.000720
hllcMixCnt - realMixCnt: -144
hllc cost time: 306
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 678
realMixCnt: 5148 , mixRate:0.010296
Map collection cost time: 181
hllcMixCnt: 3895 , mixRatehllc:0.00779
mixRatehllc - mixRate: -0.002506
hllcMixCnt - realMixCnt: -1253
hllc cost time: 673
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 820
realMixCnt: 25131 , mixRate:0.050262
Map collection cost time: 185
hllcMixCnt: 24850 , mixRatehllc:0.0497
mixRatehllc - mixRate: -0.000562
hllcMixCnt - realMixCnt: -281
hllc cost time: 647
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 691
realMixCnt: 49911 , mixRate:0.099822
Map collection cost time: 187
hllcMixCnt: 50951 , mixRatehllc:0.101902
mixRatehllc - mixRate: 0.002080
hllcMixCnt - realMixCnt: 1040
hllc cost time: 690
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 888
realMixCnt: 250250 , mixRate:0.5005
Map collection cost time: 212
hllcMixCnt: 249358 , mixRatehllc:0.498716
mixRatehllc - mixRate: -0.001784
hllcMixCnt - realMixCnt: -892
hllc cost time: 608
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 840
realMixCnt: 249691 , mixRate:0.499382
Map collection cost time: 230
hllcMixCnt: 249833 , mixRatehllc:0.499666
mixRatehllc - mixRate: 0.000284
hllcMixCnt - realMixCnt: 142
hllc cost time: 714
Process finished with exit code 0
import java.util
import org.scalatest.{FlatSpec}
import org.spark.sqludf.HLLCounter
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
class HllcCrossSetTest extends FlatSpec {
val ramdom = new Random()
val m = 18
// 用于验证hllc 的错误率
def errorRateCal(sampleCount: Int) = {
errorRate(0.01, sampleCount)
errorRate(0.05, sampleCount)
errorRate(0.1, sampleCount)
errorRate(0.2, sampleCount)
errorRate(0.5, sampleCount)
}
"hllc test" should "hllc merge,mix error rate" in {
errorRateCal(1000)
errorRateCal(2000)
errorRateCal(5000)
errorRateCal(10000)
errorRateCal(20000)
errorRateCal(50000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
errorRateCal(100000)
errorRateCal(200000)
errorRateCal(500000)
}
// 不放回抽样 测试集合生成
def getRandomStr(setCollection: mutable.HashSet[String], totalIntArray: Array[Int]): Unit = {
val str = getTestString(totalIntArray)
if (!setCollection.contains(str)) setCollection.add(str)
else
getRandomStr(setCollection, totalIntArray)
}
//5% 100w 级别集合 计算总量
def errorRate(r: Double, testSetLength: Int) = {
println(s" ********************begin r :{$r},testSetLength:{$testSetLength} ,m:{$m} ***********************")
var setA = new mutable.HashSet[String]()
var setB = new mutable.HashSet[String]()
val tatolCount = (testSetLength / r).toInt
var b = System.currentTimeMillis()
val totalIntArray = (tatolCount + "").toCharArray.map(x => x.toString.toInt)
var timeRecord = System.currentTimeMillis()
for (i <- 0 until testSetLength) {
getRandomStr(setA, totalIntArray)
getRandomStr(setB, totalIntArray)
// if (i%5000 == 0) {
// println(s" generate data ${i} ,cost: ${System.currentTimeMillis()-timeRecord}) ")
// timeRecord = System.currentTimeMillis()
// }
}
println(s"tatolCount: ${tatolCount} ,r : ${r} setA size: ${setA.size} , setB size: ${setB.size} ")
var e = System.currentTimeMillis()
println(s" generate data cost time: ${e - b} ")
/*
* realMix 交集
* hllcMix
* realMerge
* hllcMerge
* */
b = System.currentTimeMillis()
val realMixCnt = realMix(setA, setB)
val mixRate = realMixRate(realMixCnt, setA)
println(s" realMixCnt: ${realMixCnt} , mixRate:${mixRate}")
e = System.currentTimeMillis()
println(s" Map collection cost time: ${e - b} ")
b = System.currentTimeMillis()
val hllcMixCnt = hllcMix(setA, setB)
val mixRatehllc = realMixRate(hllcMixCnt, setA)
println(s" hllcMixCnt: ${hllcMixCnt} , mixRatehllc:${mixRatehllc}")
val distinct = mixRatehllc - mixRate
println(f" mixRatehllc - mixRate: $distinct%1.6f ")
println(f" hllcMixCnt - realMixCnt: ${hllcMixCnt - realMixCnt} ")
e = System.currentTimeMillis()
println(s" hllc cost time: ${e - b} ")
}
def realMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
var mixCount = 0
setB.foreach(str => if (hashSet.contains(str)) mixCount += 1)
mixCount
}
def realMixRate(mixCount: Int, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def realMixRate(mixCount: Long, set: mutable.HashSet[String]) = {
mixCount * 1.0 / set.size
}
def hllcMix(setA: mutable.HashSet[String], setB: mutable.HashSet[String]): Long = {
val hllc16A = new HLLCounter(m)
setA.foreach(item => hllc16A.add(item))
val hllc16B = new HLLCounter(m)
setA.foreach(item => hllc16B.add(item))
hllc16A.getCountEstimate + hllc16B.getCountEstimate - hllcMerge(setA, setB)
}
def hllcMerge(setA: mutable.HashSet[String], setB: mutable.HashSet[String]) = {
val hllc16 = new HLLCounter(m)
setA.foreach(item => hllc16.add(item))
setB.foreach(item => hllc16.add(item))
hllc16.getCountEstimate
}
def realMerge(setA: ArrayBuffer[String], setB: ArrayBuffer[String]) = {
val hashSet = new util.HashSet[String]
setA.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
setB.foreach(str => if (!hashSet.contains(str)) hashSet.add(str))
hashSet.size()
}
def getTestString(totalCountArray: Array[Int]) = {
val sbf = new StringBuffer()
//没一位的数字是几, 然后根据这个来生成随机数
totalCountArray.foreach(s => {
if (!0.equals(s))
sbf.append(getRamdomStringS(s))
else sbf.append(getRamdomStringS(10))
})
sbf.toString
}
// n -> 10 ^^n
def getRamdomString(length: Int): String = {
val sbf = new StringBuffer()
for (i <- 0 until length) sbf.append((ramdom.nextInt(10) + 97).toChar)
sbf.toString
}
// n -> 10 ^^n
def getRamdomStringS(l: Int): String = {
(ramdom.nextInt(l) + 97).toChar.toString
}
}
********************begin r :{0.01},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.01 setA size: 1000 , setB size: 1000
generate data cost time: 48
realMixCnt: 12 , mixRate:0.012
Map collection cost time: 6
hllcMixCnt: 8 , mixRatehllc:0.008
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -4
hllc cost time: 131
********************begin r :{0.05},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.05 setA size: 1000 , setB size: 1000
generate data cost time: 4
realMixCnt: 49 , mixRate:0.049
Map collection cost time: 3
hllcMixCnt: 48 , mixRatehllc:0.048
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 19
********************begin r :{0.1},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.1 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 108 , mixRate:0.108
Map collection cost time: 5
hllcMixCnt: 107 , mixRatehllc:0.107
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 15
********************begin r :{0.2},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 5000 ,r : 0.2 setA size: 1000 , setB size: 1000
generate data cost time: 3
realMixCnt: 196 , mixRate:0.196
Map collection cost time: 1
hllcMixCnt: 195 , mixRatehllc:0.195
mixRatehllc - mixRate: -0.001000
hllcMixCnt - realMixCnt: -1
hllc cost time: 16
********************begin r :{0.5},testSetLength:{1000} ,m:{18} ***********************
tatolCount: 2000 ,r : 0.5 setA size: 1000 , setB size: 1000
generate data cost time: 7
realMixCnt: 489 , mixRate:0.489
Map collection cost time: 1
hllcMixCnt: 490 , mixRatehllc:0.49
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 1
hllc cost time: 8
********************begin r :{0.01},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.01 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 11 , mixRate:0.0055
Map collection cost time: 0
hllcMixCnt: 19 , mixRatehllc:0.0095
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 24
********************begin r :{0.05},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.05 setA size: 2000 , setB size: 2000
generate data cost time: 5
realMixCnt: 102 , mixRate:0.051
Map collection cost time: 1
hllcMixCnt: 110 , mixRatehllc:0.055
mixRatehllc - mixRate: 0.004000
hllcMixCnt - realMixCnt: 8
hllc cost time: 11
********************begin r :{0.1},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.1 setA size: 2000 , setB size: 2000
generate data cost time: 4
realMixCnt: 192 , mixRate:0.096
Map collection cost time: 0
hllcMixCnt: 192 , mixRatehllc:0.096
mixRatehllc - mixRate: 0.000000
hllcMixCnt - realMixCnt: 0
hllc cost time: 11
********************begin r :{0.2},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.2 setA size: 2000 , setB size: 2000
generate data cost time: 3
realMixCnt: 395 , mixRate:0.1975
Map collection cost time: 1
hllcMixCnt: 387 , mixRatehllc:0.1935
mixRatehllc - mixRate: -0.004000
hllcMixCnt - realMixCnt: -8
hllc cost time: 12
********************begin r :{0.5},testSetLength:{2000} ,m:{18} ***********************
tatolCount: 4000 ,r : 0.5 setA size: 2000 , setB size: 2000
generate data cost time: 6
realMixCnt: 986 , mixRate:0.493
Map collection cost time: 1
hllcMixCnt: 981 , mixRatehllc:0.4905
mixRatehllc - mixRate: -0.002500
hllcMixCnt - realMixCnt: -5
hllc cost time: 16
********************begin r :{0.01},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.01 setA size: 5000 , setB size: 5000
generate data cost time: 17
realMixCnt: 48 , mixRate:0.0096
Map collection cost time: 2
hllcMixCnt: 41 , mixRatehllc:0.0082
mixRatehllc - mixRate: -0.001400
hllcMixCnt - realMixCnt: -7
hllc cost time: 13
********************begin r :{0.05},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.05 setA size: 5000 , setB size: 5000
generate data cost time: 7
realMixCnt: 263 , mixRate:0.0526
Map collection cost time: 1
hllcMixCnt: 271 , mixRatehllc:0.0542
mixRatehllc - mixRate: 0.001600
hllcMixCnt - realMixCnt: 8
hllc cost time: 13
********************begin r :{0.1},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.1 setA size: 5000 , setB size: 5000
generate data cost time: 6
realMixCnt: 527 , mixRate:0.1054
Map collection cost time: 0
hllcMixCnt: 526 , mixRatehllc:0.1052
mixRatehllc - mixRate: -0.000200
hllcMixCnt - realMixCnt: -1
hllc cost time: 26
********************begin r :{0.2},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 25000 ,r : 0.2 setA size: 5000 , setB size: 5000
generate data cost time: 16
realMixCnt: 2505 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 2496 , mixRatehllc:0.4992
mixRatehllc - mixRate: -0.001800
hllcMixCnt - realMixCnt: -9
hllc cost time: 16
********************begin r :{0.5},testSetLength:{5000} ,m:{18} ***********************
tatolCount: 10000 ,r : 0.5 setA size: 5000 , setB size: 5000
generate data cost time: 8
realMixCnt: 2499 , mixRate:0.4998
Map collection cost time: 1
hllcMixCnt: 2505 , mixRatehllc:0.501
mixRatehllc - mixRate: 0.001200
hllcMixCnt - realMixCnt: 6
hllc cost time: 14
********************begin r :{0.01},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.01 setA size: 10000 , setB size: 10000
generate data cost time: 15
realMixCnt: 103 , mixRate:0.0103
Map collection cost time: 2
hllcMixCnt: 62 , mixRatehllc:0.0062
mixRatehllc - mixRate: -0.004100
hllcMixCnt - realMixCnt: -41
hllc cost time: 24
********************begin r :{0.05},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.05 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 484 , mixRate:0.0484
Map collection cost time: 2
hllcMixCnt: 467 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.001700
hllcMixCnt - realMixCnt: -17
hllc cost time: 18
********************begin r :{0.1},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.1 setA size: 10000 , setB size: 10000
generate data cost time: 11
realMixCnt: 938 , mixRate:0.0938
Map collection cost time: 1
hllcMixCnt: 967 , mixRatehllc:0.0967
mixRatehllc - mixRate: 0.002900
hllcMixCnt - realMixCnt: 29
hllc cost time: 12
********************begin r :{0.2},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 50000 ,r : 0.2 setA size: 10000 , setB size: 10000
generate data cost time: 10
realMixCnt: 1997 , mixRate:0.1997
Map collection cost time: 5
hllcMixCnt: 1999 , mixRatehllc:0.1999
mixRatehllc - mixRate: 0.000200
hllcMixCnt - realMixCnt: 2
hllc cost time: 15
********************begin r :{0.5},testSetLength:{10000} ,m:{18} ***********************
tatolCount: 20000 ,r : 0.5 setA size: 10000 , setB size: 10000
generate data cost time: 18
realMixCnt: 5010 , mixRate:0.501
Map collection cost time: 4
hllcMixCnt: 4990 , mixRatehllc:0.499
mixRatehllc - mixRate: -0.002000
hllcMixCnt - realMixCnt: -20
hllc cost time: 23
********************begin r :{0.01},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.01 setA size: 20000 , setB size: 20000
generate data cost time: 41
realMixCnt: 218 , mixRate:0.0109
Map collection cost time: 4
hllcMixCnt: 134 , mixRatehllc:0.0067
mixRatehllc - mixRate: -0.004200
hllcMixCnt - realMixCnt: -84
hllc cost time: 36
********************begin r :{0.05},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.05 setA size: 20000 , setB size: 20000
generate data cost time: 19
realMixCnt: 946 , mixRate:0.0473
Map collection cost time: 3
hllcMixCnt: 949 , mixRatehllc:0.04745
mixRatehllc - mixRate: 0.000150
hllcMixCnt - realMixCnt: 3
hllc cost time: 23
********************begin r :{0.1},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.1 setA size: 20000 , setB size: 20000
generate data cost time: 26
realMixCnt: 2001 , mixRate:0.10005
Map collection cost time: 10
hllcMixCnt: 2080 , mixRatehllc:0.104
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 56
********************begin r :{0.2},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.2 setA size: 20000 , setB size: 20000
generate data cost time: 28
realMixCnt: 4034 , mixRate:0.2017
Map collection cost time: 7
hllcMixCnt: 4113 , mixRatehllc:0.20565
mixRatehllc - mixRate: 0.003950
hllcMixCnt - realMixCnt: 79
hllc cost time: 25
********************begin r :{0.5},testSetLength:{20000} ,m:{18} ***********************
tatolCount: 40000 ,r : 0.5 setA size: 20000 , setB size: 20000
generate data cost time: 24
realMixCnt: 9975 , mixRate:0.49875
Map collection cost time: 8
hllcMixCnt: 9994 , mixRatehllc:0.4997
mixRatehllc - mixRate: 0.000950
hllcMixCnt - realMixCnt: 19
hllc cost time: 23
********************begin r :{0.01},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.01 setA size: 50000 , setB size: 50000
generate data cost time: 88
realMixCnt: 468 , mixRate:0.00936
Map collection cost time: 48
hllcMixCnt: 603 , mixRatehllc:0.01206
mixRatehllc - mixRate: 0.002700
hllcMixCnt - realMixCnt: 135
hllc cost time: 163
********************begin r :{0.05},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.05 setA size: 50000 , setB size: 50000
generate data cost time: 99
realMixCnt: 2381 , mixRate:0.04762
Map collection cost time: 17
hllcMixCnt: 2335 , mixRatehllc:0.0467
mixRatehllc - mixRate: -0.000920
hllcMixCnt - realMixCnt: -46
hllc cost time: 59
********************begin r :{0.1},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.1 setA size: 50000 , setB size: 50000
generate data cost time: 52
realMixCnt: 5091 , mixRate:0.10182
Map collection cost time: 19
hllcMixCnt: 5116 , mixRatehllc:0.10232
mixRatehllc - mixRate: 0.000500
hllcMixCnt - realMixCnt: 25
hllc cost time: 88
********************begin r :{0.2},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 250000 ,r : 0.2 setA size: 50000 , setB size: 50000
generate data cost time: 72
realMixCnt: 24889 , mixRate:0.49778
Map collection cost time: 18
hllcMixCnt: 25002 , mixRatehllc:0.50004
mixRatehllc - mixRate: 0.002260
hllcMixCnt - realMixCnt: 113
hllc cost time: 61
********************begin r :{0.5},testSetLength:{50000} ,m:{18} ***********************
tatolCount: 100000 ,r : 0.5 setA size: 50000 , setB size: 50000
generate data cost time: 86
realMixCnt: 25140 , mixRate:0.5028
Map collection cost time: 15
hllcMixCnt: 25190 , mixRatehllc:0.5038
mixRatehllc - mixRate: 0.001000
hllcMixCnt - realMixCnt: 50
hllc cost time: 71
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 154
realMixCnt: 1051 , mixRate:0.01051
Map collection cost time: 29
hllcMixCnt: 811 , mixRatehllc:0.00811
mixRatehllc - mixRate: -0.002400
hllcMixCnt - realMixCnt: -240
hllc cost time: 232
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 171
realMixCnt: 4903 , mixRate:0.04903
Map collection cost time: 19
hllcMixCnt: 5095 , mixRatehllc:0.05095
mixRatehllc - mixRate: 0.001920
hllcMixCnt - realMixCnt: 192
hllc cost time: 122
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 131
realMixCnt: 9931 , mixRate:0.09931
Map collection cost time: 42
hllcMixCnt: 10136 , mixRatehllc:0.10136
mixRatehllc - mixRate: 0.002050
hllcMixCnt - realMixCnt: 205
hllc cost time: 155
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 117
realMixCnt: 20148 , mixRate:0.20148
Map collection cost time: 35
hllcMixCnt: 20414 , mixRatehllc:0.20414
mixRatehllc - mixRate: 0.002660
hllcMixCnt - realMixCnt: 266
hllc cost time: 111
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 130
realMixCnt: 49964 , mixRate:0.49964
Map collection cost time: 35
hllcMixCnt: 50268 , mixRatehllc:0.50268
mixRatehllc - mixRate: 0.003040
hllcMixCnt - realMixCnt: 304
hllc cost time: 133
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 260
realMixCnt: 2035 , mixRate:0.010175
Map collection cost time: 83
hllcMixCnt: 1247 , mixRatehllc:0.006235
mixRatehllc - mixRate: -0.003940
hllcMixCnt - realMixCnt: -788
hllc cost time: 389
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 311
realMixCnt: 10159 , mixRate:0.050795
Map collection cost time: 94
hllcMixCnt: 10030 , mixRatehllc:0.05015
mixRatehllc - mixRate: -0.000645
hllcMixCnt - realMixCnt: -129
hllc cost time: 308
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 255
realMixCnt: 20009 , mixRate:0.100045
Map collection cost time: 133
hllcMixCnt: 19539 , mixRatehllc:0.097695
mixRatehllc - mixRate: -0.002350
hllcMixCnt - realMixCnt: -470
hllc cost time: 235
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 229
realMixCnt: 39946 , mixRate:0.19973
Map collection cost time: 92
hllcMixCnt: 41310 , mixRatehllc:0.20655
mixRatehllc - mixRate: 0.006820
hllcMixCnt - realMixCnt: 1364
hllc cost time: 271
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 357
realMixCnt: 100095 , mixRate:0.500475
Map collection cost time: 93
hllcMixCnt: 100242 , mixRatehllc:0.50121
mixRatehllc - mixRate: 0.000735
hllcMixCnt - realMixCnt: 147
hllc cost time: 422
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 758
realMixCnt: 5084 , mixRate:0.010168
Map collection cost time: 211
hllcMixCnt: 2978 , mixRatehllc:0.005956
mixRatehllc - mixRate: -0.004212
hllcMixCnt - realMixCnt: -2106
hllc cost time: 844
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 721
realMixCnt: 25296 , mixRate:0.050592
Map collection cost time: 222
hllcMixCnt: 23440 , mixRatehllc:0.04688
mixRatehllc - mixRate: -0.003712
hllcMixCnt - realMixCnt: -1856
hllc cost time: 699
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 688
realMixCnt: 50178 , mixRate:0.100356
Map collection cost time: 200
hllcMixCnt: 45070 , mixRatehllc:0.09014
mixRatehllc - mixRate: -0.010216
hllcMixCnt - realMixCnt: -5108
hllc cost time: 701
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 897
realMixCnt: 249899 , mixRate:0.499798
Map collection cost time: 223
hllcMixCnt: 250263 , mixRatehllc:0.500526
mixRatehllc - mixRate: 0.000728
hllcMixCnt - realMixCnt: 364
hllc cost time: 658
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 868
realMixCnt: 249895 , mixRate:0.49979
Map collection cost time: 245
hllcMixCnt: 249916 , mixRatehllc:0.499832
mixRatehllc - mixRate: 0.000042
hllcMixCnt - realMixCnt: 21
hllc cost time: 724
********************begin r :{0.01},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.01 setA size: 100000 , setB size: 100000
generate data cost time: 110
realMixCnt: 1026 , mixRate:0.01026
Map collection cost time: 28
hllcMixCnt: 569 , mixRatehllc:0.00569
mixRatehllc - mixRate: -0.004570
hllcMixCnt - realMixCnt: -457
hllc cost time: 95
********************begin r :{0.05},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.05 setA size: 100000 , setB size: 100000
generate data cost time: 91
realMixCnt: 5024 , mixRate:0.05024
Map collection cost time: 26
hllcMixCnt: 5439 , mixRatehllc:0.05439
mixRatehllc - mixRate: 0.004150
hllcMixCnt - realMixCnt: 415
hllc cost time: 131
********************begin r :{0.1},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.1 setA size: 100000 , setB size: 100000
generate data cost time: 93
realMixCnt: 9925 , mixRate:0.09925
Map collection cost time: 28
hllcMixCnt: 10201 , mixRatehllc:0.10201
mixRatehllc - mixRate: 0.002760
hllcMixCnt - realMixCnt: 276
hllc cost time: 141
********************begin r :{0.2},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 500000 ,r : 0.2 setA size: 100000 , setB size: 100000
generate data cost time: 90
realMixCnt: 19983 , mixRate:0.19983
Map collection cost time: 32
hllcMixCnt: 19936 , mixRatehllc:0.19936
mixRatehllc - mixRate: -0.000470
hllcMixCnt - realMixCnt: -47
hllc cost time: 128
********************begin r :{0.5},testSetLength:{100000} ,m:{18} ***********************
tatolCount: 200000 ,r : 0.5 setA size: 100000 , setB size: 100000
generate data cost time: 121
realMixCnt: 50027 , mixRate:0.50027
Map collection cost time: 35
hllcMixCnt: 49726 , mixRatehllc:0.49726
mixRatehllc - mixRate: -0.003010
hllcMixCnt - realMixCnt: -301
hllc cost time: 137
********************begin r :{0.01},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 20000000 ,r : 0.01 setA size: 200000 , setB size: 200000
generate data cost time: 247
realMixCnt: 1991 , mixRate:0.009955
Map collection cost time: 38
hllcMixCnt: 2118 , mixRatehllc:0.01059
mixRatehllc - mixRate: 0.000635
hllcMixCnt - realMixCnt: 127
hllc cost time: 200
********************begin r :{0.05},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 4000000 ,r : 0.05 setA size: 200000 , setB size: 200000
generate data cost time: 225
realMixCnt: 10000 , mixRate:0.05
Map collection cost time: 71
hllcMixCnt: 9751 , mixRatehllc:0.048755
mixRatehllc - mixRate: -0.001245
hllcMixCnt - realMixCnt: -249
hllc cost time: 273
********************begin r :{0.1},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 2000000 ,r : 0.1 setA size: 200000 , setB size: 200000
generate data cost time: 224
realMixCnt: 19974 , mixRate:0.09987
Map collection cost time: 71
hllcMixCnt: 19810 , mixRatehllc:0.09905
mixRatehllc - mixRate: -0.000820
hllcMixCnt - realMixCnt: -164
hllc cost time: 300
********************begin r :{0.2},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.2 setA size: 200000 , setB size: 200000
generate data cost time: 243
realMixCnt: 40093 , mixRate:0.200465
Map collection cost time: 82
hllcMixCnt: 40549 , mixRatehllc:0.202745
mixRatehllc - mixRate: 0.002280
hllcMixCnt - realMixCnt: 456
hllc cost time: 297
********************begin r :{0.5},testSetLength:{200000} ,m:{18} ***********************
tatolCount: 400000 ,r : 0.5 setA size: 200000 , setB size: 200000
generate data cost time: 283
realMixCnt: 99874 , mixRate:0.49937
Map collection cost time: 88
hllcMixCnt: 99730 , mixRatehllc:0.49865
mixRatehllc - mixRate: -0.000720
hllcMixCnt - realMixCnt: -144
hllc cost time: 306
********************begin r :{0.01},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 50000000 ,r : 0.01 setA size: 500000 , setB size: 500000
generate data cost time: 678
realMixCnt: 5148 , mixRate:0.010296
Map collection cost time: 181
hllcMixCnt: 3895 , mixRatehllc:0.00779
mixRatehllc - mixRate: -0.002506
hllcMixCnt - realMixCnt: -1253
hllc cost time: 673
********************begin r :{0.05},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 10000000 ,r : 0.05 setA size: 500000 , setB size: 500000
generate data cost time: 820
realMixCnt: 25131 , mixRate:0.050262
Map collection cost time: 185
hllcMixCnt: 24850 , mixRatehllc:0.0497
mixRatehllc - mixRate: -0.000562
hllcMixCnt - realMixCnt: -281
hllc cost time: 647
********************begin r :{0.1},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 5000000 ,r : 0.1 setA size: 500000 , setB size: 500000
generate data cost time: 691
realMixCnt: 49911 , mixRate:0.099822
Map collection cost time: 187
hllcMixCnt: 50951 , mixRatehllc:0.101902
mixRatehllc - mixRate: 0.002080
hllcMixCnt - realMixCnt: 1040
hllc cost time: 690
********************begin r :{0.2},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 2500000 ,r : 0.2 setA size: 500000 , setB size: 500000
generate data cost time: 888
realMixCnt: 250250 , mixRate:0.5005
Map collection cost time: 212
hllcMixCnt: 249358 , mixRatehllc:0.498716
mixRatehllc - mixRate: -0.001784
hllcMixCnt - realMixCnt: -892
hllc cost time: 608
********************begin r :{0.5},testSetLength:{500000} ,m:{18} ***********************
tatolCount: 1000000 ,r : 0.5 setA size: 500000 , setB size: 500000
generate data cost time: 840
realMixCnt: 249691 , mixRate:0.499382
Map collection cost time: 230
hllcMixCnt: 249833 , mixRatehllc:0.499666
mixRatehllc - mixRate: 0.000284
hllcMixCnt - realMixCnt: 142
hllc cost time: 714
Process finished with exit code 0
发表评论
-
hllc 不同M 的 小基数的误差率
2017-12-22 14:20 462结论: testHllcError(10, tt) ... -
newExecuteStatementOperation single session
2017-10-16 09:52 568var udfNotInited = true o ... -
yarn spark
2017-09-19 14:08 282--master yarn --deploy-mode cli ... -
test code 09-18-2
2017-09-18 18:47 315object FunnelUtil { var gson ... -
交流 code 09-18
2017-09-18 18:45 303object DataProcess extends Ap ... -
mvn + scala support
2017-09-15 10:00 400<build> <p ... -
THREAD TEST
2017-09-12 18:07 356val THREAD_POOL_SIZE = 10 ... -
json
2017-09-07 10:21 388val gson: Gson = new GsonBuil ... -
curreying function
2017-08-09 15:27 290benchmark2("hllc") ... -
file io
2017-02-04 20:10 303window 下通过 source 读文件各种鬼 改用 Buf ...
相关推荐
基数排序算法,这是分不错的算法,实现起来有点难,用静态链表实现,每次改变结点的指向,希望有用,多多指教
基数排序是一种非比较型整数排序算法,它的原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种算法对于大数据量的排序尤其有效,因为其时间复杂度为线性,即O(n*k),其中n是待排序的元素数量,k是每...
该算法用C++语言实现了基数排序算法,已经调试通过,在Linux系统环境中运行结果正常
基数排序是一种非比较型整数排序算法,其原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种算法对于大数据量的排序尤其有效,尤其是在数值范围较大的情况下。在Linux系统,特别是Ubuntu 13.04这样...
基数排序是一种非比较型整数排序算法,它的原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种算法是通过分配、收集和再分配的步骤来实现的,它不依赖于元素之间的比较,而是依赖于数字的位数。这种...
为了验证新算法的有效性,研究者进行了实验对比。实验结果显示,与传统的插入排序、选择排序、冒泡排序等算法相比,新提出的算法不仅排序速度快,而且稳定性好。特别是对于大规模数据集,新算法的优势更加明显,能够...
基数排序是一种非比较型整数排序算法,其原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。在C++中实现基数排序,主要涉及到数组、计数排序以及位操作等技术。以下是关于基数排序及其C++实现的详细解释...
基数排序是一种非比较型整数排序算法,其原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。递归算法则是一种解决问题的方法,它解决问题的每一个子问题都是原问题的缩小版,直到子问题简单到可以直接...
程序的输出结果和分析也没有详细描述,这部分通常是验证算法正确性的关键环节。 【设计总结】部分,作者回顾了链式基数排序的原理,包括静态链表的使用、分配和收集的步骤,以及对每个关键码位的处理。作者也表达了...
标题中的"(附有VC和VB源码)VC编写基数排序算法,生成DLL给VB调用"揭示了这个压缩包文件包含的内容。这是一个编程项目,它使用C++(VC即Visual C++)实现了基数排序算法,并将该算法封装到一个动态链接库(DLL)中,...
基数排序是一种非比较型整数排序算法,其原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。它不是基于比较的排序,而是通过创建额外的数据结构,比如计数器或者桶,来达到排序的目的。在处理大量数据时...
这个是一个基数排序的算法,算法演示,方便大家学习使用。
基数估计算法是一种在大数据场景下广泛应用于估算大规模数据集中唯一元素数量的高效方法。它在处理海量数据时,能够以可接受的误差范围内快速估算基数,而无需存储所有元素,这在资源有限的情况下尤其重要。 首先,...
基数排序是一种非比较型整数排序算法,它的原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种算法对于大数据量的排序尤其有效,因为它可以避免大量的比较操作,而是通过计数和分配来完成排序。 ...
基数排序是一种非比较型整数排序算法,其原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种算法对于大数据量的排序尤其有效,因为它是线性时间复杂度的,即O(nk),其中n是元素数量,k是数字的最大...
### Java实现基数排序算法 #### 实现原理 基数排序是一种非常高效的非比较型整数排序算法,它通过按数字的各个位数进行排序来实现序列的整体有序化。该算法的关键在于能够有效地处理多位数,避免了传统的两两比较...
针对这两种排序算法的特点,本文介绍了一种名为“超快速排序”的新型排序算法,该算法旨在结合快速排序和基数排序的优点,从而实现更高效的排序效果。 #### 二、算法原理 ##### 2.1 快速排序简介 快速排序是一种...
基数排序是一种非比较型整数排序算法,它的原理是将整数按位数切割成不同的数字,然后按每个位数分别比较。这种排序方法不需要进行记录关键字间的比较,而是通过“分配”和“收集”操作对单逻辑关键字进行排序。 1....