Structured Streaming SQL: UDF and UDAF

lingzhi007

浏览: 127096 次
性别:
来自: 杭州

最近访客更多访客>>

morelily

gaojingsong

gaz0301

jiedushi

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

spark 学习

/**
 * Structured Streaming demo: reads space-separated text records from a socket,
 * exposes them as the temporary view `uv`, and runs a SQL aggregation over that
 * view that uses a registered scalar UDF (`doubleString`) and a registered UDAF
 * (`HLLCUDAFInt`).
 *
 * Each input line is expected to hold exactly three space-separated fields:
 * `dim1`, an epoch value parseable as a Long (passed to `new Timestamp(...)`),
 * and `imeisi`.
 */
object StructuredNetworkWordCount {

  /**
   * Program entry point.
   *
   * @param args `args(0)` is the socket host, `args(1)` the socket port.
   *             Prints a usage message and exits with status 1 when fewer
   *             than two arguments are supplied.
   */
  def main(args: Array[String]): Unit = {
    // Local import so this block does not depend on the file's import header.
    import scala.util.Try

    if (args.length < 2) {
      System.err.println("Usage: StructuredNetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    val host = args(0)
    val port = args(1).toInt

    val spark = SparkSession
      .builder
      .appName("StructuredNetworkWordCount")
      .config("spark.default.parallelism", 3)
      .getOrCreate()

    import spark.implicits._

    // Create DataFrame representing the stream of input lines from connection to host:port
    val lines = spark.readStream
      .format("socket")
      .option("host", host)
      .option("port", port)
      .load()

    val words = lines.as[String]
      // Debug trace of every raw line, kept from the original demo.
      .map { line => println("**:" + line); line.split(" ") }
      .filter(_.length == 3)
      // Drop records whose second field is not a valid Long instead of letting
      // a NumberFormatException escape from the parsing step.
      .flatMap { fields =>
        Try(fields(1).toLong).toOption
          .map(millis => (fields(0), new Timestamp(millis), fields(2)))
          .toList
      }
      .toDF("dim1", "time", "imeisi")
    words.printSchema()

    val watermarked = words.withWatermark("time", "10 minutes")
    // createOrReplaceTempView replaces the deprecated registerTempTable.
    watermarked.createOrReplaceTempView("uv")
    spark.udf.register("doubleString", Utils.udfDoubleString _)
    spark.udf.register("HLLCUDAFInt", new HLLCUDAFInt())

    val wordCounts = spark.sql(" select * from (select time,doubleString(dim1) as aa, doubleString(dim1) as bb ,HLLCUDAFInt(imeisi) as uv from uv group by time,doubleString(dim1)) tampa ")
    // How to split the result into a new table / INSERT INTO: not actually needed,
    // a temporary view is enough; the SQL just gets a bit more involved.
    // NOTE(review): the last clause of the original note is ambiguous - confirm intent.

    val query = wordCounts.writeStream
      .outputMode("update")
      .foreach(new ForeachWriter[Row] {
        override def process(value: Row): Unit = {
          // Column 0 is the Timestamp `time` column and column 3 is the UDAF result,
          // which HLLCUDAFInt declares as LongType; read them with matching types.
          // NOTE(review): columns 1 and 2 are assumed to be strings, as in the
          // original code - depends on the return type of Utils.udfDoubleString.
          println(s" ${value.getAs[Timestamp](0)}   ${value.getAs[String](1)}     ${value.getAs[String](2)}   ${value.getAs[Long](3)}   ")
        }
        override def close(errorOrNull: Throwable): Unit = {}
        override def open(partitionId: Long, version: Long): Boolean = true
      }).start()

    query.awaitTermination()
  }
}

/**
 * User-defined aggregate function that feeds string values into an `HLLCounter`
 * and returns the counter's count estimate.
 *
 * The aggregation buffer holds a single binary column containing the counter's
 * registers as written by `HLLCounter.writeRegisters`; every step reads the
 * registers back into a counter, mutates it, and writes the registers again.
 *
 * (Original author's tip: press Ctrl+I in the IDE to generate the overriding methods.)
 */
class HLLCUDAFInt extends UserDefinedAggregateFunction {

  // Argument passed to every HLLCounter constructed by this UDAF.
  // NOTE(review): presumably the sketch precision - confirm against HLLCounter.
  private final val Precision = 14

  /** Writes the counter's registers into a fresh buffer of `maxLength()` bytes and returns its backing array. */
  private def serialize(counter: HLLCounter): Array[Byte] = {
    val out = ByteBuffer.allocate(counter.maxLength())
    counter.writeRegisters(out)
    out.array
  }

  /** Builds a counter from the binary registers stored in column 0 of `row`. */
  private def deserialize(row: Row): HLLCounter = {
    val counter = new HLLCounter(Precision)
    counter.readRegisters(ByteBuffer.wrap(row.getAs[Array[Byte]](0)))
    counter
  }

  /** One nullable string input column. */
  override def inputSchema: StructType = StructType(Array(StructField("input", StringType, true)))

  /** One nullable binary buffer column holding the serialized registers. */
  override def bufferSchema: StructType = StructType(Array(StructField("hllcbyte", BinaryType, true)))

  /** The aggregate result is declared as a Long. */
  override def dataType: DataType = LongType

  override def deterministic: Boolean = true

  /** Stores the registers of a freshly constructed counter in the buffer. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = serialize(new HLLCounter(Precision))
  }

  /**
   * Adds one input value to the buffered counter.
   *
   * Null inputs are skipped: the input column is declared nullable, and a null
   * would otherwise be handed straight to `HLLCounter.add`.
   */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      val counter = deserialize(buffer)
      counter.add(input.getAs[String](0))
      buffer(0) = serialize(counter)
    }
  }

  /** Merges the counter stored in `buffer2` into the one in `buffer1` and stores the result in `buffer1`. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val merged = deserialize(buffer1)
    merged.merge(deserialize(buffer2))
    buffer1(0) = serialize(merged)
  }

  /** Returns the count estimate of the buffered counter. */
  override def evaluate(buffer: Row): Any = deserialize(buffer).getCountEstimate
}

Aggregator 写法

/**
 * Typed `Aggregator` that adds the string in column 0 of every input row to an
 * `HLLCounter` and returns the counter's count estimate.
 *
 * The intermediate `HLLCounter` is encoded with Java serialization.
 */
class HllcdistinctValue extends Aggregator[Row, HLLCounter, Long] {

  /**
   * A zero value for this aggregation. Should satisfy the property that any b + zero = b.
   *
   * NOTE(review): uses the no-argument HLLCounter constructor, whereas HLLCUDAFInt
   * constructs `new HLLCounter(14)` - confirm the two are meant to differ.
   */
  def zero: HLLCounter = new HLLCounter()

  /**
   * Folds one input row into the running counter. The counter is mutated in place
   * and returned rather than copied.
   *
   * Rows whose column 0 is null are skipped so that a null is never passed to
   * `HLLCounter.add`.
   *
   * @param buffer   running counter for the current group
   * @param employee input row; only column 0 is read (the parameter name is kept
   *                 for interface compatibility)
   */
  def reduce(buffer: HLLCounter, employee: Row): HLLCounter = {
    if (!employee.isNullAt(0)) {
      buffer.add(employee.getString(0))
    }
    buffer
  }

  /** Merges two intermediate counters; `b1` is mutated and returned. */
  def merge(b1: HLLCounter, b2: HLLCounter): HLLCounter = {
    b1.merge(b2)
    b1
  }

  /** Transforms the final counter into the aggregate output. */
  def finish(reduction: HLLCounter): Long = reduction.getCountEstimate

  /** Encoder for the intermediate value type (Java serialization of HLLCounter). */
  def bufferEncoder: Encoder[HLLCounter] = Encoders.javaSerialization

  /** Encoder for the final output value type. */
  def outputEncoder: Encoder[Long] = Encoders.scalaLong
}

用法：

   // Turn the typed aggregator into a column that can be passed to agg().
   val averageSalary = new HllcdistinctValue().toColumn
    // `words` is already a DataFrame at this point.
    // Attach a 10-minute watermark on the "timestamp" column before grouping.
    val watermarked = words.withWatermark("timestamp", "10 minutes")
    // Group by time window only (windowDuration / slideDuration are defined outside
    // this snippet). Alternatives left by the author: add $"word" as a second
    // grouping key, or use groupBy("word", "timestamp").count() instead.
    val windowGrouping = watermarked.groupBy(window($"timestamp", windowDuration, slideDuration))
    val windowedCounts = windowGrouping.agg(averageSalary)

分享到：

pipiline tf token | spark , jar

2017-08-15 18:06
浏览 705
评论(0)
分类:非技术
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论