基于apache drill 的HDFS查询json 单机实例

lvdccyb

浏览: 417374 次

最近访客更多访客>>

WindyQin

bigstar119

jiazhigang

JoinerSep

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java

drill

参考 https://drill.apache.org/docs/json-data-model/

假设有原始数据在hdfs上：
hdfs://dc1:8020/xf/mytest/ia/2017/0208/details/part-00000
该文件包含多条数据，是按行存储的json文件（每行一条记录），实际上由spark的saveAsTextFile方法生成。
单条记录的格式如下（已删除部分数据）

{
    "afterOpenDay": 9,
    "basic": {
        "availableMoney": 24063.51344060898,
        "closeReason": 0,
        "cutEarning": 0,
        "end_date": "20170222",
        "ism_id": "170208206199185",
        "losePercentage": 0,
        "profitPercentage": 0,
        "start_date": "20170209",
        "tm_close": -1,
        "totalMoney": 23600,
        "user_id": "8888"
    },
    "closeDay": true,
    "dailySummary": [
        {
            "TN": 0,
            "annualProfitRate": 0,
            "asset": 23515.820100307465,
            "commission": 75.1798996925354,
            "cost": 21560.179899692535,
            "day": "20170209",
            "floatProfit": -84.1798996925354,
            "freeMoney": 2039.8201003074646,
            "maketValue": 21476,
            "profitRate": -0.0035669449022260762
        },
        {
            "TN": 1,
            "annualProfitRate": 0,
            "asset": 23585.904140472412,
            "commission": 81.09585952758789,
            "cost": 20668.095859527588,
            "day": "20170210",
            "floatProfit": -14.09585952758789,
            "freeMoney": 2931.904140472412,
            "maketValue": 20654,
            "profitRate": -0.0005972821833723683
        },
        {
            "TN": 2,
            "annualProfitRate": 0,
            "asset": 23830.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170213",
            "floatProfit": 230.72134065628052,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18766,
            "profitRate": 0.009776327993910192
        },
        {
            "TN": 3,
            "annualProfitRate": 0,
            "asset": 23887.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170214",
            "floatProfit": 287.7213406562805,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18823,
            "profitRate": 0.012191582231198327
        },
        {
            "TN": 4,
            "annualProfitRate": 0,
            "asset": 23652.72134065628,
            "commission": 88.27865934371948,
            "cost": 18535.27865934372,
            "day": "20170215",
            "floatProfit": 52.72134065628052,
            "freeMoney": 5064.7213406562805,
            "maketValue": 18588,
            "profitRate": 0.002233955112554259
        },
        {
            "TN": 5,
            "annualProfitRate": 0,
            "asset": 23716.917340755463,
            "commission": 94.08265924453735,
            "cost": 17737.082659244537,
            "day": "20170216",
            "floatProfit": 116.91734075546265,
            "freeMoney": 5862.917340755463,
            "maketValue": 17854,
            "profitRate": 0.004954124608282316
        },
        {
            "TN": 6,
            "annualProfitRate": 0,
            "asset": 23595.554340839386,
            "commission": 100.44565916061401,
            "cost": 16380.445659160614,
            "day": "20170217",
            "floatProfit": -4.445659160614014,
            "freeMoney": 7219.554340839386,
            "maketValue": 16376,
            "profitRate": -0.00018837538816161075
        },
        {
            "TN": 7,
            "annualProfitRate": 0,
            "asset": 23780.802600860596,
            "commission": 106.1973991394043,
            "cost": 15649.197399139404,
            "day": "20170220",
            "floatProfit": 180.8026008605957,
            "freeMoney": 7950.802600860596,
            "maketValue": 15830,
            "profitRate": 0.007661127155109988
        },
        {
            "TN": 8,
            "annualProfitRate": 0,
            "asset": 24011.805600643158,
            "commission": 113.19439888000488,
            "cost": 13659.194399356842,
            "day": "20170221",
            "floatProfit": 411.80560064315796,
            "freeMoney": 9940.805600643158,
            "maketValue": 14071,
            "profitRate": 0.01744938985776093
        }
    ]
}

（1）下载并启动apache drill
bin/drill-embedded
（2）配置storage plugin,dc1是机器hostname
http://dc1:8047/storage

{
"type": "file",
"enabled": true,
"connection": "hdfs://dc1:8020",
"config": null,
"workspaces": {
    "root": {
      "location": "/",
      "writable": false,
      "defaultInputFormat": null
    },
    "tmp": {
      "location": "/tmp",
      "writable": true,
      "defaultInputFormat": null
    },
    "ism": {
      "location": "/xf/mytest/ia/2017",
      "writable": true,
      "defaultInputFormat": "json"
    }
},
"formats": {
    "psv": {
      "type": "text",
      "extensions": [
        "tbl"
      ],
      "delimiter": "|"
    },
    "csv": {
      "type": "text",
      "extensions": [
        "csv"
      ],
      "delimiter": ","
    },
    "tsv": {
      "type": "text",
      "extensions": [
        "tsv"
      ],
      "delimiter": "\t"
    },
    "httpd": {
      "type": "httpd",
      "logFormat": "%h %t \"%r\" %>s %b \"%{Referer}i\"",
      "timestampFormat": null
    },
    "parquet": {
      "type": "parquet"
    },
    "json": {
      "type": "json",
      "extensions": [
        "json"
      ]
    },
    "avro": {
      "type": "avro"
    },
    "sequencefile": {
      "type": "sequencefile",
      "extensions": [
        "seq"
      ]
    },
    "csvh": {
      "type": "text",
      "extensions": [
        "csvh"
      ],
      "extractHeader": true,
      "delimiter": ","
    }
}
}

（3）修改配置
http://dc1:8047/options
将 store.json.read_numbers_as_double 改为true。这是因为我这边的json数据中，有的浮点数被输出成了整数（如5.0直接输出为5），同一字段中整数和浮点数混杂，会导致错误“DATA_READ ERROR: You tried to write a Float8 type when you are using a ValueWriter of type ...”

（4）执行sql语句，这里的dfs.ism表示使用storage plugin dfs中配置的名为ism的工作空间（workspace）

a.basic.ism_id，表示使用json文件中的basic字段（basic是个OBJECT类型）里面的ism_id字段

0: jdbc:drill:zk=local> select a.basic.ism_id as ism_id,a.dailySummary.asset as asset from dfs.ism.`0208/details/part-00000` a limit 10;
+------------------+---------------------+
|      ism_id      |        asset        |
+------------------+---------------------+
| 170208206199185 | 23515.820100307465 |
| 170208206199187 | 23585.904140472412 |
| 170208206199188 | 23830.72134065628   |
| 170208206199189 | 23887.72134065628   |
| 170208206199191 | 23652.72134065628   |
| 170208206199196 | 23716.917340755463 |
| 170208206199199 | 23595.554340839386 |
| 170208206199201 | 23780.802600860596 |
| 170208206199206 | 24011.805600643158 |
| 170208206199209 | 24063.51344060898   |
+------------------+---------------------+
10 rows selected (0.898 seconds)

[思考问题]上述字段中，如果遇到数组应该如何处理？
比如，要查询dailySummary 中的每日资产asset?
参考Drill官方文档，使用子查询（nested query）和FLATTEN函数，
FLATTEN用于将数组扁平化，即1行拆分成多行数据。

0: jdbc:drill:zk=local> select b.ism_id,b.daily.asset as asset from (select a.basic.ism_id as ism_id,FLATTEN(a.dailySummary) as daily from dfs.ism.`0208/details/part-00000` a ) b limit 20;

+------------------+---------------------+
|      ism_id      |        asset        |
+------------------+---------------------+
| 170208206199185 | 23515.820100307465 |
| 170208206199185 | 23585.904140472412 |
| 170208206199185 | 23830.72134065628   |
| 170208206199185 | 23887.72134065628   |
| 170208206199185 | 23652.72134065628   |
| 170208206199185 | 23716.917340755463 |
| 170208206199185 | 23595.554340839386 |
| 170208206199185 | 23780.802600860596 |
| 170208206199185 | 24011.805600643158 |
| 170208206199185 | 24063.51344060898   |
| 170208206199187 | 20130.834299087524 |
| 170208206199187 | 19987.834299087524 |
| 170208206199187 | 20333.938299179077 |
| 170208206199187 | 20277.938299179077 |
| 170208206199187 | 20153.938299179077 |
| 170208206199187 | 20321.938299179077 |
| 170208206199187 | 20165.137598991394 |
| 170208206199187 | 20376.137598991394 |
| 170208206199187 | 20496.137598991394 |
| 170208206199187 | 20428.81975889206   |
+------------------+---------------------+
20 rows selected (0.978 seconds)

上述查询也可以通过web方式
http://dc1:8047/query
查询获得。

分享到：

java.lang Enum Thread.State，jstack显示 ...

2017-03-27 18:09
浏览 1177
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论