`
jickcai
  • 浏览: 246973 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

spark 抽样

    博客分类:
  • web
阅读更多

 use sessiondb;

 set NUM_SAMPLE = 30;
CREATE EXTERNAL TABLE task_samples
(
date_str string,
task_id string,
review_type string,
task_type string,
score string,
user_name string
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION "afs://dd.afs.baidu.com:9902/user/xx/sp-cbreview-task/samples/{DATE}";
 
drop view if exists samples_data_db;
create temporary view samples_data_db as
SELECT 
date_str,task_id,review_type,task_type,score,user_name
FROM
(
SELECT 
*,
row_number() over (partition by user_name order by rand()) as `rnk`
FROM
task_samples
)
WHERE
rnk <= ${NUM_SAMPLE};
 
 
INSERT OVERWRITE DIRECTORY "afs://tt.afs.baidu.com:9902/user/xx/sp-cbreview-task/samples-stat/{DATE}"
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT 
date_str,review_type,task_type,user_name,sum(pass_cnt) as pass_cnt,sum(un_pass_cnt) as un_pass_cnt , sum(samples_cnt) as samples_cnt
FROM
( SELECT 
date_str,review_type,task_type,user_name,
COUNT(IF(score="0", NULL, score)) as pass_cnt,
COUNT(IF(score="1", NULL, score)) as un_pass_cnt,
0 as samples_cnt
FROM  task_samples
GROUP BY date_str,review_type,task_type,user_name
UNION
SELECT 
date_str,review_type,task_type,user_name,0 as pass_cnt,0 as un_pass_cnt, count(*) as samples_cnt
FROM samples_data_db
GROUP BY date_str,review_type,task_type,user_name
)b
GROUP BY date_str,review_type,task_type,user_name
ORDER BY user_name ASC;

<iframe style="display: none !important;"></iframe>

<iframe style="display: none !important;"></iframe>

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics