`

hive sql

sql 
阅读更多
insert overwrite table AD_MATE_FT PARTITION (pt='2012-03-05 10')
select
    temp0.mrid as material_id ,
    temp9.name as material_name,
    'pv' as AD_EFT_MATE_CAL_TYPE,
    temp0.uid as temp0,
    temp0.bussiness_id as bussiness_id ,
    'HOURLY' as time_type,
    '2012-03-05 10' as date_desc,
    temp0.effect_sum as ad_eft_mate_cnt,
    temp1.EXPOSE_sum as ad_show_cnt,
    temp2.click_sum as ad_click_cnt,
    temp3.ad_percent as ad_click_rate,
    temp4.ad_browse as pageview_cnt,
    temp5.ad_clibro as pvcnt_div_adclick,
    temp7.ad_step as  ad_bounce_rate,
    temp6.ad_percon as ad_conv_rate,
    temp8.ad_avgtime as avg_stay_time,
    to_date(unix_timestamp())
from (select * from sum0_effect_hour_temp where  pt='2012-03-05 10')temp0
      join (select * from sum0_expose_hour_temp where  pt='2012-03-05 10')temp1
         on (temp0.mrid=temp1.mrid and temp1.pt=temp0.pt and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id)
       join(select * from sum0_click_hour_temp where  pt='2012-03-05 10')temp2
         on(temp2.mrid=temp1.mrid and temp1.pt=temp2.pt and temp2.uid=temp1.uid and temp2.bussiness_id=temp1.bussiness_id)
       join(select * from sum0_click_percent_hour_temp where  pt='2012-03-05 10')temp3
         on(temp2.mrid=temp3.mrid and temp3.pt=temp2.pt and temp2.uid=temp3.uid and temp2.bussiness_id=temp3.bussiness_id)
       join(select * from sum0_click_browse_hour_temp where  pt='2012-03-05 10')temp4
         on(temp4.mrid=temp3.mrid and temp3.pt=temp4.pt and temp4.uid=temp3.uid and temp4.bussiness_id=temp3.bussiness_id)
       join(select * from  sum0_clibro_hour_rate_temp where  pt='2012-03-05 10')temp5
         on(temp4.mrid=temp5.mrid and temp5.pt=temp4.pt and temp4.uid=temp5.uid and temp4.bussiness_id=temp5.bussiness_id)
       join(select * from sum0_per_con_hour_temp where  pt='2012-03-05 10')temp6
         on(temp6.mrid=temp5.mrid and temp5.pt=temp6.pt and temp6.uid=temp5.uid and temp6.bussiness_id=temp5.bussiness_id)
       join(select * from  sum0_steprate_hour_temp where  pt='2012-03-05 10')temp7
         on(temp6.mrid=temp7.mrid and temp7.pt=temp6.pt and temp6.uid=temp7.uid and temp6.bussiness_id=temp7.bussiness_id)
       join(select * from  avg0_time_hour_temp where  pt='2012-03-05 10')temp8
         on(temp8.mrid=temp7.mrid and temp7.pt=temp8.pt and temp8.uid=temp7.uid and temp8.bussiness_id=temp7.bussiness_id)
       join(select * from  AD_MATERIAL_DIM)temp9
         on(temp8.mrid=temp9.id and temp8.uid=temp9.uid )
  where temp0.pt=temp1.pt =temp2.pt=temp3.pt=temp4.pt=temp5.pt=temp6.pt=temp7.pt=temp8.pt;





按小时计算效果表的总记录
create table if not exists sum0_effect_hour_temp
(

effect_sum int,
MRID string,
uid string,
bussiness_id string
)
PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;


insert overwrite table sum0_effect_hour_temp PARTITION (pt='2012-03-05 10')
  select
  COUNT(effect_id) as effect_sum ,
  temp0.mrid as mrid,
  temp0.bussiness_id as bussiness_id,
  temp0.uid as uid
  from
  (
select * from ad_PATH where pt='2012-03-05 10'
  ) temp0
  left outer join
  (
select effect_id AS effect_id  ,PAGE_ID AS PAGE_ID ,pt as pt
from  ad_effect
where pt='2012-03-05 10'
  ) temp1
  on(temp1.PAGE_ID=temp0.PAGE_ID)
  where temp0.pt=temp1.pt and bussiness_id is not NULL
  GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;

按小时计算点击表的曝光数

create table if not exists sum0_expose_hour_temp
(
EXPOSE_sum int,
MRID string,
uid string,
bussiness_id string

)
PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

  insert overwrite table sum0_expose_hour_temp PARTITION (pt='2012-03-05 10')
  select
     COUNT(temp1.PAGE_ID) as EXPOSE_sum,
     temp0.mrid as mrid,
     temp0.bussiness_id as bussiness_id,
     temp0.uid as uid
  from (select * from ad_PATH where pt='2012-03-05 10' and BUSSINESS_ID is not NULL ) temp0
     left outer join(
      select  PAGE_ID ,pt from ad_expose_click WHERE pt='2012-03-05 10' AND action='expose'
     ) temp1
   on(temp1.PAGE_ID=temp0.PAGE_ID)
    where temp0.pt=temp1.pt
          GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;


按小时计算点击表的点击数

create table if not exists sum0_click_hour_temp
(
click_sum int,
MRID string,
uid string,
bussiness_id string

)
PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_click_hour_temp PARTITION (pt='2012-03-05 10')
  select
     COUNT(temp1.PAGE_ID) as click_sum,
     temp0.mrid as mrid,
     temp0.bussiness_id as bussiness_id,
     temp0.uid as uid
  from (select * from ad_PATH where pt='2012-03-05 10' and BUSSINESS_ID is not NULL ) temp0
     left outer join(
      select  PAGE_ID ,pt from ad_expose_click WHERE pt='2012-03-05 10' AND action='click'
     ) temp1
   on(temp1.PAGE_ID=temp0.PAGE_ID)
    where temp0.pt=temp1.pt
          GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;


按小时的点击率
create table if not exists sum0_click_percent_hour_temp
(
ad_percent double,
MRID string,
uid string,
bussiness_id string

)
PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_click_percent_hour_temp PARTITION (pt='2012-03-05 10')
  select
    click.click_sum/expose.EXPOSE_sum as ad_percent,
   click.mrid as mrid,
   click.uid as uid,
   click.bussiness_id as bussiness_id
from
   (select * from sum0_click_hour_temp where pt='2012-03-05 10') click
   JOIN(select pt as pt ,uid as uid,bussiness_id as bussiness_id  ,mrid as mrid ,EXPOSE_sum as EXPOSE_sum
  from sum0_expose_hour_temp WHERE pt ='2012-03-05 10'  )expose 
       on (click.MRID=expose.mrid and click.uid=expose.uid and click.bussiness_id=expose.bussiness_id)
  where expose.pt=click.pt;


小时级页面浏览数
create table if not exists sum0_click_browse_hour_temp
(
ad_browse int,
MRID string,
uid string,
bussiness_id string

)
PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_click_browse_hour_temp PARTITION (pt='2012-03-05 10')
  select
  COUNT(distinct(tmp2.page_id)) as ad_browse,
  tmp2.mrid as mrid,
  tmp2.uid as uid,
  tmp2.bussiness_id as bussiness_id
  from (select * from ad_effect where pt ='2012-03-05 10') tmp1
  join (select page_id, pt, SESSION_ID, bussiness_id, mrid ,uid from ad_PATH where pt='2012-03-05 10'
        and bussiness_id is not NULL) tmp2              
    on (tmp1.page_id = tmp2.page_id)
  join (select pt as pt,SESSION_ID as SESSION_ID, bussiness_id, uid from ad_SESSION
           where SOURCE_TYPE ='Direct' and pt = '2012-03-05 10') tmp3
    on (tmp2.SESSION_ID=tmp3.SESSION_ID and tmp2.uid=tmp3.uid and tmp2.bussiness_id=tmp3.bussiness_id)
  where   tmp1.pt=tmp2.pt=tmp3.pt
   GROUP BY tmp2.mrid,tmp2.uid,tmp2.bussiness_id;

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

安小时网页浏览/广告点击

create table if not exists sum0_clibro_hour_rate_temp
(
ad_clibro double,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_clibro_hour_rate_temp PARTITION (pt='2012-03-05 10')
select
   temp0.ad_browse/temp1.click_sum as ad_clibro,
   temp0.mrid as MRID,
   temp0.uid as uid,
   temp0.bussiness_id as bussiness_id
   from (select * from sum0_click_browse_hour_temp where pt = '2012-03-05 10')temp0
          join(select bussiness_id,uid,pt,click_sum as click_sum, mrid as mrid from sum0_click_hour_temp where pt='2012-03-05 10') temp1
         on(temp0.mrid=temp1.mrid and temp1.pt=temp0.pt and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id )
  where temp0.pt=temp1.pt;


转化率

create table if not exists sum0_per_con_hour_temp
(
ad_percon double,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_per_con_hour_temp PARTITION (pt='2012-03-05 10')
select
temp0.effect_sum/temp1.click_sum as ad_clibro,
temp0.mrid as MRID,
temp0.uid as uid,
temp0.bussiness_id as bussiness_id
from
  (select * from sum0_effect_hour_temp where pt ='2012-03-05 10' ) temp0
join(select * from sum0_click_hour_temp where pt ='2012-03-05 10') temp1
on(temp0.mrid=temp1.mrid and temp1.pt=temp0.pt and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id );
where temp0.pt=temp1.pt;


跳出率:

进入1步的离开数量
create table if not exists sum0_step1_hour_temp
(
ad_step1 int ,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

 

insert overwrite table sum0_step1_hour_temp PARTITION (pt='2012-03-05 10')
    select
       COUNT(distinct(temp0.session_id)) as ad_step1,
       temp0.MRID as MRID,
       temp0.uid as uid,
       temp0.bussiness_id as bussiness_id
       from (select * from AD_PATH where pt ='2012-03-05 10' and session_step='1') temp0 
          join ( select * from ad_session where pt ='2012-03-05 10' and source_type='3') temp1
           on(temp0.SESSION_ID=temp1.SESSION_ID and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id)
     where temp0.pt=temp1.pt
      GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;
进入所有步的离开数量
create table if not exists sum0_stepn_hour_temp
(
ad_stepn int ,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

 

insert overwrite table sum0_stepn_hour_temp PARTITION (pt='2012-03-05 10')
    select
       COUNT(distinct(temp0.session_id)) as ad_stepn,
       temp0.MRID as MRID,
       temp0.uid as uid,
       temp0.bussiness_id as bussiness_id
       from (select * from AD_PATH where pt ='2012-03-05 10' ) temp0 
          join ( select * from ad_session where pt ='2012-03-05 10' and source_type='3') temp1
           on(temp0.SESSION_ID=temp1.SESSION_ID and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id)
     where temp0.pt=temp1.pt
      GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;


// 计算跳出率
create table if not exists sum0_steprate_hour_temp
(
ad_step double,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table sum0_steprate_hour_temp PARTITION (pt='2012-03-05 10')
select
       temp0.ad_step1/temp1.ad_stepn as ad_step,
       temp0.MRID as MRID,
       temp0.uid as uid,
       temp0.bussiness_id as bussiness_id
      from (select * from sum0_step1_hour_temp where pt ='2012-03-05 10' ) temp0 
      join ( select * from sum0_stepn_hour_temp where pt ='2012-03-05 10' )temp1
      on(
        temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id and temp0.mrid =temp0.mrid
       )

where temp0.pt=temp1.pt;


平均停留时间:

create table if not exists avg0_time_hour_temp
(
ad_avgtime double,
MRID string,
uid string,
bussiness_id string
)PARTITIONED BY (pt string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001'
LINES TERMINATED BY '\n'
STORED AS RCFILE;

insert overwrite table avg0_time_hour_temp PARTITION (pt='2012-03-05 10')
select
     avg(temp1.session_stay_seconds)  as ad_avgtime,
     temp0.MRID as MRID,
       temp0.uid as uid,
       temp0.bussiness_id as bussiness_id
      from (select * from ad_path where  pt ='2012-03-05 10') temp0
          join
            (select * from ad_session where pt ='2012-03-05 10') temp1
            on (temp0.SESSION_ID=temp1.SESSION_ID and temp0.uid=temp1.uid and temp0.bussiness_id=temp1.bussiness_id)
         where temp0.pt=temp1.pt
            GROUP BY temp0.mrid,temp0.uid,temp0.bussiness_id ;


分享到:
评论

相关推荐

    基于 Antlr4 的 Hive SQL 解析.zip

    在本课程设计中,主题是“基于Antlr4的Hive SQL解析”,这涉及到了两个主要的技术领域:Antlr4和Hive SQL。Antlr4是一个强大的解析工具,用于生成解析器和词法分析器,它能处理各种语言的语法,包括SQL。而Hive SQL...

    Hive SQL 编译过程详解

    本文将深入探讨Hive SQL如何被编译成MapReduce任务,以及在这个过程中涉及到的关键原理。 1. MapReduce实现基本SQL操作的原理: - **Join的实现原理**:在Hive中,Join操作通常通过MapReduce来实现。例如,在一个...

    HiveSQL实战题目.docx

    HiveSQL 实战题目 HiveSQL 是一种基于 Hadoop 的数据仓库工具,用于存储、查询和分析大规模数据。本文档提供了 HiveSQL 实战题目,旨在帮助读者熟悉 HiveSQL 的基本概念和应用场景。 一、HiveSQL 基础 HiveSQL 是...

    最强HiveSQL开发指南.pdf

    《最强HiveSQL开发指南》是一本专注于Hive性能调优和实战操作的教程,旨在帮助读者深入理解和熟练运用Hive进行大数据处理。Hive作为Apache Hadoop生态系统中的一个组件,主要用于处理和分析大规模分布式存储的数据。...

    hivesql语句练习

    1.上传tar包 2.解压 tar -zxvf hive-1.2.1.tar.gz 3.安装mysql数据库 推荐yum 在线安装 4.配置hive (a)配置HIVE_HOME环境变量 vi conf/hive-env.sh ... hive -e ‘sql’ bin/hive -e 'select * from t_test'

    HiveSQL编译原理

    HiveSQL编译原理是大数据处理领域中的一个重要概念,它主要涉及到如何将用户提交的SQL语句转换为可执行的MapReduce或者Tez任务。在Hadoop生态系统中,Hive作为一个基于HDFS的数据仓库工具,提供了对大规模数据集进行...

    Hive SQL练习题库

    ### Hive SQL练习题库知识点详解 #### 一、Hive SQL概述 Hive 是一个基于 Hadoop 的数据仓库工具,可以将结构化的数据文件映射为一张数据库表,并提供简单的 SQL 查询功能,使不熟悉 MapReduce 的用户可以利用 SQL...

    hive sql详解 经典

    Hive SQL详解经典 在大数据处理领域,Hive是一个基于Hadoop的数据仓库工具,它允许用户使用SQL(称为HiveQL)查询存储在Hadoop分布式文件系统(HDFS)中的大型数据集。这篇博客深入探讨了Hive SQL的使用,帮助用户...

    深入浅出Hive企业级架构优化、Hive Sql优化、压缩和分布式缓存

    Apache Hive 是一个基于 Hadoop 的数据仓库工具,可以将结构化的数据文件映射为一张数据库表,并提供简单的 SQL 查询功能,使不熟悉 MapReduce 的开发人员也能用 SQL 语言进行数据查询。Hive 在内部将 SQL 语句转换...

    HiveSQL解析原理.docx

    ### HiveSQL解析原理详解 #### 一、引言 Hive作为一款建立在Hadoop之上的数据仓库系统,被广泛应用于大数据处理场景中。对于众多企业和组织而言,它不仅是存储和管理海量数据的有效手段,同时也是进行数据分析的...

    SQL、Hive SQL等SQL血缘解析工具

    // 具有子查询的sql String hql = "select id,name from (select id from table_1 where id={p0}) t1 inner join (select name --this is name\n from table_2) t2"; // 获取id字段的血缘 LineageNode idNode = ...

    hivesql计算访问者年龄的SQL

    hivesql计算访问者年龄的SQL

    HiveSQL详细和优化

    HiveSQL详细和优化,以及部分个人点评 This is a brief tutorial that provides an introduction on how to use Apache Hive HiveQL with Hadoop Distributed File System. This tutorial can be your first step ...

    HiveSQL优化手册

    ### HiveSQL优化手册 #### 数据倾斜优化 **数据倾斜**是指在执行Hive查询时,由于数据分布不均,导致一部分任务处理的数据量远大于其他任务的现象,这将严重影响整个查询的性能。针对数据倾斜问题,可以从以下几个...

    HIVE-SQL开发规范.docx

    【秘密】Hive SQL 开发规范 Hive作为Hadoop生态系统中的重要组件,为企业提供了对大规模数据集的高效处理和分析能力。它允许用户通过类似于SQL的查询语言(HiveQL)来操作分布式存储的数据,简化了MapReduce编程...

    hive sql解析

    hivesql解析 支持版本<!--<hive.version>1.0.0</hive.version> <hadoop.version>2.4.1</hadoop.version> --> <hive.version>2.0.1</hive.version> <hadoop.version>2.7.2</hadoop.version>

Global site tag (gtag.js) - Google Analytics