hive学习笔记

流浪鱼

浏览: 1697541 次
性别:
来自: 北京

最近访客更多访客>>

mlx09

luffy2341

kannan22

lesliehanhan

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

hive

==========学习笔记
hive提供了一个被称为Hive查询语言（HiveQL）的SQL方言，用来查询存储在Hadoop集群中的数据
hive和mysql方言最接近
1.hive最大的限制就是不支持记录级别的插入、更新和删除操作，但是用户可以通过查询生成新表或者将查询结果导入到文件中。
2.hive最适合数据仓库应用程序，其可以维护海量数据，而且可以对数据进行挖掘，然后形成意见和报告。

===================hiveSQL语句====================
========================================创建表相关=======================================
-- Create the docs table with a single string column named line.
CREATE TABLE docs (
    line STRING
);
-- Load data into docs from the file 'docs', replacing the table's contents.
LOAD DATA INPATH 'docs'
OVERWRITE INTO TABLE docs;
-- Create word_counts: split each line of docs on whitespace, explode the
-- pieces into one row per word, then count the occurrences of every word.
-- The backslash is doubled because Hive unescapes string literals before the
-- pattern reaches split(); a bare '\s' would arrive as the letter "s".
-- '+' collapses runs of whitespace so they do not produce empty words.
CREATE TABLE word_counts AS
SELECT
    word,
    count(1) AS count
FROM (
    SELECT explode(split(line, '\\s+')) AS word
    FROM docs
) w
GROUP BY word
ORDER BY word;
-- Create the employees table; it uses the array, map and struct column types.
CREATE TABLE employees (
    name         STRING,
    salary       FLOAT,
    subordinates ARRAY<STRING>,
    deductions   MAP<STRING, FLOAT>,
    address      STRUCT<street:STRING, city:STRING, zip:INT>
);
-- Create userinfo2 by copying the table structure of userinfo.
CREATE TABLE IF NOT EXISTS userinfo2
LIKE userinfo;
-- Create an external table over comma-delimited files in /data/stocks.
-- "exchange" is a keyword, which is why the column is named exchange2.
CREATE EXTERNAL TABLE IF NOT EXISTS stocks (
    exchange2       STRING,
    symbol          STRING,
    ymd             STRING,
    price_open      FLOAT,
    price_high      FLOAT,
    price_low       FLOAT,
    price_close     FLOAT,
    volume          INT,
    price_adj_close FLOAT
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ','
LOCATION '/data/stocks';
-- Create an external table by copying the table structure of employees.
-- 'path/to/data' is a placeholder for the directory that holds the data.
CREATE EXTERNAL TABLE IF NOT EXISTS exchange3
LIKE employees
LOCATION 'path/to/data';
-- Create a partitioned table; PARTITIONED BY declares the partition columns.
-- Partition columns are written as "name type" (no colon); the "name:type"
-- form is only valid inside STRUCT<...>.
-- NOTE(review): an unpartitioned employees table is created earlier in these
-- notes; drop it first if both statements are run in the same database.
CREATE TABLE employees (
    name         STRING,
    salary       FLOAT,
    subordinates ARRAY<STRING>,
    deductions   MAP<STRING, FLOAT>,
    address      STRUCT<street:STRING, city:STRING, state:STRING, zip:INT>
)
PARTITIONED BY (country STRING, state STRING);
-- List all partitions of the table.
SHOW PARTITIONS employees;
-- List only the partitions matching the given partition spec.
SHOW PARTITIONS employees PARTITION (country='US', state='ak');
-- Create an external, partitioned table over tab-delimited log data.
CREATE EXTERNAL TABLE IF NOT EXISTS log_messages (
    hms        INT,
    serverty   STRING,
    server     STRING,
    process_id INT,
    message    STRING
)
PARTITIONED BY (year INT, month INT, day INT)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t';
-- Add a partition and point it at its data directory.
ALTER TABLE log_messages
ADD PARTITION (year=2012, month=1, day=2)
LOCATION 'hdfs://nds1/data/log_messages/2012/01/02';
-- Show detailed structure information for a table.
DESCRIBE EXTENDED userinfo;
-- Show details for one partition, including the path where its data lives.
DESCRIBE EXTENDED log_messages PARTITION (year=2012, month=1, day=2);
-- Show the table structure.
DESCRIBE toss1;
-- List tables.
SHOW TABLES;
-- Show table properties.
SHOW TBLPROPERTIES userinfo;
-- Show information for a single column of a table.
-- The terminating semicolon is required; without it this statement runs into
-- the DROP TABLE that follows.
-- NOTE(review): newer Hive releases may expect "DESCRIBE userinfo id" instead
-- of the dotted form; confirm against the Hive version in use.
DESCRIBE userinfo.id;
-- Drop a table.
DROP TABLE IF EXISTS employees;
-- Rename a table.
ALTER TABLE log_messages
RENAME TO logmsgs;
-- Add several partitions in a single statement.
ALTER TABLE log_messages ADD IF NOT EXISTS
    PARTITION (year=2011, month=1, day=1) LOCATION 'logs/2011/01/01'
    PARTITION (year=2011, month=1, day=2) LOCATION 'logs/2011/01/02'
    PARTITION (year=2011, month=1, day=3) LOCATION 'logs/2011/01/03';
-- Change the location of one partition.
ALTER TABLE log_messages
PARTITION (year=2011, month=1, day=2)
SET LOCATION 's3n://ourbucket/log/2011/01/02';
-- Drop one partition.
ALTER TABLE log_messages
DROP IF EXISTS PARTITION (year=2011, month=1, day=2);
-- Change a column: rename hms to hours_minutes_seconds, keep type INT, set a
-- column comment, and move the column so it follows the serverty column.
-- The positional clause is AFTER <column>; "alter" is not valid here.
ALTER TABLE log_messages
CHANGE COLUMN hms hours_minutes_seconds INT
COMMENT 'the hours ,minutes , and seconds part of the timestamp'
AFTER serverty;
-- Add columns (the author notes that using "long" as the type raises an error).
ALTER TABLE log_messages
ADD COLUMNS (
    app_name   STRING COMMENT 'Application name',
    session_id INT    COMMENT 'The current session id '
);
-- Remove or replace columns: the table's column list becomes exactly this one.
ALTER TABLE log_messages
REPLACE COLUMNS (
    hours_mins_secs INT    COMMENT 'hour,minute,seconds from timestamp',
    serverty        STRING COMMENT 'the message severity',
    message         STRING COMMENT 'the rest of zhe message '
);
-- Modify table properties.
-- The terminating semicolon is required; without it this statement runs into
-- the ALTER TABLE that follows.
ALTER TABLE log_messages
SET TBLPROPERTIES (
    'notes'='the process id is no longer captured;'
);
-- Change the storage format of one partition to SEQUENCEFILE.
ALTER TABLE log_messages
PARTITION (year=2011, month=1, day=1)
SET FILEFORMAT SEQUENCEFILE;
-- Pack a partition into a Hadoop archive (HAR).
ALTER TABLE log_messages
ARCHIVE PARTITION (year=2011, month=1, day=1);
-- Unpack the archived partition again.
ALTER TABLE log_messages
UNARCHIVE PARTITION (year=2011, month=1, day=1);
-- Partition protection. ENABLE turns a protection on, DISABLE turns it off.
-- NOTE(review): NO_DROP / OFFLINE protection appears to have been removed in
-- Hive 2.0; confirm against the Hive version in use.
-- Prevent the partition from being dropped.
ALTER TABLE log_messages PARTITION (year=2011, month=1, day=1) ENABLE NO_DROP;
-- Allow the partition to be dropped again.
ALTER TABLE log_messages PARTITION (year=2011, month=1, day=1) DISABLE NO_DROP;
-- Prevent the partition from being queried.
ALTER TABLE log_messages PARTITION (year=2011, month=1, day=1) ENABLE OFFLINE;
-- Allow the partition to be queried again.
ALTER TABLE log_messages PARTITION (year=2011, month=1, day=1) DISABLE OFFLINE;
===================================数据库相关===========================================
-- List databases.
SHOW DATABASES;
-- Create a database.
CREATE DATABASE IF NOT EXISTS financials;
-- Create a database with a non-default storage location.
CREATE DATABASE financials
LOCATION '/home/yarn/dabases';
-- Variant that attaches a description (left commented out by the author):
-- CREATE DATABASE financials COMMENT 'notes about the financials database';
-- Show information about a database.
DESCRIBE DATABASE analysis;
-- Switch to a database.
USE financials;
-- Drop a database; CASCADE is specified so the drop includes its tables.
DROP DATABASE IF EXISTS financials CASCADE;
-- Modify a database: set a key/value pair in its DBPROPERTIES.
ALTER DATABASE financials
SET DBPROPERTIES ('edited-by'='haoqimin');
=========================================数据相关（导入数据）=================================
hive没有行级别的数据插入、数据更新和数据删除操作，那么往表中装载数据的唯一途径就是使用大批量的数据装载操作，或者通过其他方式把数据文件写入到正确的目录下。
-- Load data into one partition of a partitioned table.
-- ${env:HOME} is substituted from the HOME environment variable; environment
-- variable names are case-sensitive, so it must be upper case.
LOAD DATA LOCAL INPATH '${env:HOME}/california-employees'
OVERWRITE INTO TABLE employees
PARTITION (country='us', state='ca');
-- Load data into a managed (unpartitioned) table.
LOAD DATA LOCAL INPATH '${env:HOME}/california-employees'
OVERWRITE INTO TABLE employees;
-- Insert data into a table partition from a query (static partition values).
INSERT OVERWRITE TABLE employees
PARTITION (country='us', state='or')
SELECT *
FROM staged_employees se
WHERE se.cnty = 'us'
  AND se.st = 'or';
-- Dynamic partition insert: the partition values come from the trailing
-- SELECT columns (se.cnty, se.st).
-- NOTE(review): only name is selected ahead of the partition columns, while
-- employees has more non-partition columns; confirm the staged_employees
-- schema and extend the select list to match.
-- NOTE(review): with every partition column dynamic, Hive may require
-- hive.exec.dynamic.partition.mode=nonstrict; confirm the session settings.
INSERT OVERWRITE TABLE employees
PARTITION (country, state)
SELECT
    se.name,
    se.cnty,
    se.st
FROM staged_employees se;
-- Export query results to a directory on the local filesystem.
-- LOCAL DIRECTORY takes a local path, not an http:// URL.
INSERT OVERWRITE LOCAL DIRECTORY '/tmp/ca_employees'
SELECT
    name,
    salary,
    address
FROM employees se
WHERE se.state = 'ca';
========================================查询数据select========================================
-- Select the first element of the subordinates array.
SELECT
    name,
    subordinates[0]
FROM employees;
-- Select a map value by key.
SELECT
    name,
    deductions["state taxes"]
FROM employees;
-- Select a field of a struct with dot notation.
SELECT
    name,
    address.city
FROM employees;
-- Select columns by regular expression: symbol plus every column whose name
-- matches price.*
-- The pattern must be written in backticks; in single quotes it is a string
-- literal and every row would just return the text 'price.*'.
-- NOTE(review): newer Hive versions may need
-- hive.support.quoted.identifiers=none for regex column specs; confirm.
SELECT
    symbol,
    `price.*`
FROM stocks;
-- LIMIT clause: return at most two rows.
SELECT
    upper(name),
    salary,
    deductions["Federal Taxes"],
    round(salary * (1 - deductions["Federal Taxes"]))
FROM employees
LIMIT 2;
-- CASE ... WHEN ... THEN: classify each salary into a bracket.
-- Each bracket's lower bound is inclusive and its upper bound exclusive;
-- salaries of 100000.0 and above fall through to the ELSE branch.
SELECT
    name,
    salary,
    CASE
        WHEN salary < 50000.0 THEN 'low'
        WHEN salary >= 50000.0 AND salary < 70000.0 THEN 'middle'
        WHEN salary >= 70000.0 AND salary < 100000.0 THEN 'high'
        ELSE 'very high'
    END AS bracket
FROM employees;
-- LIKE: streets whose name ends in 'Ave'.
SELECT
    name,
    address.street
FROM employees
WHERE address.street LIKE '%Ave';
-- GROUP BY with HAVING: average closing price per year, keeping only the
-- years whose average is above 50.0.
SELECT
    year(ymd),
    avg(price_close)
FROM stocks
WHERE exchange2 = 'nasdaq'
  AND symbol = 'aapl'
GROUP BY year(ymd)
HAVING avg(price_close) > 50.0;
-- INNER JOIN: self-join stocks on the date to put the aapl and ibm closing
-- prices for the same day side by side.
SELECT
    a.ymd,
    a.price_close,
    b.price_close
FROM stocks a
INNER JOIN stocks b
    ON a.ymd = b.ymd
WHERE a.symbol = 'aapl'
  AND b.symbol = 'ibm';
-- LEFT OUTER JOIN: every matching stocks row is kept; dividend is NULL on
-- days without a matching dividends row.
-- The symbol filter uses the ticker 'aapl', consistent with the other
-- queries in these notes ('apple' is a company name, not a ticker).
SELECT
    s.ymd,
    s.symbol,
    s.price_close,
    d.dividend
FROM stocks s
LEFT OUTER JOIN dividends d
    ON s.ymd = d.ymd
   AND s.symbol = d.symbol
WHERE s.symbol = 'aapl';
===========使用函数============
-- Apply the built-in ascii() function to the name column.
SELECT ascii(name)
FROM userinfo;