HIVE 处理日志，自定义inputformat 完整版

wbj0110

浏览: 1645521 次
性别:
来自: 上海

最近访客更多访客>>

一往无前bhz

ninja2006

loginboot

u012363178

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Hive

hive

之所以需要自定义 InputFormat，是因为 Hive 的 fields terminated by 不支持用 '||||' 这样的多字符字符串作为字段分隔符。

将你的inputformat类打成jar包，如MyInputFormat.jar
将MyInputFormat.jar放到 hive/lib里，然后就可以建表了
假设你的inputFormat类路径是com.hive.myinput
则建表语句为：create table tbname(name string,id int, ...) stored as INPUTFORMAT 'com.hive.myinput' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

HiveIgnoreKeyTextOutputFormat是系统自带的outputformat类，你也可以自定义

由于 Hive 是基于 Hadoop 集群运行的，所以 hadoop/lib 里面也必须放入 MyInputFormat.jar。

此功能需要两个类：ClickstreamInputFormat 和 ClickstreamRecordReader

[java]view plaincopyprint? 
package com.jd.cloud.clickstore;  
  
import java.io.IOException;    
  
import org.apache.hadoop.io.LongWritable;    
import org.apache.hadoop.io.Text;    
import org.apache.hadoop.mapred.FileSplit;    
import org.apache.hadoop.mapred.InputSplit;    
import org.apache.hadoop.mapred.JobConf;    
import org.apache.hadoop.mapred.JobConfigurable;    
import org.apache.hadoop.mapred.RecordReader;    
import org.apache.hadoop.mapred.Reporter;    
import org.apache.hadoop.mapred.TextInputFormat;  
  
/**  
 * 自定义hadoop的 org.apache.hadoop.mapred.InputFormat  
 *   
 * @author winston  
 *   
 */    
public class ClickstreamInputFormat extends TextInputFormat implements    
        JobConfigurable {    
    
    public RecordReader<LongWritable, Text> getRecordReader(    
            InputSplit genericSplit, JobConf job, Reporter reporter)    
            throws IOException {    
    
        reporter.setStatus(genericSplit.toString());    
        return new ClickstreamRecordReader((FileSplit) genericSplit,job);    
    }    
}    

[java]view plaincopyprint? 
package com.jd.cloud.clickstore;  
  
import java.io.IOException;  
import java.io.InputStream;  
import org.apache.hadoop.conf.Configuration;  
import org.apache.hadoop.fs.FSDataInputStream;  
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.io.compress.CompressionCodec;  
import org.apache.hadoop.io.compress.CompressionCodecFactory;  
import org.apache.hadoop.mapred.FileSplit;  
import org.apache.hadoop.util.LineReader;  
import org.apache.hadoop.mapred.RecordReader;  
  
  
public class ClickstreamRecordReader implements  
        RecordReader<LongWritable, Text> {  
  
  
    private CompressionCodecFactory compressionCodecs = null;  
    private long start;  
    private long pos;  
    private long end;  
    private LineReader lineReader;  
    int maxLineLength;  
  
    public ClickstreamRecordReader(FileSplit inputSplit, Configuration job)  
            throws IOException {  
        maxLineLength = job.getInt("mapred.ClickstreamRecordReader.maxlength",  
                Integer.MAX_VALUE);  
        start = inputSplit.getStart();  
        end = start + inputSplit.getLength();  
        final Path file = inputSplit.getPath();  
        compressionCodecs = new CompressionCodecFactory(job);  
        final CompressionCodec codec = compressionCodecs.getCodec(file);  
  
        // Open file and seek to the start of the split  
        FileSystem fs = file.getFileSystem(job);  
        FSDataInputStream fileIn = fs.open(file);  
        boolean skipFirstLine = false;  
        if (codec != null) {  
            lineReader = new LineReader(codec.createInputStream(fileIn), job);  
            end = Long.MAX_VALUE;  
        } else {  
            if (start != 0) {  
                skipFirstLine = true;  
                --start;  
                fileIn.seek(start);  
            }  
            lineReader = new LineReader(fileIn, job);  
        }  
        if (skipFirstLine) {  
            start += lineReader.readLine(new Text(), 0,  
                    (int) Math.min((long) Integer.MAX_VALUE, end - start));  
        }  
        this.pos = start;  
    }  
  
    public ClickstreamRecordReader(InputStream in, long offset, long endOffset,  
            int maxLineLength) {  
        this.maxLineLength = maxLineLength;  
        this.lineReader = new LineReader(in);  
        this.start = offset;  
        this.pos = offset;  
        this.end = endOffset;  
    }  
  
    public ClickstreamRecordReader(InputStream in, long offset, long endOffset,  
            Configuration job) throws IOException {  
        this.maxLineLength = job.getInt(  
                "mapred.ClickstreamRecordReader.maxlength", Integer.MAX_VALUE);  
        this.lineReader = new LineReader(in, job);  
        this.start = offset;  
        this.pos = offset;  
        this.end = endOffset;  
    }  
  
    public LongWritable createKey() {  
        return new LongWritable();  
    }  
  
    public Text createValue() {  
        return new Text();  
    }  
  
    /** 
     * Reads the next record in the split. get usefull fields from the raw nginx 
     * log. 
     *  
     * @param key 
     *            key of the record which will map to the byte offset of the 
     *            record's line 
     * @param value 
     *            the record in text format 
     * @return true if a record existed, false otherwise 
     * @throws IOException 
     */  
    public synchronized boolean next(LongWritable key, Text value)  
            throws IOException {  
        // Stay within the split  
        while (pos < end) {  
            key.set(pos);  
            int newSize = lineReader.readLine(value, maxLineLength,  
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),  
                            maxLineLength));  
  
            if (newSize == 0)  
                return false;  
  
            String str = value.toString().toLowerCase()  
                    .replaceAll("\\@\\_\\@", "\001");  
            value.set(str);  
            pos += newSize;  
  
            if (newSize < maxLineLength)  
                return true;  
        }  
  
        return false;  
    }  
  
    public float getProgress() {  
        if (start == end) {  
            return 0.0f;  
        } else {  
            return Math.min(1.0f, (pos - start) / (float) (end - start));  
        }  
    }  
  
    public synchronized long getPos() throws IOException {  
        return pos;  
    }  
  
    public synchronized void close() throws IOException {  
        if (lineReader != null)  
            lineReader.close();  
    }  
      
    // 测试 输出  
    //public static void main(String ags[]){  
    //    String str1 ="123@_@abcd@_@fk".replaceAll("\\@\\_\\@", "\001");  
    //    System.out.println(str1);  
    //}  
}  

1. 将源码上传到 Hive 服务器，用 javac 编译

[plain]view plaincopyprint? 
# Compile the files matched by the glob with the current directory and the
# hadoop-common, hadoop-core and commons-logging jars on the classpath.
javac -cp ./:/usr/lib/hadoop/hadoop-common.jar:/home/op1/hadoop/hadoop-core-1.0.3.jar:/usr/lib/hadoop/lib/commons-logging-1.1.1.jar */**/*/*/*  

2.JAR 打包类文件

[java]view plaincopyprint? 
# Package the compiled classes. -C switches into the source root first so the
# archive entries start at the package directory (com/jd/cloud/clickstore/...)
# instead of carrying the absolute path prefix home/op1/uerdwdb/src/.
jar -cf ClickstreamInputFormat.jar -C /home/op1/uerdwdb/src/ .

3. 将 jar 包复制到 Hive/lib 和 Hadoop/lib 文件夹内

4.Hive 创建表命令

[sql]view plaincopyprint? 
-- Create table hive_text whose files are read through the custom
-- ClickstreamInputFormat and written with HiveIgnoreKeyTextOutputFormat.
-- The column name add is back-quoted, presumably because ADD is a Hive keyword.
create table hive_text(num int,name string,`add` string)  
stored as INPUTFORMAT 'com.jd.cloud.clickstore.ClickstreamInputFormat'   
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'   
-- Path from which the table's data is read.
location '/home/op1/uerdwdb/text.txt';  

http://blog.csdn.net/iquicksandi/article/details/8533699

大家可以加我个人微信号：scccdgf

或者关注soledede的微信公众号：soledede

微信公众号：

分享到：

基于hive的日志分析系统 | hive支持sql大全

2014-08-29 15:41
浏览 1224
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论