存入hbase的方法

刘小小尘

浏览: 67572 次
性别:
来自: 上海

最近访客更多访客>>

小白到此一游

zhenggm

蕃薯耀

a1002323289

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

1、通过MapReduce的方式存入HBase。下面的示例只有map阶段（reduce任务数设为0），在reduce阶段写入HBase的做法也是一样的

代码如下：

import java.io.IOException; 

import org.apache.commons.logging.Log; 

import org.apache.commons.logging.LogFactory; 

import org.apache.hadoop.conf.Configuration; 

import org.apache.hadoop.conf.Configured; 

import org.apache.hadoop.hbase.HBaseConfiguration; 

import org.apache.hadoop.hbase.client.HTable; 

import org.apache.hadoop.hbase.client.Put; 

import org.apache.hadoop.hbase.util.Bytes; 

import org.apache.hadoop.io.LongWritable; 

import org.apache.hadoop.io.NullWritable; 

import org.apache.hadoop.io.Text; 

import org.apache.hadoop.mapreduce.Job; 

import org.apache.hadoop.mapreduce.Mapper; 

import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; 

import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; 

import org.apache.hadoop.util.GenericOptionsParser; 

import org.apache.hadoop.util.Tool; 

import org.apache.hadoop.util.ToolRunner; 

public class HBaseImport extends Configured implements Tool{ 

static final Log LOG = LogFactory.getLog(HBaseImport.class); 

public static final String JOBNAME = "MRImport "; 

public static class Map extends Mapper<LongWritable , Text, NullWritable, NullWritable>{ 

Configuration configuration = null; 

HTable xTable = null; 

private boolean wal = true; 

static long count = 0; 

@Override 

protected void cleanup(Context context) throws IOException, 

InterruptedException { 

// TODO Auto-generated method stub 

super.cleanup(context); 

xTable.flushCommits(); 

xTable.close(); 

} 

@Override 

protected void map(LongWritable key, Text value, Context context) 

throws IOException, InterruptedException { 

String all[] = value.toString().split("/t"); 

If(all.length==2){ 

put = new Put(Bytes.toBytes(all[0]))); put.add(Bytes.toBytes("xxx"),Bytes.toBytes("20110313"),Bytes.toBytes(all[1])); 

} 

if (!wal) { 

put.setWriteToWAL(false); 

} 

xTable.put(put); 

if ((++count % 100)==0) { 

context.setStatus(count +" DOCUMENTS done!"); 

context.progress(); 

System.out.println(count +" DOCUMENTS done!"); 

} 

} 

@Override 

protected void setup(Context context) throws IOException, 

InterruptedException { 

// TODO Auto-generated method stub 

super.setup(context); 

configuration = context.getConfiguration(); 

xTable = new HTable(configuration,"testKang"); 

xTable.setAutoFlush(false); 

xTable.setWriteBufferSize(12*1024*1024); 

wal = true; 

} 

} 

@Override 

public int run(String[] args) throws Exception { 

String input = args[0]; 

Configuration conf = HBaseConfiguration.create(getConf()); 

conf.set("hbase.master", "m0:60000"); 

Job job = new Job(conf,JOBNAME); 

job.setJarByClass(HBaseImport.class); 

job.setMapperClass(Map.class); 

job.setNumReduceTasks(0); 

job.setInputFormatClass(TextInputFormat.class); 

TextInputFormat.setInputPaths(job, input); 

job.setOutputFormatClass(NullOutputFormat.class); 

return job.waitForCompletion(true)?0:1; 

} 

public static void main(String[] args) throws IOException { 

Configuration conf = new Configuration(); 

String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 

int res = 1; 

try { 

res = ToolRunner.run(conf, new HBaseImport (), otherArgs); 

} catch (Exception e) { 

e.printStackTrace(); 

} 

System.exit(res); 

} 

}

2、通过Java程序入库
Java多线程读取本地磁盘上的文件，以HTable.put(List<Put>)批量提交的方式完成数据写入（下面的示例只启动了一个线程，实际使用时可以为每个文件各启动一个线程）

import java.io.BufferedReader; 

import java.io.File; 

import java.io.FileReader; 

import java.io.IOException; 

import java.util.ArrayList; 

import org.apache.hadoop.conf.Configuration; 

import org.apache.hadoop.hbase.HBaseConfiguration; 

import org.apache.hadoop.hbase.client.HTable; 

import org.apache.hadoop.hbase.client.Put; 

public class InsertContactJava { 

// Start time in seconds since the epoch; assigned at the top of main().
public static long startTime; 

public static long rowkey = 0; // starting rowkey 

// Number of rows entered per commit.
// NOTE(review): not referenced in the visible code; insert_one hard-codes its own batch size.
public static final int lineCount = 100000; 

public static String tableName = "usercontact_kang"; // name of the destination table 

// Number of columns in the table.
// NOTE(review): not referenced in the visible code.
public static int countLie = 8; 

/**
 * Entry point: records and prints the start time (in seconds), then runs the import of
 * {@code /run/jar/123} on a separate thread. An {@link IOException} raised by the import
 * is printed on that thread and not propagated.
 */
public static void main(String[] args) throws IOException {
    long nowSeconds = System.currentTimeMillis() / 1000;
    startTime = nowSeconds;
    System.out.println("start time = " + startTime);

    Runnable importTask = new Runnable() {
        @Override
        public void run() {
            try {
                insert_one("/run/jar/123");
            } catch (IOException failure) {
                failure.printStackTrace();
            }
        }
    };

    Thread worker = new Thread(importTask);
    worker.start();
}

/**
 * Reads the text file at {@code path} line by line and writes one row per line into the
 * HBase table named by {@code tableName}, submitting the puts in batches.
 *
 * <p>Each line is expected to hold at least three tab-separated fields:
 * {@code first<TAB>second<TAB>timescount}, where {@code first} and {@code second} are
 * each {@code account~protocolid~domain}. All values go into column family
 * {@code ucvalue}. A line that does not match this layout raises an unchecked exception
 * (array index or number format) and aborts the import, as in the original listing.
 *
 * <p>When finished, prints the number of lines read and the elapsed time in seconds
 * measured from {@code startTime}.
 *
 * @param path local file to import
 * @throws IOException if the file cannot be read or an HBase write fails
 */
public static void insert_one(String path) throws IOException {
    // Number of puts collected before they are sent to HBase in one call.
    final int batchSize = 10000;

    Configuration conf = HBaseConfiguration.create();
    HTable table = new HTable(conf, tableName);
    int count = 0;

    try {
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            ArrayList<Put> list = new ArrayList<Put>();
            byte[] family = "ucvalue".getBytes();

            String tmp;
            while ((tmp = br.readLine()) != null) {
                // Split on a real tab; the listing's "/t" matched the literal text "/t".
                String arr_value[] = tmp.split("\t", 10);
                String first[] = arr_value[0].split("~", 5);
                String second[] = arr_value[1].split("~", 5);

                String rowname = getIncreasRowKey();
                String firstaccount = first[0];
                String firstprotocolid = first[1];
                String firstdomain = first[2];
                String inserttime = Utils.getToday("yyyyMMdd");
                String secondaccount = second[0];
                String secondprotocolid = second[1];
                String seconddomain = second[2];
                // Round-trip through Integer so a non-numeric count is rejected.
                String timescount = Integer.valueOf(arr_value[2]).toString();

                Put p = new Put(rowname.getBytes());
                p.add(family, "FIRSTACCOUNT".getBytes(), firstaccount.getBytes());
                p.add(family, "FIRSTDOMAIN".getBytes(), firstdomain.getBytes());
                p.add(family, "FIRSTPROTOCOLID".getBytes(), firstprotocolid.getBytes());
                p.add(family, "INSERTTIME".getBytes(), inserttime.getBytes());
                p.add(family, "SECONDACCOUNT".getBytes(), secondaccount.getBytes());
                p.add(family, "SECONDDOMAIN".getBytes(), seconddomain.getBytes());
                p.add(family, "SECONDPROTOCOLID".getBytes(), secondprotocolid.getBytes());
                p.add(family, "TIMESCOUNT".getBytes(), timescount.getBytes());
                list.add(p);
                count++;

                // Flush AFTER adding the current row. The original flushed in place of
                // parsing, so the line read at each batch boundary was silently lost.
                if (list.size() >= batchSize) {
                    table.put(list);
                    table.flushCommits();
                    list.clear();
                }
            }

            // Send whatever is left over from the last partial batch.
            if (list.size() > 0) {
                table.put(list);
                table.flushCommits();
            }
        } finally {
            br.close();
        }
    } finally {
        table.close();
    }

    System.out.println("total = " + count);

    long endTime = System.currentTimeMillis() / 1000;
    long costTime = endTime - startTime;
    System.out.println("end time = " + endTime);
    System.out.println(path + ": cost time = " + costTime);
}

两种方式的优劣比较

MapReduce方式：

开始会很快，但是由于mr和hbase竞争资源，到一个特定的时间点会变很慢

Java程序方式：

多客户端，多线程同时入库，目前看来是最好的方式，client和regionserver分开，硬盘读写分开，瓶颈只在网络和内存上。咨询了一些牛人，大多推荐这种方式，并且一定要多客户端，多线程。

分享到：