hadoop 求平均时间

xjward

浏览: 35826 次

最近访客更多访客>>

xx5333

liushicheng1

u014083580

xgf502429862

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

hadoop

sqoop 求订单完成的平均时间，精确到0.1天

package hdfs.demo3;


import hdfs.constants.DemoLineInputFormat;
import hdfs.constants.Utils;

import java.io.IOException;
import java.math.BigDecimal;
import java.sql.Timestamp;
import java.util.Date;
import java.util.Vector;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


public class Demo3 {  
 
	//解析 hdfs中的数据
	//DAYTIME 在下标为 51位置的值
	//RCV_DATE 在下标为 47位置的值
	//DISPOSEID 在下标为 24位置的值
	  public static class MapHDFS extends Mapper<LongWritable,Text, LongWritable, Text>{
		  	LongWritable	rkey	= new LongWritable();
		  	Text	rval	= new Text();
			long time; //一次交易花费的时间
			String[] strs=new String[1];
			public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
				strs=value.toString().split("\1");
				  
				Date startTime= Utils.getDate(strs[51]);
				Date stopTime= Utils.getDate(strs[47]);
				if(startTime==null||stopTime==null){
					return ;
				}
				time=stopTime.getTime()/1000-startTime.getTime()/1000;
				if(time<0){ //错误数据
					return ;
				}
				rkey.set(time%1000);  //毫秒 变 秒
				rval.set(time/3600+"") ; // 精确到小时 
				context.write(rkey, rval); // 输出到Combiner中 并组合
			}
	  }
	  
	  static class Combin extends Reducer<LongWritable,Text, LongWritable, Text>{
		  LongWritable	wkey	= new LongWritable();  
		  Text	val	= new Text();
		  public  void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			  int sum =0;
			  int count=0;
			  for(Text v:values){
				  count++;
				  sum+=Long.valueOf(v.toString());
			  }
			  val.set("#"+count+"#"+sum);
			  context.write(wkey,val);
		  }
	  }

	public static class Reduce extends Reducer<LongWritable, Text, Text, Text> {
		Text writKey = new Text();
		Text writValue = new Text();
		public void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			long count=0;
			long sum = 0;
			String [] strs=new String [2];
			for (Text element : values) {
				strs=element.toString().split("#");
				count+=Long.valueOf(strs[1]); // key
				sum+=Long.valueOf(strs[2]); // value
			}
			Timestamp tt=new Timestamp(sum/count*3600*1000);
			writKey.set(count+"");
			writValue.set(getDate(tt.getTime())+"\t"+getDayDate(tt.getTime()));
			context.write(writKey, writValue);
		}

	}
	
	/**
	 *  取天 精确到0.1天
	 * @param tim 时间戳 毫秒级别
	 * @return
	 */
	private static String getDayDate(Long tim){
		String str =tim+":时间：\t";
		str+=new BigDecimal(tim/(1000*60*60*24.0)).setScale(2, BigDecimal.ROUND_HALF_UP)+"天 ";
		return str;
	}
	/**
	 * @param args
	 * @throws IOException
	 */
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		
		/**
		 * ./sqoop import --connect jdbc:oracle:thin:@10.10.35.65:1521/crm_standby  
		 *  --username CRMDEV --password crmdev --table ORDR_MAIN 
		 * --where 'rownum<1000' --fields-terminated-by '\0x001' --lines-terminated-by '\0x002'
		 */
		String outputPath="hdfs://172.19.121.125:9000/user/admin/ORDR_MAIN";
		new Demo3().avgTime(outputPath,conf);
	}
	 
	
	private void avgTime(String path,Configuration conf){
			Path outPath=new Path(path+"_avg");
			try {
				FileSystem fs=outPath.getFileSystem(conf);
				fs.deleteOnExit(outPath);
			} catch (IOException e1) {
//				e1.printStackTrace();
			}
		try {
			Job job = new Job(conf, "tAvgTime");
			
			// 设置map
			job.setMapperClass(MapHDFS.class);
			job.setCombinerClass(Combin.class);
			job.setReducerClass(Reduce.class);
			
			// 设置输出类型 (付款时间和结账时间的  时间戳值)
			job.setOutputKeyClass(LongWritable.class);
			job.setOutputValueClass(Text.class);
			
			job.setInputFormatClass(DemoLineInputFormat.class);
			job.setOutputFormatClass(TextOutputFormat.class);
			
			FileInputFormat.addInputPath(job, new Path(path));
			// 设置输入输出路径
			FileOutputFormat.setOutputPath(job, new Path(path+"_avg"));
			
			job.waitForCompletion(true);
			} catch (Exception e) {
				e.printStackTrace();
			}
					
	}
	
	
}

main 方法中贴出的 sqoop 语句指定了自定义的字段分隔符和行分隔符，因此需要自定义 LineReader 来切分文件：

参考读取text文件的方法

复制 org.apache.hadoop.mapreduce.lib.input.TextInputFormat 为 DemoLineInputFormat

复制 org.apache.hadoop.mapreduce.lib.input.LineRecordReader 为 DemoLineRecordReader

复制 org.apache.hadoop.util.LineReader 为 DemoLineReader

job 设置读取文件的 InputFormat

job.setInputFormatClass(DemoLineInputFormat.class);

修改他们的引用关系:

DemoLineInputFormat 中的 LineRecordReader 修改为DemoLineRecordReader

DemoLineRecordReader 中的 LineReader 修改为DemoLineReader

修改 DemoLineReader 的常量 LF 为 sqoop 导入语句中指定的行分隔符 '\2'（即 --lines-terminated-by '\0x002'）

private static final byte LF = '\2'; // default is '\n' // line terminator of the HDFS file

分享到：

hadoop 二次排序插入数据库 | 正则表达式之获取匹配与非获取匹配

2013-02-27 17:11
浏览 805
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论