
Hadoop-MapReduce


The shuffle process in MapReduce:

For the underlying principles, see:
http://weixiaolu.iteye.com/blog/1474172
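In brief, the shuffle moves map output to the reduce side in four steps: partition, sort, optional combine, and group. The partition step decides which reduce task receives each key. As a minimal sketch (the class name WordPartitioner is invented for illustration), a custom partitioner that mirrors what the default HashPartitioner does would look like this:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

//Hypothetical partitioner: route each word to a reduce task by hash,
//which is effectively what the default HashPartitioner does
public class WordPartitioner extends Partitioner<Text, IntWritable> {
	@Override
	public int getPartition(Text key, IntWritable value, int numPartitions) {
		//mask off the sign bit so the result is non-negative
		return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}

It would be registered with job.setPartitionerClass(WordPartitioner.class), matching the shuffle hooks shown in the WordCount driver below.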


 

Data types:

1. All of these data types implement the Writable interface, so that data of these types can be serialized for network transfer and file storage (a minimal sketch follows the list below).

2. Basic data types

    BooleanWritable, ByteWritable, DoubleWritable

    FloatWritable, IntWritable, LongWritable

    Text: stores text in UTF-8 format

    NullWritable: used when the key or value in a <key,value> pair is empty (see the note after this list)
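Two of these points are easier to see in code. Below is a minimal sketch of a custom type implementing Writable (the class name PairWritable is invented for illustration); write() and readFields() must serialize and deserialize the fields in the same order:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

//Hypothetical custom value type holding an int count and a long sum
public class PairWritable implements Writable {
	private int count;
	private long sum;

	public void write(DataOutput out) throws IOException {
		out.writeInt(count);	//serialize fields in a fixed order
		out.writeLong(sum);
	}

	public void readFields(DataInput in) throws IOException {
		count = in.readInt();	//deserialize in the same order
		sum = in.readLong();
	}
}

A type used as a key would implement WritableComparable instead, which adds compareTo() for the sort phase. As for NullWritable, it is a singleton obtained via NullWritable.get(); a mapper that only needs to emit values can call context.write(NullWritable.get(), value).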

MapReduce WordCount (the MapReduce programming model follows the structure of the program below):

package com.fb.hadoop.mapreduce;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCount extends Configured implements Tool{
	//Map class: tokenize each input line and emit <word, 1> pairs
	public static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
		private Text mapOutputKey=new Text();
		private final static IntWritable mapOutputValue=new IntWritable(1);
		@Override
		public void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			 
			 String lineValue=value.toString();
			 StringTokenizer stringTokenizer=new StringTokenizer(lineValue);
			 while(stringTokenizer.hasMoreTokens()){
				 String wordValue=stringTokenizer.nextToken();
				 //set the output key to the current word
				 mapOutputKey.set(wordValue);
				 //emit <word, 1>
				 context.write(mapOutputKey, mapOutputValue);
			 }			
		}		
	}
	//Reduce class: sum the counts for each word
	public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
		private IntWritable outputValue=new IntWritable();
		@Override
		public void reduce(Text key, Iterable<IntWritable> values, Context context)
				throws IOException, InterruptedException {
			 int sum=0;
			 //sum the counts for this word
			 for(IntWritable value:values){
				 sum+=value.get();
			 }
			 outputValue.set(sum);
			 context.write(key, outputValue);			 
		}
		
	}
	//Driver
    public int run(String[] args) throws Exception{
    	//get configuration
    	Configuration conf=getConf();
    	//create Job
    	Job job=Job.getInstance(conf,this.getClass().getSimpleName());
    	//run jar
    	job.setJarByClass(this.getClass());
    	//set Job
    	//input -> map -> reduce -> output
    	//1.input
    	Path inPath=new Path(args[0]);
    	FileInputFormat.addInputPath(job, inPath);
    	//2.map
    	job.setMapperClass(WordCountMapper.class);
    	job.setMapOutputKeyClass(Text.class);
    	job.setMapOutputValueClass(IntWritable.class);
    	 
    	//3.shuffle
 /******************shuffle****************************
    	//1.partitioner
    	job.setPartitionerClass(cls);
    	//2.sort
    	job.setSortComparatorClass(cls);
    	//3.optional, combiner
    	job.setCombinerClass(cls);
    	//4.group
    	job.setGroupingComparatorClass(cls);
 ******************shuffle****************************/
    	//4.reduce
    	job.setReducerClass(WordCountReducer.class);
    	job.setOutputKeyClass(Text.class);
    	job.setOutputValueClass(IntWritable.class);
    	//set reduce number
    	   //job.setNumReduceTasks(10);
    	
    	//5.output
    	Path outPath=new Path(args[1]);
    	FileOutputFormat.setOutputPath(job, outPath);
    	
    	//submit job
    	boolean isSuccess=job.waitForCompletion(true);
    	
    	return isSuccess?0:1;
    	
    }
    public static void main(String[] args) throws Exception {
    	Configuration conf=new Configuration();
		int status=ToolRunner.run(conf, new WordCount(),args);
		System.exit(status);
	}
}
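Since summing counts is associative and commutative, WordCount can reuse its reducer as the combiner in the shuffle block above; enabling that hook is a one-liner (a sketch, using the class name from the listing):

    	//run the reduce logic on each map task's local output before the
    	//shuffle, cutting the data transferred to the reduce tasks
    	job.setCombinerClass(WordCountReducer.class);

Packaged as a jar, the job is launched with something like hadoop jar wordcount.jar com.fb.hadoop.mapreduce.WordCount <input-path> <output-path>, where the two path arguments become args[0] and args[1] in run().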

    

 


 
 
