hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹

退役的龙弟弟

浏览: 460045 次
性别:
来自: 北京

最近访客更多访客>>

cuker919

hui963966800

xyz86868

一个java程序员

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

hadoop

hadoop1.2.1中使用MultipleOutputs将结果输出到多个文件或文件夹

使用步骤主要有三步：

1、在reduce或map类中创建MultipleOutputs对象，将结果输出

class reduceStatistics extends Reducer<Text, IntWritable, Text, IntWritable>{

	//将结果输出到多个文件或多个文件夹
	private MultipleOutputs<Text,IntWritable> mos;
    //创建对象
    protected void setup(Context context) throws IOException,InterruptedException {
        mos = new MultipleOutputs<Text, IntWritable>(context);
     }
    	
        //关闭对象
	protected void cleanup(Context context) throws IOException,InterruptedException {
        mos.close();
	}
}

2、在map或reduce方法中使用MultipleOutputs对象输出数据，代替context.write()

/**
 * Sums the counts for one word and routes the total to a named output
 * chosen by the word itself. Words other than "hello", "world" and
 * "hadoop" are not written anywhere.
 *
 * @param key     the word being reduced
 * @param values  the partial counts emitted for that word
 * @param context the reducer context (not written to directly; output goes through mos)
 */
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for(IntWritable value : values){
			sum += value.get();
		}
		// Convert the key once instead of once per comparison.
		String word = key.toString();
		System.out.println("word:" + word + "     sum = " + sum);
		IntWritable total = new IntWritable(sum);

		// Write through MultipleOutputs instead of context.write().
		if("hello".equals(word)){
			mos.write("hello", key, total);
		}else if("world".equals(word)){
			mos.write("world", key, total);
		}else if("hadoop".equals(word)){
			// baseOutputPath is used as the file name prefix and the framework
			// appends "-r-nnnnn", so the prefix must include the file name to
			// produce hadoop/hadoopfile-r-00000 (a bare "hadoop/" would yield
			// hadoop/-r-00000).
			mos.write("hadoopfile", key, total, "hadoop/hadoopfile");
		}

	}

3、在创建job时，定义附加的输出文件，这里的文件名称与第二步设置的文件名相同

// Register the additional named outputs; each name must match the first
// argument passed to mos.write(...) in the map/reduce method.
			for (String namedOutput : new String[] {"hello", "world", "hadoopfile"}) {
				MultipleOutputs.addNamedOutput(job, namedOutput, TextOutputFormat.class, Text.class, IntWritable.class);
			}

完整代码：

package com.ru.hadoop.wordcount;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.lib.MultipleOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Progressable;

/**
 * Driver for the word-count job that demonstrates MultipleOutputs: it clears
 * the output directory, configures the job with three named outputs and runs
 * it to completion.
 */
public class WordCount2 extends Configured{

	/**
	 * Configures and runs the job. The process exits with status 1 when the
	 * job fails or an exception prevents it from running; on success the
	 * method returns normally.
	 *
	 * @param args command-line arguments (unused; input and output paths are fixed)
	 */
	public static void main(String[] args) {
		String in = "/home/nange/work/test/word/";
		String out = "hdfs://localhost:9000/hdfs/test/wordcount/out/";

		boolean succeeded = false;
		try {
			// Remove any previous output directory before the job is submitted.
			WordCount2 wc2 = new WordCount2();
			wc2.removeDir(out);

			Job job = new Job(new Configuration(), "wordcount Job");
			// Tell Hadoop which jar contains the mapper/reducer classes so that
			// they can be located when the job runs on a cluster.
			job.setJarByClass(WordCount2.class);
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			job.setMapperClass(mapperString.class);
			job.setReducerClass(reduceStatistics.class);

			// Register the additional named outputs used by the reducer.
			MultipleOutputs.addNamedOutput(job,"hello",TextOutputFormat.class,Text.class,IntWritable.class);
			MultipleOutputs.addNamedOutput(job,"world",TextOutputFormat.class,Text.class,IntWritable.class);
			MultipleOutputs.addNamedOutput(job,"hadoopfile",TextOutputFormat.class,Text.class,IntWritable.class);

			FileInputFormat.addInputPath(job, new Path(in));
			FileOutputFormat.setOutputPath(job, new Path(out));
			succeeded = job.waitForCompletion(true);
		} catch (IOException e) {
			e.printStackTrace();
		} catch (URISyntaxException e) {
			e.printStackTrace();
		} catch (ClassNotFoundException e) {
			e.printStackTrace();
		} catch (InterruptedException e) {
			// Restore the interrupt flag so callers further up can observe it.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		}

		// Report failure through the exit status instead of always exiting 0.
		if (!succeeded) {
			System.exit(1);
		}
	}

	/**
	 * Deletes the given path, recursively, on the file system at
	 * hdfs://localhost:9000.
	 *
	 * @param filePath the path to delete
	 * @throws IOException        if the file system cannot be reached or the delete fails
	 * @throws URISyntaxException if the file system URI is malformed
	 */
	public void removeDir(String filePath) throws IOException, URISyntaxException{
		String url = "hdfs://localhost:9000";
		FileSystem fs  = FileSystem.get(new URI(url), new Configuration());
		// Use the explicit recursive overload; delete(Path) is deprecated.
		fs.delete(new Path(filePath), true);
	}
}


/**
 * Mapper that splits each input line on single spaces and emits
 * (word, 1) for every non-empty token.
 * @author nange
 *
 */
class mapperString extends Mapper<LongWritable, Text, Text, IntWritable>{
	// Pre-compiled separator pattern: a single space.
	public static Pattern PATTERN = Pattern.compile(" ");
	// Output key/value objects, reused across calls.
	Text K = new Text();
	IntWritable V = new IntWritable(1);

	/**
	 * Emits (word, 1) for each word of the line.
	 *
	 * @param key     the input key (unused)
	 * @param value   one line of input text
	 * @param context the mapper context the pairs are written to
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		String line = value.toString();
		String[] words = PATTERN.split(line);
		System.out.println("********" + line);
		for(String word : words){
			// Blank lines and leading/consecutive spaces yield empty tokens;
			// skip them so the empty string is not counted as a word.
			if(word.isEmpty()){
				continue;
			}
			K.set(word);
			context.write(K, V);
		}
	}
}

/**
 * Reducer that totals the count of each word and writes the totals for
 * "hello", "world" and "hadoop" to separate named outputs via
 * MultipleOutputs. Other words are not written anywhere.
 * @author nange
 *
 */
class reduceStatistics extends Reducer<Text, IntWritable, Text, IntWritable>{

	// Helper used to write results to multiple files or directories.
	private MultipleOutputs<Text,IntWritable> mos;

	/**
	 * Creates the MultipleOutputs object for this task's context.
	 *
	 * @param context the reducer context the MultipleOutputs object is bound to
	 */
	@Override
	protected void setup(Context context) throws IOException,InterruptedException {
		mos = new MultipleOutputs<Text, IntWritable>(context);
	}

	/**
	 * Sums the counts for one word and routes the total to the named output
	 * chosen by the word itself.
	 *
	 * @param key     the word being reduced
	 * @param values  the partial counts emitted for that word
	 * @param context the reducer context (not written to directly; output goes through mos)
	 */
	@Override
	protected void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for(IntWritable value : values){
			sum += value.get();
		}
		// Convert the key once instead of once per comparison.
		String word = key.toString();
		System.out.println("word:" + word + "     sum = " + sum);
		IntWritable total = new IntWritable(sum);

		// Write through MultipleOutputs instead of context.write().
		if("hello".equals(word)){
			mos.write("hello", key, total);
		}else if("world".equals(word)){
			mos.write("world", key, total);
		}else if("hadoop".equals(word)){
			// baseOutputPath is used as the file name prefix and the framework
			// appends "-r-nnnnn", so the prefix must include the file name to
			// produce hadoop/hadoopfile-r-00000 (a bare "hadoop/" would yield
			// hadoop/-r-00000).
			mos.write("hadoopfile", key, total, "hadoop/hadoopfile");
		}

	}

	/**
	 * Closes the MultipleOutputs object created in {@link #setup}.
	 *
	 * @param context the reducer context (unused here)
	 */
	@Override
	protected void cleanup(Context context) throws IOException,InterruptedException {
		mos.close();
	}
}

分享到：

mapreduce的reduce输出文件进行压缩 | hadoop调度器

2014-04-24 09:06
浏览 8651
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

hadoop1.2.1 MultipleOutputs将结果输出到多个文件或文件夹

评论

发表评论

相关推荐

MapReduce编程接口体系结构

hadoop优化

hadoop小文件处理以及解决方案（压缩技术）

mapreduce单元测试

HDFS中PathFilter类对路径进行过滤

Hadoop DistributedCache详解

mapreduce的reduce输出文件进行压缩

hadoop调度器

HDFS block块的副本存放策略

mapreduce驱动默认设置

mapreduce数据类型与java数据类型对应

MapReduce执行流程

hadoop常用指令

hdfs 架构

hadoop查看和离开安全模式

hbase的常用指令

hbase0.96.1.1安装配置

hadoop1.2.1安装配置

hadoop2.2.0伪分布式安装

eclipse远程连接hadoop服务器

最近访客更多访客>>