// cc AvroGenericMaxTemperature MapReduce program to find the maximum temperature, creating Avro output
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.AvroUtf8InputFormat;
import org.apache.avro.mapred.Pair;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//vv AvroGenericMaxTemperature
public class AvroGenericMaxTemperature extends Configured implements Tool {
private static final Schema SCHEMA = new Schema.Parser().parse(
"{" +
" \"type\": \"record\"," +
" \"name\": \"WeatherRecord\"," +
" \"doc\": \"A weather reading.\"," +
" \"fields\": [" +
" {\"name\": \"year\", \"type\": \"int\"}," +
" {\"name\": \"temperature\", \"type\": \"int\"}," +
" {\"name\": \"stationId\", \"type\": \"string\"}" +
" ]" +
"}"
);
public static class MaxTemperatureMapper
extends AvroMapper<Utf8, Pair<Integer, GenericRecord>> {
private NcdcRecordParser parser = new NcdcRecordParser();
private GenericRecord record = new GenericData.Record(SCHEMA);
@Override
public void map(Utf8 line,
AvroCollector<Pair<Integer, GenericRecord>> collector,
Reporter reporter) throws IOException {
parser.parse(line.toString());
if (parser.isValidTemperature()) {
record.put("year", parser.getYearInt());
record.put("temperature", parser.getAirTemperature());
record.put("stationId", parser.getStationId());
collector.collect(
new Pair<Integer, GenericRecord>(parser.getYearInt(), record));
}
}
}
public static class MaxTemperatureReducer
extends AvroReducer<Integer, GenericRecord, GenericRecord> {
@Override
public void reduce(Integer key, Iterable<GenericRecord> values,
AvroCollector<GenericRecord> collector, Reporter reporter)
throws IOException {
GenericRecord max = null;
for (GenericRecord value : values) {
if (max == null ||
(Integer) value.get("temperature") > (Integer) max.get("temperature")) {
max = newWeatherRecord(value);
}
}
collector.collect(max);
}
private GenericRecord newWeatherRecord(GenericRecord value) {
GenericRecord record = new GenericData.Record(SCHEMA);
record.put("year", value.get("year"));
record.put("temperature", value.get("temperature"));
record.put("stationId", value.get("stationId"));
return record;
}
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n",
getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
JobConf conf = new JobConf(getConf(), getClass());
conf.setJobName("Max temperature");
FileInputFormat.addInputPath(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
AvroJob.setMapOutputSchema(conf,
Pair.getPairSchema(Schema.create(Schema.Type.INT), SCHEMA));
AvroJob.setOutputSchema(conf, SCHEMA);
conf.setInputFormat(AvroUtf8InputFormat.class);
AvroJob.setMapperClass(conf, MaxTemperatureMapper.class);
AvroJob.setReducerClass(conf, MaxTemperatureReducer.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new AvroGenericMaxTemperature(), args);
System.exit(exitCode);
}
}
// ^^ AvroGenericMaxTemperature
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.AvroUtf8InputFormat;
import org.apache.avro.mapred.Pair;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import specific.WeatherRecord;
public class AvroSpecificMaxTemperature extends Configured implements Tool {
public static class MaxTemperatureMapper
extends AvroMapper<Utf8, Pair<Integer, WeatherRecord>> {
private NcdcRecordParser parser = new NcdcRecordParser();
private WeatherRecord record = new WeatherRecord();
@Override
public void map(Utf8 line,
AvroCollector<Pair<Integer, WeatherRecord>> collector,
Reporter reporter) throws IOException {
parser.parse(line.toString());
if (parser.isValidTemperature()) {
record.year = parser.getYearInt();
record.temperature = parser.getAirTemperature();
record.stationId = new Utf8(parser.getStationId());
collector.collect(
new Pair<Integer, WeatherRecord>(parser.getYearInt(), record));
}
}
}
public static class MaxTemperatureReducer extends
AvroReducer<Integer, WeatherRecord, WeatherRecord> {
@Override
public void reduce(Integer key, Iterable<WeatherRecord> values,
AvroCollector<WeatherRecord> collector,
Reporter reporter) throws IOException {
WeatherRecord max = null;
for (WeatherRecord value : values) {
if (max == null || value.temperature > max.temperature) {
max = newWeatherRecord(value);
}
}
collector.collect(max);
}
}
public static class MaxTemperatureCombiner extends
AvroReducer<Integer, WeatherRecord, Pair<Integer, WeatherRecord>> {
@Override
public void reduce(Integer key, Iterable<WeatherRecord> values,
AvroCollector<Pair<Integer, WeatherRecord>> collector,
Reporter reporter) throws IOException {
WeatherRecord max = null;
for (WeatherRecord value : values) {
if (max == null || value.temperature > max.temperature) {
max = newWeatherRecord(value);
}
}
collector.collect(new Pair<Integer, WeatherRecord>(key, max));
}
}
private static WeatherRecord newWeatherRecord(WeatherRecord value) {
WeatherRecord record = new WeatherRecord();
record.year = value.year;
record.temperature = value.temperature;
record.stationId = value.stationId;
return record;
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.printf("Usage: %s [generic options] <input> <output>\n",
getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
JobConf conf = new JobConf(getConf(), getClass());
conf.setJobName("Max temperature");
FileInputFormat.addInputPath(conf, new Path(args[0]));
FileOutputFormat.setOutputPath(conf, new Path(args[1]));
AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(
Schema.create(Schema.Type.INT), WeatherRecord.SCHEMA$));
AvroJob.setOutputSchema(conf, WeatherRecord.SCHEMA$);
conf.setInputFormat(AvroUtf8InputFormat.class);
AvroJob.setMapperClass(conf, MaxTemperatureMapper.class);
AvroJob.setCombinerClass(conf, MaxTemperatureCombiner.class);
AvroJob.setReducerClass(conf, MaxTemperatureReducer.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new AvroSpecificMaxTemperature(), args);
System.exit(exitCode);
}
}
import java.text.*;
import java.util.Date;
import org.apache.hadoop.io.Text;
public class NcdcRecordParser {
private static final int MISSING_TEMPERATURE = 9999;
private static final DateFormat DATE_FORMAT =
new SimpleDateFormat("yyyyMMddHHmm");
private String stationId;
private String observationDateString;
private String year;
private String airTemperatureString;
private int airTemperature;
private boolean airTemperatureMalformed;
private String quality;
public void parse(String record) {
stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
observationDateString = record.substring(15, 27);
year = record.substring(15, 19);
airTemperatureMalformed = false;
// Remove leading plus sign as parseInt doesn't like them
if (record.charAt(87) == '+') {
airTemperatureString = record.substring(88, 92);
airTemperature = Integer.parseInt(airTemperatureString);
} else if (record.charAt(87) == '-') {
airTemperatureString = record.substring(87, 92);
airTemperature = Integer.parseInt(airTemperatureString);
} else {
airTemperatureMalformed = true;
}
airTemperature = Integer.parseInt(airTemperatureString);
quality = record.substring(92, 93);
}
public void parse(Text record) {
parse(record.toString());
}
public boolean isValidTemperature() {
return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE
&& quality.matches("[01459]");
}
public boolean isMalformedTemperature() {
return airTemperatureMalformed;
}
public boolean isMissingTemperature() {
return airTemperature == MISSING_TEMPERATURE;
}
public String getStationId() {
return stationId;
}
public Date getObservationDate() {
try {
System.out.println(observationDateString);
return DATE_FORMAT.parse(observationDateString);
} catch (ParseException e) {
throw new IllegalArgumentException(e);
}
}
public String getYear() {
return year;
}
public int getYearInt() {
return Integer.parseInt(year);
}
public int getAirTemperature() {
return airTemperature;
}
public String getAirTemperatureString() {
return airTemperatureString;
}
public String getQuality() {
return quality;
}
}
import java.io.File;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroJob;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class AvroProjection extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
if (args.length != 3) {
System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n",
getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
String input = args[0];
String output = args[1];
String schemaFile = args[2];
JobConf conf = new JobConf(getConf(), getClass());
conf.setJobName("Avro projection");
FileInputFormat.addInputPath(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
Schema schema = new Schema.Parser().parse(new File(schemaFile));
AvroJob.setInputSchema(conf, schema);
AvroJob.setMapOutputSchema(conf, schema);
AvroJob.setOutputSchema(conf, schema);
conf.setNumReduceTasks(0);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new AvroProjection(), args);
System.exit(exitCode);
}
}
// cc AvroSort A MapReduce program to sort an Avro data file
import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//vv AvroSort
public class AvroSort extends Configured implements Tool {
static class SortMapper<K> extends AvroMapper<K, Pair<K, K>> {
public void map(K datum, AvroCollector<Pair<K, K>> collector,
Reporter reporter) throws IOException {
collector.collect(new Pair<K, K>(datum, null, datum, null));
}
}
static class SortReducer<K> extends AvroReducer<K, K, K> {
public void reduce(K key, Iterable<K> values,
AvroCollector<K> collector,
Reporter reporter) throws IOException {
for (K value : values) {
collector.collect(value);
}
}
}
@Override
public int run(String[] args) throws Exception {
if (args.length != 3) {
System.err.printf(
"Usage: %s [generic options] <input> <output> <schema-file>\n",
getClass().getSimpleName());
ToolRunner.printGenericCommandUsage(System.err);
return -1;
}
String input = args[0];
String output = args[1];
String schemaFile = args[2];
JobConf conf = new JobConf(getConf(), getClass());
conf.setJobName("Avro sort");
FileInputFormat.addInputPath(conf, new Path(input));
FileOutputFormat.setOutputPath(conf, new Path(output));
Schema schema = new Schema.Parser().parse(new File(schemaFile));
AvroJob.setInputSchema(conf, schema);
Schema intermediateSchema = Pair.getPairSchema(schema, schema);
AvroJob.setMapOutputSchema(conf, intermediateSchema);
AvroJob.setOutputSchema(conf, schema);
AvroJob.setMapperClass(conf, SortMapper.class);
AvroJob.setReducerClass(conf, SortReducer.class);
JobClient.runJob(conf);
return 0;
}
public static void main(String[] args) throws Exception {
int exitCode = ToolRunner.run(new AvroSort(), args);
System.exit(exitCode);
}
}
// ^^ AvroSort
分享到:
相关推荐
在Hadoop生态系统中,`FileSystem API`是核心组件之一,它提供了一套接口,使得开发者可以方便地进行文件系统的操作,如...在学习和使用过程中,理解其核心概念、常用方法以及如何与其他Hadoop组件结合是至关重要的。
在深入探讨Lucene Field之前,我们先来...在实际应用中,结合其他相关技术,如Solr(基于Lucene的搜索引擎服务器)、Netty(高性能网络通信框架)和Hadoop(大数据处理框架),可以构建更复杂、高效的搜索解决方案。
22. 学习新技术:关注云计算、人工智能、大数据等领域的新技术,如Kubernetes、Spark、Hadoop等,拓宽视野。 23. 持续集成/持续部署(CI/CD):了解Jenkins、GitLab CI/CD等工具,实现自动化构建和部署,提高开发效率...
一步一步跟我学习lucene是对近期做lucene索引的总结,大家有问题的话联系本人的Q-Q: 891922381,同时本人新建Q-Q群:106570134(lucene,solr,netty,hadoop),如蒙加入,不胜感激,大家共同探讨,本人争取每日一博,...
- **终身学习的理念**:在快速变化的技术领域中,持续学习是跟上行业发展步伐的重要手段。通过不断学习新技术、新工具,IT从业者能够保持竞争力并适应市场需求的变化。 #### 乐于课堂实践 - **理论与实践相结合**:...
FourInOne(中文名字“四不像”)是一个四合一分布式计算框架,在写这个框架之前,我也看了老外写的其他开源框架,也对分布式计算进行了长时间的思考,当我们把复杂的hadoop当作一门学科学习时,似乎忘记了我们想...
以IBM的蓝云计算平台为例,其设计上大量使用了IBM的先进大规模计算技术,包含了虚拟化软件Xen和PowerVM,Linux操作系统映像,以及开源软件Hadoop。这样的设计使得数据中心可以使用类似于互联网的计算环境,从而更...
Fourinone(中文名字“四不像”)是一个四合一分布式计算框架,在写这个框架之前,我对分布式计算进行了长时间的思考,也看了老外写的其他开源框架,当我们把复杂的hadoop当作一门学科学习时,似乎忘记了我们想解决问题...
淘宝Fourinone(中文名字“四不像”)是一个四合一分布式计算框架,在写这个框架之前,我对分布式计算进行了长时间的思考,也看了老外写的其他开源框架,当我们把复杂的hadoop当作一门学科学习时,似乎忘记了我们想...
- **易于理解**:内容组织清晰,语言通俗易懂,即使是初学者也能轻松跟上学习节奏。 #### 四、读者反馈与评价 - **JavaLobby**:“如果你计划在应用中使用 Lucene 或者对 Lucene 能为你做什么感兴趣,这绝对是必读...