
Hadoop SequenceFile Writer And Reader

Two short demos follow: SequenceFileWriteDemo writes 100 IntWritable/Text records to a SequenceFile, and SequenceFileReadDemo reads them back, printing each record's starting byte offset and marking records that follow a sync point.

package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;

/**
 * desc: SequenceFileWriter
 * <code>SequenceFileWriteDemo</code>
 * 
 * @author chenwq (irwenqiang@gmail.com)
 * @version 1.0 2012/05/19
 */
public class SequenceFileWriteDemo {
	private static final String[] DATA = { "One, two, buckle my shoe",
			"Three, four, shut the door", "Five, six, pick up sticks",
			"Seven, eight, lay them straight", "Nine, ten, a big fat hen" };

	public static void main(String[] args) throws IOException {
		String uri = args[0];
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(uri), conf);
		Path path = new Path(uri);

		IntWritable key = new IntWritable();
		Text value = new Text();
		SequenceFile.Writer writer = null;
		try {
			/*
			 * createWriter arguments:
			 * fs: the FileSystem to write to
			 * conf: the Configuration object
			 * path: the output file Path
			 * key.getClass() / value.getClass(): the key and value types
			 */
			writer = SequenceFile.createWriter(fs, conf, path, key.getClass(),
					value.getClass());
//			Alternative: block-compressed output (see the size comparison below)
//			writer = SequenceFile.createWriter(fs, conf, path, key.getClass(),
//					value.getClass(), CompressionType.BLOCK);
			for (int i = 0; i < 100; i++) {
				key.set(100 - i);
				value.set(DATA[i % DATA.length]);
				// getLength() gives the current byte offset in the file
				System.out.printf("[%s]\t%s\t%s\n", writer.getLength(), key,
						value);
				writer.append(key, value);
			}
		} finally {
			IOUtils.closeStream(writer);
		}
	}
}
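
The commented-out overload above is presumably how the block-compressed file in the size comparison below was produced. Here is a minimal standalone sketch, with a hypothetical class name SequenceFileBlockWriteDemo; the extra CompressionType argument is the only change from the demo above:

package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;

public class SequenceFileBlockWriteDemo {
	private static final String[] DATA = { "One, two, buckle my shoe",
			"Three, four, shut the door", "Five, six, pick up sticks",
			"Seven, eight, lay them straight", "Nine, ten, a big fat hen" };

	public static void main(String[] args) throws IOException {
		String uri = args[0]; // e.g. mpdemo/seqinputblock
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(uri), conf);
		Path path = new Path(uri);

		IntWritable key = new IntWritable();
		Text value = new Text();
		SequenceFile.Writer writer = null;
		try {
			// The extra CompressionType argument is the only change:
			// BLOCK compresses batches of records together, so the
			// repetitive DATA strings compress very well.
			writer = SequenceFile.createWriter(fs, conf, path, key.getClass(),
					value.getClass(), CompressionType.BLOCK);
			for (int i = 0; i < 100; i++) {
				key.set(100 - i);
				value.set(DATA[i % DATA.length]);
				writer.append(key, value);
			}
		} finally {
			IOUtils.closeStream(writer);
		}
	}
}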

package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * desc: SequenceFileReader
 * <code>SequenceFileReadDemo</code>
 * 
 * @author chenwq (irwenqiang@gmail.com)
 * @version 1.0 2012/05/19
 */
public class SequenceFileReadDemo {
	public static void main(String[] args) throws IOException {
		String uri = args[0];
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(uri), conf);
		Path path = new Path(uri);

		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, path, conf);
			Writable key = (Writable) ReflectionUtils.newInstance(
					reader.getKeyClass(), conf);
			Writable value = (Writable) ReflectionUtils.newInstance(
					reader.getValueClass(), conf);
			long position = reader.getPosition();
			while (reader.next(key, value)) {
				// syncSeen() is true if the last next() passed a sync marker
				String syncSeen = reader.syncSeen() ? "*" : "";
				System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key,
						value);
				position = reader.getPosition(); // beginning of next record
			}
		} finally {
			IOUtils.closeStream(reader);
		}
	}
}
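
Since the reader flags sync points (the * in the output), it is worth showing what they are for: SequenceFile.Reader.sync(position) advances to the first sync marker at or after an arbitrary byte offset, whereas seek(position) requires an exact record boundary. A minimal sketch under that assumption, with a hypothetical class name SequenceFileSyncDemo and an arbitrary mid-file offset of 360 bytes:

package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SequenceFileSyncDemo {
	public static void main(String[] args) throws IOException {
		String uri = args[0];
		Configuration conf = new Configuration();
		FileSystem fs = FileSystem.get(URI.create(uri), conf);

		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, new Path(uri), conf);
			Writable key = (Writable) ReflectionUtils.newInstance(
					reader.getKeyClass(), conf);
			Writable value = (Writable) ReflectionUtils.newInstance(
					reader.getValueClass(), conf);
			// sync() jumps to the first sync marker at or after offset 360;
			// the offset need not fall on a record boundary.
			reader.sync(360);
			long position = reader.getPosition(); // first record after the sync point
			while (reader.next(key, value)) {
				System.out.printf("[%s]\t%s\t%s\n", position, key, value);
				position = reader.getPosition();
			}
		} finally {
			IOUtils.closeStream(reader);
		}
	}
}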

File size comparison after enabling BLOCK compression:

root@ubuntu:~# hadoop fs -ls mpdemo/
Found 2 items
-rw-r--r--   3 root supergroup       4788 2012-05-19 00:11 /user/root/mpdemo/seqinput
-rw-r--r--   3 root supergroup        484 2012-05-19 00:17 /user/root/mpdemo/seqinputblock
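
The block-compressed file is roughly a tenth the size (484 vs. 4788 bytes): BLOCK compression compresses batches of records together, and the five DATA strings each repeat twenty times, so they compress extremely well. Either file can be checked from the command line with hadoop fs -text, which understands SequenceFiles and decompresses them transparently; it should print the records in write order:

root@ubuntu:~# hadoop fs -text mpdemo/seqinputblock | head -3
100	One, two, buckle my shoe
99	Three, four, shut the door
98	Five, six, pick up sticks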