Nutch：读取 Nutch 抓取内容

黎明lm

浏览: 311983 次
性别:
来自: 北京

最近访客更多访客>>

baby孔祥超

jiazhigang

slipper-jay

woshiliukun

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

nutch

nutch

package org.apache.nutch;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Command-line tool that prints every key/value pair stored in a Hadoop
 * {@link SequenceFile} to standard output, one object per line (key first,
 * then value).
 *
 * <p>The file to read is taken from the first command-line argument. When
 * {@link #main(String[])} is started without arguments it falls back to
 * {@link #DEFAULT_FILE}.
 *
 * @param <K> type the keys read from the file are cast to
 * @param <V> type the values read from the file are cast to
 */
public class SequenceFileReader<K, V> extends Configured implements Tool {

    /** Input path used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_FILE =
            "D:/serverpkg/asd/nutch-1.3/index/segments/20110902115211/parse_text/part-00000/data";

    /** {@code false} once the reader has reported that no further key is available. */
    private boolean more = true;
    /** Most recently read key, or {@code null} before the first read / after the last. */
    private K key = null;
    /** Value belonging to {@link #key}, or {@code null} when there is no current key. */
    private V value = null;
    /** Underlying reader; only non-null while {@link #run(String[])} is executing. */
    private SequenceFile.Reader in;

    /**
     * Advances to the next key/value pair.
     *
     * <p>A {@code null} key returned by the underlying reader is treated as the
     * end of the data; from then on this method keeps returning {@code false}
     * and the current key and value are {@code null}.
     *
     * @return {@code true} if a pair was read, {@code false} when the data is exhausted
     * @throws IOException if the underlying reader fails
     * @throws InterruptedException never thrown by this implementation; declared
     *         so existing callers that handle it keep compiling
     * @throws IllegalStateException if no reader is currently open
     */
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!more) {
            return false;
        }
        if (in == null) {
            throw new IllegalStateException("No SequenceFile is open; call run() first");
        }
        // The reader hands back plain Objects; the casts are unchecked because
        // K and V are erased at runtime and chosen by whoever instantiates this class.
        @SuppressWarnings("unchecked")
        K nextKey = (K) in.next(key);
        if (nextKey == null) {
            more = false;
            key = null;
            value = null;
        } else {
            key = nextKey;
            @SuppressWarnings("unchecked")
            V nextValue = (V) in.getCurrentValue(value);
            value = nextValue;
        }
        return more;
    }

    /**
     * @return the key read by the last successful {@link #nextKeyValue()} call,
     *         or {@code null} if there is none
     */
    public K getCurrentKey() {
        return key;
    }

    /**
     * @return the value read by the last successful {@link #nextKeyValue()} call,
     *         or {@code null} if there is none
     */
    public V getCurrentValue() {
        return value;
    }

    /**
     * Opens the sequence file named by {@code arg0[0]}, prints each key and
     * value on its own line, and closes the file again.
     *
     * @param arg0 command-line arguments; the first element is the file path
     * @return {@code 0} on success, {@code -1} if no path argument was given
     * @throws Exception if the file cannot be opened or read
     */
    @Override
    public int run(String[] arg0) throws Exception {
        if (arg0 == null || arg0.length == 0) {
            System.err.println("Usage: SequenceFileReader <sequence-file-path>");
            return -1;
        }

        // Reset iteration state so the same instance can be run more than once.
        more = true;
        key = null;
        value = null;

        Configuration conf = this.getConf();
        in = new SequenceFile.Reader(FileSystem.get(conf), new Path(arg0[0]), conf);
        try {
            while (this.nextKeyValue()) {
                System.out.println(this.getCurrentKey());
                System.out.println(this.getCurrentValue());
            }
        } finally {
            // Always release the underlying stream, even when reading fails.
            in.close();
            in = null;
        }
        return 0;
    }

    /**
     * Entry point. Runs the tool through {@link ToolRunner} and exits with the
     * tool's return code, or with {@code 1} if an exception escapes.
     *
     * @param args optional; {@code args[0]} is the sequence file to dump
     */
    public static void main(String[] args) {
        int res;
        try {
            if (null == args || args.length == 0) {
                args = new String[] {DEFAULT_FILE};
            }
            res = ToolRunner.run(new Configuration(), new SequenceFileReader<Text, Writable>(), args);
        } catch (Exception e) {
            e.printStackTrace();
            // Report failure to the calling process instead of exiting with 0.
            res = 1;
        }
        System.exit(res);
    }
}

0
顶

0
踩

分享到：

nutch1.3 command | hadoop IO(一)

2011-09-02 13:50
浏览 1373
评论(0)
分类:开源软件
查看更多

发表评论

您还没有登录，请您登录后再发表评论。

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论