
Nutch 1.4 Injector in Detail

 

org.apache.nutch.crawl.Injector is the URL injector object and the entry point of a Nutch crawl: it seeds the crawl db with an initial list of URLs.
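
Before diving into the source, it helps to see the injector's inputs. The run() method below expects a crawl db path and a directory of seed files, so a typical invocation and seed file look roughly like this (crawl/crawldb and urls are example paths, and userType is the illustrative custom key from the class javadoc):

bin/nutch inject crawl/crawldb urls

# urls/seed.txt -- one URL per line, fields separated by tabs; '#' lines are ignored
http://www.nutch.org/	nutch.score=10	nutch.fetchInterval=2592000	userType=open_source
http://www.apache.org/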

The code is as follows:

 

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import org.apache.nutch.net.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/** This class takes a flat file of URLs and adds them to the database of pages to be
 * crawled.  Useful for bootstrapping the system.
 * The URL files contain one URL per line, optionally followed by custom metadata
 * separated by tabs, with the metadata key separated from the corresponding value by '='. <br>
 * Note that some metadata keys are reserved: <br>
 * - <i>nutch.score</i> : allows setting a custom score for a specific URL <br>
 * - <i>nutch.fetchInterval</i> : allows setting a custom fetch interval for a specific URL <br>
 * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
 **/
public class Injector extends Configured implements Tool {
  public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
  
  /** metadata key reserved for setting a custom score for a specific URL */
  public static String nutchScoreMDName = "nutch.score";
  /** metadata key reserved for setting a custom fetchInterval for a specific URL */
  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";

  /** Normalize and filter injected urls. */
  public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
    private URLNormalizers urlNormalizers;
    private int interval;
    private float scoreInjected;
    private JobConf jobConf;
    private URLFilters filters;
    private ScoringFilters scfilters;
    private long curTime;

    public void configure(JobConf job) {
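      // Per-mapper setup. The relevant settings:
      //   db.fetch.interval.default - default re-fetch interval in seconds (2592000 s = 30 days)
      //   db.score.injected         - initial score assigned to newly injected URLs
      //   injector.current.time     - timestamp set by inject() so all mappers share one fetch time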
      this.jobConf = job;
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      interval = jobConf.getInt("db.fetch.interval.default", 2592000);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
      curTime = job.getLong("injector.current.time", System.currentTimeMillis());
    }

    public void close() {}

    public void map(WritableComparable key, Text value,
                    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
      String url = value.toString();              // value is line of text

      // ignore comment lines that start with '#'
      if (url != null && url.trim().startsWith("#")) {
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      Map<String,String> metadata = new TreeMap<String,String>();
      
      // split off any tab-separated metadata appended to the URL
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find the separator between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {}
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {}
          } else {
            metadata.put(metaname, metavalue);
          }
        }
      }
      try {
        // normalize the URL according to the inject-scope normalizers
        // (see http://peigang.iteye.com/blog/1468984 for details)
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        // filter the URL against the configured URL filters
        // (see http://peigang.iteye.com/blog/1469108 for details)
        url = filters.filter(url);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); }
        url = null;
      }
      if (url != null) {                          // if it passes
        value.set(url);                           // collect it
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          // let the scoring filters adjust the injected score
          // (see http://peigang.iteye.com/blog/1469143 for details)
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Cannot filter injected score for url " + url
                + ", using default (" + e.getMessage() + ")");
          }
        }
        // emit the <url, CrawlDatum> pair
        output.collect(value, datum);
      }
    }
  }

  /** Combine multiple new entries for a url. */
  public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    public void configure(JobConf job) {}    
    public void close() {}

    private CrawlDatum old = new CrawlDatum();
    private CrawlDatum injected = new CrawlDatum();
    
    public void reduce(Text key, Iterator<CrawlDatum> values,
                       OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
      boolean oldSet = false;
      // iterate over all entries collected for this URL and set the CrawlDatum status
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;
      if (oldSet) res = old; // don't overwrite existing value
      else res = injected;

      output.collect(key, res);
    }
  }

  public Injector() {}
  
  public Injector(Configuration conf) {
    setConf(conf);
  }
  
  /**
   * Initialize the crawl database from a directory of seed URL files.
   * @param crawlDb	path of the crawl database directory
   * @param urlDir	directory of flat files listing the URLs to crawl
   * @throws IOException
   */
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    // temporary directory: rooted at mapred.temp.dir if set, otherwise at "." (the current directory)
    Path tempDir =
      new Path(getConf().get("mapred.temp.dir", ".") +
               "/inject-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());          // create the job object
    sortJob.setJobName("inject " + urlDir);             // set the job name
    FileInputFormat.addInputPath(sortJob, urlDir);      // set the input path (the seed url dir)
    sortJob.setMapperClass(InjectMapper.class);         // set the mapper class

    FileOutputFormat.setOutputPath(sortJob, tempDir);   // map output goes to the temp dir
    sortJob.setOutputFormat(SequenceFileOutputFormat.class); // write the output as a SequenceFile
    sortJob.setOutputKeyClass(Text.class);              // map output KEY type
    sortJob.setOutputValueClass(CrawlDatum.class);      // map output VALUE type
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    /**
     * sortJob reads the seed URLs and writes the resulting <url, CrawlDatum>
     * entries to tempDir; mergeJob then reads tempDir and merges those entries
     * into the existing crawl db.
     */
    
    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);    // read the mapper output from the temp dir
    mergeJob.setReducerClass(InjectReducer.class);      // set the reducer class
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    /**
     * Delete the temporary directory once the merge is complete.
     */
    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
    System.exit(res);
  }
  
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: Injector <crawldb> <url_dir>");
      return -1;
    }
    try {
      inject(new Path(args[0]), new Path(args[1]));
      return 0;
    } catch (Exception e) {
      LOG.error("Injector: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

}
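
Once inject() returns, the new entries live under <crawlDb>/current, and bin/nutch readdb crawl/crawldb -stats is the usual quick check. Programmatically, a minimal sketch along these lines can dump what was injected, assuming a local single-reducer crawl db (the part-00000 path is an assumption about the MapFile layout, not a fixed name):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDbDump {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);
    // assumed path: the data file of the MapFile written by the merge job
    Path data = new Path("crawl/crawldb/current/part-00000/data");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, data, conf);
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    while (reader.next(key, value)) {       // iterate <url, CrawlDatum> records
      System.out.println(key + "\t" + value);
    }
    reader.close();
  }
}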
 

The public void inject(Path crawlDb, Path urlDir) method is what the Crawl class calls as the first step of a full crawl.
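
A minimal sketch of the same call from user code, assuming Nutch 1.4 on the classpath (the class name and both paths are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.util.NutchConfiguration;

public class InjectDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Injector injector = new Injector(conf);
    // example paths: any crawl db directory and seed url directory will do
    injector.inject(new Path("crawl/crawldb"), new Path("urls"));
  }
}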
