
Nutch 1.4 Injector in Detail

 

org.apache.nutch.crawl.Injector is the URL injector object and the entry point of a Nutch crawl: it seeds the crawl db with an initial list of URLs.
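
Before diving into the source, it helps to see the injector's inputs. The run() method below expects a crawl db path and a directory of seed files, so a typical invocation and seed file look roughly like this (crawl/crawldb and urls are example paths, and userType is the illustrative custom key from the class javadoc):

bin/nutch inject crawl/crawldb urls

# urls/seed.txt -- one URL per line, fields separated by tabs; '#' lines are ignored
http://www.nutch.org/	nutch.score=10	nutch.fetchInterval=2592000	userType=open_source
http://www.apache.org/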

The code is as follows:

 

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.nutch.crawl;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;

// Commons Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import org.apache.nutch.net.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

/** This class takes a flat file of URLs and adds them to the database of pages to be
 * crawled.  Useful for bootstrapping the system.
 * The URL files contain one URL per line, optionally followed by custom metadata
 * separated by tabs, with the metadata key separated from the corresponding value by '='. <br>
 * Note that some metadata keys are reserved: <br>
 * - <i>nutch.score</i> : allows setting a custom score for a specific URL <br>
 * - <i>nutch.fetchInterval</i> : allows setting a custom fetch interval for a specific URL <br>
 * e.g. http://www.nutch.org/ \t nutch.score=10 \t nutch.fetchInterval=2592000 \t userType=open_source
 **/
public class Injector extends Configured implements Tool {
  public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
  
  /** metadata key reserved for setting a custom score for a specific URL */
  public static String nutchScoreMDName = "nutch.score";
  /** metadata key reserved for setting a custom fetchInterval for a specific URL */
  public static String nutchFetchIntervalMDName = "nutch.fetchInterval";

  /** Normalize and filter injected urls. */
  public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
    private URLNormalizers urlNormalizers;
    private int interval;
    private float scoreInjected;
    private JobConf jobConf;
    private URLFilters filters;
    private ScoringFilters scfilters;
    private long curTime;

    public void configure(JobConf job) {
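      // Per-mapper setup. The relevant settings:
      //   db.fetch.interval.default - default re-fetch interval in seconds (2592000 s = 30 days)
      //   db.score.injected         - initial score assigned to newly injected URLs
      //   injector.current.time     - timestamp set by inject() so all mappers share one fetch time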
      this.jobConf = job;
      urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
      interval = jobConf.getInt("db.fetch.interval.default", 2592000);
      filters = new URLFilters(jobConf);
      scfilters = new ScoringFilters(jobConf);
      scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
      curTime = job.getLong("injector.current.time", System.currentTimeMillis());
    }

    public void close() {}

    public void map(WritableComparable key, Text value,
                    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
      String url = value.toString();              // value is line of text

      // ignore comment lines that start with '#'
      if (url != null && url.trim().startsWith("#")) {
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      Map<String,String> metadata = new TreeMap<String,String>();
      
      // split off any tab-separated metadata appended to the URL
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find the separator between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {}
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {}
          } else {
            metadata.put(metaname, metavalue);
          }
        }
      }
      try {
        // normalize the URL according to the inject-scope normalizers
        // (see http://peigang.iteye.com/blog/1468984 for details)
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        // filter the URL against the configured URL filters
        // (see http://peigang.iteye.com/blog/1469108 for details)
        url = filters.filter(url);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) { LOG.warn("Skipping " + url + ":" + e); }
        url = null;
      }
      if (url != null) {                          // if it passes
        value.set(url);                           // collect it
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_INJECTED, customInterval);
        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          // let the scoring filters adjust the injected score
          // (see http://peigang.iteye.com/blog/1469143 for details)
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Cannot filter injected score for url " + url
                + ", using default (" + e.getMessage() + ")");
          }
        }
        // emit the <url, CrawlDatum> pair
        output.collect(value, datum);
      }
    }
  }

  /** Combine multiple new entries for a url. */
  public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
    public void configure(JobConf job) {}    
    public void close() {}

    private CrawlDatum old = new CrawlDatum();
    private CrawlDatum injected = new CrawlDatum();
    
    public void reduce(Text key, Iterator<CrawlDatum> values,
                       OutputCollector<Text, CrawlDatum> output, Reporter reporter)
      throws IOException {
      boolean oldSet = false;
      // iterate over all entries collected for this URL and set the CrawlDatum status
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;
      if (oldSet) res = old; // don't overwrite existing value
      else res = injected;

      output.collect(key, res);
    }
  }

  public Injector() {}
  
  public Injector(Configuration conf) {
    setConf(conf);
  }
  
  /**
   * Initialize the crawl database from a directory of seed URL files.
   * @param crawlDb	path of the crawl database directory
   * @param urlDir	directory of flat files listing the URLs to crawl
   * @throws IOException
   */
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    // temporary directory: rooted at mapred.temp.dir if set, otherwise at "." (the current directory)
    Path tempDir =
      new Path(getConf().get("mapred.temp.dir", ".") +
               "/inject-temp-"+
               Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());          // create the job object
    sortJob.setJobName("inject " + urlDir);             // set the job name
    FileInputFormat.addInputPath(sortJob, urlDir);      // set the input path (the seed url dir)
    sortJob.setMapperClass(InjectMapper.class);         // set the mapper class

    FileOutputFormat.setOutputPath(sortJob, tempDir);   // map output goes to the temp dir
    sortJob.setOutputFormat(SequenceFileOutputFormat.class); // write the output as a SequenceFile
    sortJob.setOutputKeyClass(Text.class);              // map output KEY type
    sortJob.setOutputValueClass(CrawlDatum.class);      // map output VALUE type
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    /**
     * sortJob reads the seed URLs and writes the resulting <url, CrawlDatum>
     * entries to tempDir; mergeJob then reads tempDir and merges those entries
     * into the existing crawl db.
     */
    
    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);    // read the mapper output from the temp dir
    mergeJob.setReducerClass(InjectReducer.class);      // set the reducer class
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    /**
     * Delete the temporary directory once the merge is complete.
     */
    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
    System.exit(res);
  }
  
  public int run(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("Usage: Injector <crawldb> <url_dir>");
      return -1;
    }
    try {
      inject(new Path(args[0]), new Path(args[1]));
      return 0;
    } catch (Exception e) {
      LOG.error("Injector: " + StringUtils.stringifyException(e));
      return -1;
    }
  }

}
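
Once inject() returns, the new entries live under <crawlDb>/current, and bin/nutch readdb crawl/crawldb -stats is the usual quick check. Programmatically, a minimal sketch along these lines can dump what was injected, assuming a local single-reducer crawl db (the part-00000 path is an assumption about the MapFile layout, not a fixed name):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;

public class CrawlDbDump {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    FileSystem fs = FileSystem.get(conf);
    // assumed path: the data file of the MapFile written by the merge job
    Path data = new Path("crawl/crawldb/current/part-00000/data");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, data, conf);
    Text key = new Text();
    CrawlDatum value = new CrawlDatum();
    while (reader.next(key, value)) {       // iterate <url, CrawlDatum> records
      System.out.println(key + "\t" + value);
    }
    reader.close();
  }
}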
 

The public void inject(Path crawlDb, Path urlDir) method is what the Crawl class calls as the first step of a full crawl.
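
A minimal sketch of the same call from user code, assuming Nutch 1.4 on the classpath (the class name and both paths are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.crawl.Injector;
import org.apache.nutch.util.NutchConfiguration;

public class InjectDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    Injector injector = new Injector(conf);
    // example paths: any crawl db directory and seed url directory will do
    injector.inject(new Path("crawl/crawldb"), new Path("urls"));
  }
}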
