Nutch1.2增加插件例子[转]

nhy520

浏览: 961033 次
性别:
来自: 北京

最近访客更多访客>>

yunzhu

k0521klb

remote_silence

prog

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

nutch 1.2 系统学习

Apache lucene Hadoop Ant XML

今尝试下给nutch1.2增加一个插件，于是到官网找了个例子，链接如下：

http://wiki.apache.org/nutch/WritingPluginExample-0.9

这个例子实现的是网站推荐功能：把关键字写在 meta 标签的 content 属性里，当别人搜索这个关键字时，你推荐的网站会在搜索结果中排在前面。要实现推荐，必须在你的网页上加上

view plaincopy to clipboardprint?
<meta name="recommended" content="plugins" />
<meta name="recommended" content="plugins" />

这条属性才能被插件识别。

由于这个例子是基于nutch0.9的，而1.2和0.9有些区别，所以需要修改一些代码。步骤如下：

1.插件开发

1.1在src/plugin中新建一个文件夹recommended

1.2.在recommended目录下新建plugin.xml和build.xml文件（文件名为小写），内容如下：

plugin.xml

view plaincopy to clipboardprint?
<?xml version="1.0" encoding="UTF-8"?>
<!-- Plugin descriptor for the "recommended" plugin: declares the plugin jar
     and registers three extensions (parse filter, indexing filter, query filter). -->
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- Jar that holds this plugin's classes; every name in it is exported. -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- Registers RecommendedParser at the HtmlParseFilter extension point. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- Registers RecommendedIndexer at the IndexingFilter extension point. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- Registers RecommendedQueryFilter at the QueryFilter extension point.
        NOTE(review): the "fields" parameter presumably names the index field
        this filter handles; confirm against the Nutch QueryFilter documentation. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
        <parameter name="fields" value="recommended"/>
        </implementation>
   </extension>

</plugin>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Plugin descriptor for the "recommended" plugin: declares the plugin jar
     and registers three extensions (parse filter, indexing filter, query filter). -->
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- Jar that holds this plugin's classes; every name in it is exported.
           This section was missing from this copy of the listing. -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- Registers RecommendedParser at the HtmlParseFilter extension point. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- Registers RecommendedIndexer at the IndexingFilter extension point. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended identifier filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- Registers RecommendedQueryFilter at the QueryFilter extension point. -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
      </implementation>
   </extension>

</plugin>

build.xml

view plaincopy to clipboardprint?
<?xml version="1.0"?>
<!-- Ant build file for the "recommended" plugin; the default target is jar-core. -->
<project name="recommended" default="jar-core">

<!-- Pulls in the build definitions from the parent plugin directory. -->
<import file="../build-plugin.xml"/>

<!-- Runs the "jar" target of the lib-xml plugin, without inheriting properties. -->
<target name="deps-jar">
   <ant target="jar" inheritall="false" dir="../lib-xml"/>
</target>

<!-- Path made of every lib-xml jar found under ${nutch.root}/build. -->
<path id="plugin.deps">
   <fileset dir="${nutch.root}/build">
     <include name="**/lib-xml/*.jar" />
   </fileset>
</path>

<!-- Runs "deploy" for lib-xml, nutch-extensionpoints and protocol-file
     (presumably what the unit test needs at runtime; confirm). -->
<target name="deps-test">
   <ant target="deploy" inheritall="false" dir="../lib-xml"/>
   <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
   <ant target="deploy" inheritall="false" dir="../protocol-file"/>
</target>

<!-- Not inside any target: creates the test data directory and copies the
     HTML fixture used by the JUnit test into it. -->
<mkdir dir="${build.test}/data"/>
<copy file="data/recommended.html" todir="${build.test}/data"/>
</project>
<?xml version="1.0"?>
<!-- Ant build file for the "recommended" plugin; the default target is jar-core.
     The opening project tag, the import and the deps-jar target were missing
     from this copy of the listing, which left it malformed. -->
<project name="recommended" default="jar-core">

<!-- Pulls in the build definitions from the parent plugin directory. -->
<import file="../build-plugin.xml"/>

<!-- Runs the "jar" target of the lib-xml plugin, without inheriting properties. -->
<target name="deps-jar">
   <ant target="jar" inheritall="false" dir="../lib-xml"/>
</target>

<!-- Path made of every lib-xml jar found under ${nutch.root}/build. -->
<path id="plugin.deps">
   <fileset dir="${nutch.root}/build">
     <include name="**/lib-xml/*.jar" />
   </fileset>
</path>

<!-- Runs "deploy" for lib-xml, nutch-extensionpoints and protocol-file. -->
<target name="deps-test">
   <ant target="deploy" inheritall="false" dir="../lib-xml"/>
   <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
   <ant target="deploy" inheritall="false" dir="../protocol-file"/>
</target>

<!-- Not inside any target: creates the test data directory and copies the
     HTML fixture used by the JUnit test into it. -->
<mkdir dir="${build.test}/data"/>
<copy file="data/recommended.html" todir="${build.test}/data"/>
</project>

1.3.在recommended目录下建立\src\java\org\apache\nutch\parse\recommended目录。

1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类，内容如下：

RecommendedIndexer.java

view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;

// JDK import
import java.util.logging.Logger;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Nutch imports
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

// Lucene imports
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;

/**
 * Indexing filter that copies the "recommended" value found in a page's parse
 * data into a stored, non-analyzed, boosted field of the document being indexed.
 */
public class RecommendedIndexer implements IndexingFilter {

  public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());

  /** Name of both the parse metadata key and the index field. */
  private static final String FIELD_NAME = "recommended";

  /** Boost applied to the index field. */
  private static final float FIELD_BOOST = 5.0f;

  private Configuration conf;

  public RecommendedIndexer() {
  }

  /**
   * Adds the recommended field to {@code doc} when the parse data carries a
   * "recommended" value; otherwise the document is returned untouched.
   *
   * @param doc     document being built for the index
   * @param parse   parse output of the page; its data is queried for the value
   * @param url     page URL (not used here)
   * @param datum   crawl datum (not used here)
   * @param inlinks inbound links (not used here)
   * @return the same {@code doc} instance
   * @throws IndexingException declared by the interface; not thrown by this code
   */
  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {

    final String value = parse.getData().getMeta(FIELD_NAME);
    if (value == null) {
      // Nothing to index for this page.
      return doc;
    }

    final Field field =
        new Field(FIELD_NAME, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
    field.setBoost(FIELD_BOOST);
    doc.add(FIELD_NAME, field);
    LOG.info("Added " + value + " to the recommended Field");
    return doc;
  }

  @Override
  public void addIndexBackendOptions(Configuration conf) {
    // Intentionally empty: no backend options are registered.
  }

  public Configuration getConf() {
    return this.conf;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }
}
package org.apache.nutch.parse.recommended;

// JDK import
import java.util.logging.Logger;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// Nutch imports
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;

// Lucene imports
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;

/**
 * Indexing filter that copies the "recommended" value found in a page's parse
 * data into a stored, non-analyzed, boosted field of the document being indexed.
 */
public class RecommendedIndexer implements IndexingFilter {

  public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());

  /** Name of both the parse metadata key and the index field. */
  private static final String FIELD_NAME = "recommended";

  /** Boost applied to the index field. */
  private static final float FIELD_BOOST = 5.0f;

  private Configuration conf;

  public RecommendedIndexer() {
  }

  /**
   * Adds the recommended field to {@code doc} when the parse data carries a
   * "recommended" value; otherwise the document is returned untouched.
   *
   * @param doc     document being built for the index
   * @param parse   parse output of the page; its data is queried for the value
   * @param url     page URL (not used here)
   * @param datum   crawl datum (not used here)
   * @param inlinks inbound links (not used here)
   * @return the same {@code doc} instance
   * @throws IndexingException declared by the interface; not thrown by this code
   */
  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks)
      throws IndexingException {

    final String value = parse.getData().getMeta(FIELD_NAME);
    if (value == null) {
      // Nothing to index for this page.
      return doc;
    }

    final Field field =
        new Field(FIELD_NAME, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
    field.setBoost(FIELD_BOOST);
    doc.add(FIELD_NAME, field);
    LOG.info("Added " + value + " to the recommended Field");
    return doc;
  }

  @Override
  public void addIndexBackendOptions(Configuration conf) {
    // Intentionally empty: no backend options are registered.
  }

  public Configuration getConf() {
    return this.conf;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }
}

RecommendedParser.java

view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;

// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;

// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// W3C imports
import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that looks for a
 * {@code <meta name="recommended" content="...">} tag among the page's general
 * meta tags and, when present, stores its content in the content metadata of
 * the page's parse under {@link #META_RECOMMENDED_NAME}.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());

  private Configuration conf;

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME = "recommended";

  /**
   * Scans the document's general meta tags for a recommended meta tag.
   *
   * @param content     fetched content; its URL selects the parse to update
   * @param parseResult parse results produced so far for this content
   * @param metaTags    meta tags extracted from the HTML document
   * @param doc         DOM of the document (not used here)
   * @return the same {@code parseResult} instance, updated in place when a
   *         recommendation was found
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    // Direct lookup replaces the former scan over every property name.
    Properties generalMetaTags = metaTags.getGeneralTags();
    String recommendation = generalMetaTags.getProperty(META_RECOMMENDED_NAME);

    if (recommendation == null) {
      LOG.info("No Recommendation");
      return parseResult;
    }
    LOG.info("Found a Recommendation for " + recommendation);

    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // Avoid a NullPointerException when no parse is registered for this URL.
      LOG.warn("No parse found for " + content.getUrl()
          + "; recommendation not stored");
      return parseResult;
    }

    LOG.info("Adding Recommendation for " + recommendation);
    parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);
    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

}
package org.apache.nutch.parse.recommended;

// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;

// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

// W3C imports
import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that looks for a
 * {@code <meta name="recommended" content="...">} tag among the page's general
 * meta tags and, when present, stores its content in the content metadata of
 * the page's parse under {@link #META_RECOMMENDED_NAME}.
 */
public class RecommendedParser implements HtmlParseFilter {

  private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());

  private Configuration conf;

  /** The Recommended meta data attribute name */
  public static final String META_RECOMMENDED_NAME = "recommended";

  /**
   * Scans the document's general meta tags for a recommended meta tag.
   *
   * @param content     fetched content; its URL selects the parse to update
   * @param parseResult parse results produced so far for this content
   * @param metaTags    meta tags extracted from the HTML document
   * @param doc         DOM of the document (not used here)
   * @return the same {@code parseResult} instance, updated in place when a
   *         recommendation was found
   */
  @Override
  public ParseResult filter(Content content, ParseResult parseResult,
      HTMLMetaTags metaTags, DocumentFragment doc) {

    // Direct lookup replaces the former scan over every property name.
    Properties generalMetaTags = metaTags.getGeneralTags();
    String recommendation = generalMetaTags.getProperty(META_RECOMMENDED_NAME);

    if (recommendation == null) {
      LOG.info("No Recommendation");
      return parseResult;
    }
    LOG.info("Found a Recommendation for " + recommendation);

    Parse parse = parseResult.get(content.getUrl());
    if (parse == null) {
      // Avoid a NullPointerException when no parse is registered for this URL.
      LOG.warn("No parse found for " + content.getUrl()
          + "; recommendation not stored");
      return parseResult;
    }

    LOG.info("Adding Recommendation for " + recommendation);
    parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);
    return parseResult;
  }

  public void setConf(Configuration conf) {
    this.conf = conf;
  }

  public Configuration getConf() {
    return this.conf;
  }

}

RecommendedQueryFilter.java

view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;

import org.apache.nutch.searcher.FieldQueryFilter;

import java.util.logging.Logger;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Query filter for the "recommended" field. All behaviour comes from
 * {@code FieldQueryFilter}; this class only supplies the field name and the
 * value {@code 5f} (presumably the boost; confirm against FieldQueryFilter).
 */
public class RecommendedQueryFilter extends FieldQueryFilter {
    // Log under this class's own name; it was previously created with
    // RecommendedParser.class, which attributed messages to the wrong class.
    private static final Log LOG = LogFactory.getLog(RecommendedQueryFilter.class.getName());

    /** Creates the filter for the "recommended" field. */
    public RecommendedQueryFilter() {
        super("recommended", 5f);
        LOG.info("Added a recommended query");
    }

}
package org.apache.nutch.parse.recommended;

import org.apache.nutch.searcher.FieldQueryFilter;

import java.util.logging.Logger;

// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Query filter for the "recommended" field. All behaviour comes from
 * {@code FieldQueryFilter}; this class only supplies the field name and the
 * value {@code 5f} (presumably the boost; confirm against FieldQueryFilter).
 */
public class RecommendedQueryFilter extends FieldQueryFilter {
    // Log under this class's own name; it was previously created with
    // RecommendedParser.class, which attributed messages to the wrong class.
    private static final Log LOG = LogFactory.getLog(RecommendedQueryFilter.class.getName());

    /** Creates the filter for the "recommended" field. */
    public RecommendedQueryFilter() {
        super("recommended", 5f);
        LOG.info("Added a recommended query");
    }

}

1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行：

view plaincopy to clipboardprint?
<ant dir="recommended" target="deploy" />
<ant dir="recommended" target="deploy" />

1.6.运行cmd，切换到recommended目录，运行ant命令编译，插件开发完成。

1.7 让nutch识别你的插件

在conf/nutch-site.xml 中增加以下属性（即在 plugin.includes 属性的值中加入 recommended；原文此处的配置片段已缺失）

2.编写插件测试类

2.1 在src/plugin/recommended目录下新建一个data目录，在data目录下新建一个html文件recommended.html，内容如下：

view plaincopy to clipboardprint?
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>recommended</title>
    <meta name="generator" content="TextMate http://macromates.com/">
    <meta name="author" content="Ricardo J. Méndez">
    <!-- Test fixture: the unit test expects the value "recommended-content" from the tag below. -->
    <meta name="recommended" content="recommended-content"/>
    
</head>
<body>
    Recommended meta tag test.
</body>
</html>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>recommended</title>
    <meta name="generator" content="TextMate http://macromates.com/">
    <meta name="author" content="Ricardo J. Méndez">
    <!-- Test fixture: the unit test expects the value "recommended-content" from the tag below. -->
    <meta name="recommended" content="recommended-content"/>
    
</head>
<body>
    Recommended meta tag test.
</body>
</html>

2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录，增加TestRecommendedParser.java类，内容如下：

view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

import java.util.Properties;
import java.io.*;
import java.net.URL;

import junit.framework.TestCase;

/*
* Loads test page recommended.html and verifies that the recommended
* meta tag has recommended-content as its value.
*
*/
/**
 * Loads the test page recommended.html, parses it with the "parse-html"
 * extension and verifies that the "recommended" content metadata equals
 * "recommended-content".
 */
public class TestRecommendedParser extends TestCase {

  /**
   * Directory holding the HTML fixture. It can be overridden with the
   * {@code test.data} system property; the original hard-coded location is
   * kept as the fallback so existing setups keep working.
   */
  private static final File testDir =
      new File(System.getProperty("test.data",
          "H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data"));

  /** Runs the page check against the bundled fixture. */
  public void testPages() throws Exception {
    pageTest(new File(testDir, "recommended.html"), "http://foo.com/",
             "recommended-content");
  }

  /**
   * Parses {@code file} as HTML fetched from {@code url} and asserts on the
   * resulting "recommended" metadata value.
   *
   * @param file           HTML file to parse
   * @param url            URL to associate with the content
   * @param recommendation expected value of the "recommended" metadata
   * @throws Exception if the file cannot be read or parsing fails
   */
  public void pageTest(File file, String url, String recommendation)
      throws Exception {

    String contentType = "text/html";

    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    InputStream in = new FileInputStream(file);
    try {
      byte[] buffer = new byte[1024];
      int i;
      while ((i = in.read(buffer)) != -1) {
        out.write(buffer, 0, i);
      }
    } finally {
      // Close the stream even when reading fails.
      in.close();
    }
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    Content content =
        new Content(url, url, bytes, contentType, new Metadata(), conf);

    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content)
        .get(content.getUrl());
    assertNotNull("parse-html produced no parse for " + url, parse);

    Metadata metadata = parse.getData().getContentMeta();

    assertEquals(recommendation, metadata.get("recommended"));
    // Compare by value; the former reference comparison (!=) was always true.
    assertFalse("somesillycontent".equals(metadata.get("recommended")));
  }

}
package org.apache.nutch.parse.recommended;

import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;

import java.util.Properties;
import java.io.*;
import java.net.URL;

import junit.framework.TestCase;

/*
* Loads test page recommended.html and verifies that the recommended
* meta tag has recommended-content as its value.
*
*/
/**
 * Loads the test page recommended.html, parses it with the "parse-html"
 * extension and verifies that the "recommended" content metadata equals
 * "recommended-content".
 */
public class TestRecommendedParser extends TestCase {

  /**
   * Directory holding the HTML fixture. It can be overridden with the
   * {@code test.data} system property; the original hard-coded location is
   * kept as the fallback so existing setups keep working.
   */
  private static final File testDir =
      new File(System.getProperty("test.data",
          "H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data"));

  /** Runs the page check against the bundled fixture. */
  public void testPages() throws Exception {
    pageTest(new File(testDir, "recommended.html"), "http://foo.com/",
             "recommended-content");
  }

  /**
   * Parses {@code file} as HTML fetched from {@code url} and asserts on the
   * resulting "recommended" metadata value.
   *
   * @param file           HTML file to parse
   * @param url            URL to associate with the content
   * @param recommendation expected value of the "recommended" metadata
   * @throws Exception if the file cannot be read or parsing fails
   */
  public void pageTest(File file, String url, String recommendation)
      throws Exception {

    String contentType = "text/html";

    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
    InputStream in = new FileInputStream(file);
    try {
      byte[] buffer = new byte[1024];
      int i;
      while ((i = in.read(buffer)) != -1) {
        out.write(buffer, 0, i);
      }
    } finally {
      // Close the stream even when reading fails.
      in.close();
    }
    byte[] bytes = out.toByteArray();
    Configuration conf = NutchConfiguration.create();

    Content content =
        new Content(url, url, bytes, contentType, new Metadata(), conf);

    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html", content)
        .get(content.getUrl());
    assertNotNull("parse-html produced no parse for " + url, parse);

    Metadata metadata = parse.getData().getContentMeta();

    assertEquals(recommendation, metadata.get("recommended"));
    // Compare by value; the former reference comparison (!=) was always true.
    assertFalse("somesillycontent".equals(metadata.get("recommended")));
  }

}

2.3 用junit运行TestRecommendedParser.java测试。

本文来自CSDN博客，转载请标明出处：http://blog.csdn.net/laigood12345/archive/2010/10/09/5929388.aspx

更多实例：http://www.lsoba.cn

分享到：