heritrix爬取友人网（http://mobile.younet.com/）信息后遇到的问题

0 0

heritrix爬取友人网（http://mobile.younet.com/）信息后遇到的问题15

最近我用heritrix爬取了http://mobile.younet.com/网站的产品页面。之后运行带有main函数的Extractor类，控制台输出的信息与《开发自己的搜索引擎》一书中给出的效果一致，但在我指定的目标目录下并没有生成相应的txt文档和图片。我是初学者，实在找不出原因，所以把用到的两个类Extractor和ExtractYounetMobile贴出来，希望大家帮我看看问题出在哪里。

我把抓取到的mirror和精简后的mySearch打包上传了，文件名为mySearchAndData。

# package com.backSearch.extractor;
#
# import java.io.File;
# import java.io.FileInputStream;
# import java.io.FileOutputStream;
# import java.util.regex.Matcher;
# import java.util.regex.Pattern;
#
# import org.htmlparser.Parser;
#
# import com.backSearch.extractor.younet.ExtractYounetMobile;
#
# /**
#  * Base class for extractors that turn locally mirrored HTML pages into result files.
#  *
#  * <p>A subclass implements {@link #extract()} for one site's page layout. This class holds
#  * the configured directories, loads each page into an HTMLParser {@link Parser}, copies
#  * images out of the mirror directory and provides the recursive directory walk used by
#  * {@link #main(String[])}.
#  */
# public abstract class Extractor {
#
#     protected static final String NEWLINE = "\r\n";
#
#     /**
#      * Hash algorithm name used for image file names (MD5).
#      */
#     protected static final String HASH_ALGORITHM = "md5";
#
#     /**
#      * Separator line written into result files.
#      */
#     public static final String SEPARATOR = "======================";
#
#     /**
#      * Placeholder image that is copied when the requested image is not in the mirror.
#      */
#     private static final String FALLBACK_IMAGE = "f:\\sousuo\\noimage.jpg";
#
#     /**
#      * Scheme prefix removed from an image URL before it is resolved inside the mirror.
#      */
#     private static final String HTTP_PREFIX = "http://";
#
#     /**
#      * Number of pages handed to the extractor by {@link #traverse(Extractor, File)}.
#      */
#     static int count = 0;
#
#     /**
#      * Output path for all result files.
#      */
#     private String outputPath = "";
#
#     /**
#      * Path of the file that is currently being processed.
#      */
#     private String inputFilePath;
#
#     /**
#      * Root directory of the crawled pages (the "mirror" directory written by Heritrix).
#      */
#     private String mirrorDir = "";
#
#     /**
#      * Directory that receives the copied product images.
#      */
#     private String imageDir = "";
#
#     /**
#      * HTMLParser instance for the current page; null when the last load failed.
#      */
#     private Parser parser;
#
#     /**
#      * Loads the page at {@code path} and makes it the current page.
#      *
#      * <p>The page is parsed with the GB2312 encoding. If loading fails the stack trace is
#      * printed and {@link #getParser()} returns null afterwards.
#      *
#      * @param path path of the HTML file to load
#      */
#     public void loadFile(String path) {
#         // Drop the previous page first: otherwise a failed load would leave the old parser
#         // in place and extract() would silently process the previous page a second time.
#         parser = null;
#         inputFilePath = path;
#         try {
#             Parser loaded = new Parser(path);
#             loaded.setEncoding("GB2312");
#             parser = loaded;
#         } catch (Exception e) {
#             e.printStackTrace();
#         }
#     }
#
#     /**
#      * Returns the output path.
#      */
#     public String getOutputPath() {
#         return outputPath;
#     }
#
#     /**
#      * Sets the output path; normally done right after the extractor is created.
#      */
#     public void setOutputPath(String outputPath) {
#         this.outputPath = outputPath;
#     }
#
#     /**
#      * Returns the parser for the current page, or null if the last load failed.
#      */
#     public Parser getParser() {
#         return parser;
#     }
#
#     /**
#      * Returns one capture group of the first regular-expression match in a string.
#      *
#      * @param pattern regular expression to search for
#      * @param match   text that is searched
#      * @param index   capture group to return
#      * @return the requested group of the first match, or null when nothing matches
#      */
#     protected String getProp(String pattern, String match, int index) {
#         Matcher matcher = Pattern.compile(pattern).matcher(match);
#         return matcher.find() ? matcher.group(index) : null;
#     }
#
#     /**
#      * Processes the page loaded by {@link #loadFile(String)}; implemented per site.
#      */
#     public abstract void extract();
#
#     /**
#      * Returns the path of the file that is currently being processed.
#      */
#     public String getInputFilePath() {
#         return inputFilePath;
#     }
#
#     /**
#      * Copies an image from the mirror directory into the image directory.
#      *
#      * <p>A leading {@code http://} is removed from {@code image_url} and the remainder is
#      * resolved against the mirror directory. If no file exists there, the placeholder image
#      * is copied instead. The image directory is created when it is missing.
#      *
#      * @param image_url      URL of the image as found in the page
#      * @param new_image_file file name to create inside the image directory
#      * @return true if the copy succeeded, false otherwise (the cause is printed)
#      */
#     protected boolean copyImage(String image_url, String new_image_file) {
#         if (image_url == null || new_image_file == null) {
#             return false;
#         }
#
#         // Only strip the scheme when it is really there; cutting seven characters blindly
#         // mangles relative URLs and throws for URLs shorter than the prefix.
#         String relative = image_url.trim();
#         if (relative.regionMatches(true, 0, HTTP_PREFIX, 0, HTTP_PREFIX.length())) {
#             relative = relative.substring(HTTP_PREFIX.length());
#         }
#
#         FileInputStream in = null;
#         FileOutputStream out = null;
#         try {
#             File source = new File(new File(mirrorDir), relative);
#             if (!source.isFile()) {
#                 source = new File(FALLBACK_IMAGE);
#             }
#
#             File target = new File(new File(imageDir), new_image_file);
#             File targetDir = target.getParentFile();
#             if (targetDir != null && !targetDir.exists()) {
#                 // A failure here surfaces below when the output stream is opened.
#                 targetDir.mkdirs();
#             }
#
#             in = new FileInputStream(source);
#             out = new FileOutputStream(target);
#
#             byte[] buffer = new byte[1024];
#             int read;
#             while ((read = in.read(buffer)) != -1) {
#                 out.write(buffer, 0, read);
#             }
#             return true;
#         } catch (Exception e) {
#             e.printStackTrace();
#             return false;
#         } finally {
#             // Close in every case so a failed copy does not leak file handles.
#             closeQuietly(in);
#             closeQuietly(out);
#         }
#     }
#
#     /**
#      * Closes a stream, printing instead of propagating any failure.
#      */
#     private static void closeQuietly(java.io.Closeable stream) {
#         if (stream == null) {
#             return;
#         }
#         try {
#             stream.close();
#         } catch (java.io.IOException e) {
#             e.printStackTrace();
#         }
#     }
#
#     public String getImageDir() {
#         return imageDir;
#     }
#
#     public void setImageDir(String imageDir) {
#         this.imageDir = imageDir;
#     }
#
#     public String getMirrorDir() {
#         return mirrorDir;
#     }
#
#     public void setMirrorDir(String mirrorDir) {
#         this.mirrorDir = mirrorDir;
#     }
#
#     public void setInputFilePath(String inputFilePath) {
#         this.inputFilePath = inputFilePath;
#     }
#
#     /**
#      * Runs the younet extractor over the hard-coded mirror directory and prints the number
#      * of pages that were processed.
#      */
#     public static void main(String[] args) throws Exception {
#
#         Extractor extractor = new ExtractYounetMobile();
#         extractor.setOutputPath("F:\\product\\mobile\\");
#         extractor.setImageDir("F:\\product\\image\\");
#         extractor.setMirrorDir("F:\\YounetMobile-20100514064948846\\mirror\\");
#
#         // Result files can only be created inside an existing directory.
#         File outputDir = new File(extractor.getOutputPath());
#         if (!outputDir.exists() && !outputDir.mkdirs()) {
#             System.err.println("Cannot create output directory: " + outputDir);
#         }
#
#         File root = new File("F:\\YounetMobile-20100514064948846\\mirror\\mobile.younet.com\\files\\");
#         if (!root.exists()) {
#             // traverse() would otherwise finish silently with a count of 0.
#             System.err.println("Mirror directory not found: " + root);
#         }
#
#         traverse(extractor, root);
#         System.out.println("总数" + count);
#     }
#
#     /**
#      * Walks {@code path} recursively and runs the extractor on every matching page.
#      *
#      * <p>A file is processed when its absolute path ends with {@code .html} and contains no
#      * underscore. The underscore test applies to the whole absolute path, directory names
#      * included.
#      *
#      * @param extractor extractor that loads and processes each page
#      * @param path      file or directory to start from; null is ignored
#      */
#     public static void traverse(Extractor extractor, File path)
#             throws Exception {
#         if (path == null) {
#             return;
#         }
#
#         if (path.isDirectory()) {
#             String[] entries = path.list();
#             if (entries == null) {
#                 // File.list() returns null when the directory cannot be read.
#                 System.err.println("Cannot list directory: " + path);
#                 return;
#             }
#             for (int i = 0; i < entries.length; i++) {
#                 traverse(extractor, new File(path, entries[i]));
#             }
#             return;
#         }
#
#         String absolutePath = path.getAbsolutePath();
#         if (absolutePath.endsWith(".html") && absolutePath.indexOf("_") == -1) {
#             System.out.println(path);
#             count++;
#             extractor.loadFile(absolutePath);
#             extractor.extract();
#         }
#     }
#
# }
# package com.backSearch.extractor.younet;
#
# import java.io.BufferedWriter;
# import java.io.File;
# import java.io.FileWriter;
# import java.io.IOException;
# import java.util.Date;
#
# import org.htmlparser.Node;
# import org.htmlparser.NodeFilter;
# import org.htmlparser.Parser;
# import org.htmlparser.filters.AndFilter;
# import org.htmlparser.filters.HasAttributeFilter;
# import org.htmlparser.filters.HasChildFilter;
# import org.htmlparser.filters.TagNameFilter;
# import org.htmlparser.tags.ImageTag;
# import org.htmlparser.util.NodeIterator;
# import org.htmlparser.util.NodeList;
#
# import com.backSearch.extractor.Extractor;
# import com.backSearch.util.StringUtils;
#
#
# public class ExtractYounetMobile extends Extractor {
#
#     @Override
#     public void extract() {
#         BufferedWriter bw = null;
#         NodeFilter title_filter = new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class", "mo_tit"));
#         NodeFilter attribute_filter = new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"))));
#         NodeFilter img_filter = new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
#
#         //提取标题信息
#         try {
#             //Parser根据过滤器返回所有满足过滤条件的节点
#             // 迭代逐渐查找
#             NodeList nodeList=this.getParser().parse(title_filter);
#             NodeIterator it = nodeList.elements();
#             StringBuffer title = new StringBuffer();
#             while (it.hasMoreNodes()) {
#                 Node node = (Node) it.nextNode();
#                 String[] names = node.toPlainTextString().split(" ");
#                 for(int i = 0; i < names.length; i++)
#                     title.append(names[i]).append("-");
#                 title.append(new Date().getTime());
#                 //创建要生成的文件
#                 bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
#                 //获取当前提取页的完整URL地址
#                 int startPos = this.getInputFilePath().indexOf("mirror") + 6;
#                 String url_seg = this.getInputFilePath().substring(startPos);
#                 url_seg = url_seg.replaceAll("\\\\", "/");
#                 String url = "http:/" + url_seg;
#                 //写入当前提取页的完整URL 地址
#                 bw.write(url + NEWLINE);
#                 bw.write(names[0] + NEWLINE);
#                 bw.write(names[1] + NEWLINE);
#
#             }
#             // 重置Parser
#             this.getParser().reset();
#             Parser attNameParser = null;
#             Parser attValueParser = null;
#             //Parser parser=new Parser("http://www.sina.com.cn");
#             NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp1 blue1"));
#             NodeFilter attributeValue_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class", "gn_sp2"));
#             String attName = "";
#             String attValue = "";
#             // 迭代逐渐查找
#             nodeList=this.getParser().parse(attribute_filter);
#             it = nodeList.elements();
#             while (it.hasMoreNodes()) {
#                 Node node = (Node) it.nextNode();
#                 attNameParser = new Parser();
#                 attNameParser.setEncoding("UTF-8");
#                 attNameParser.setInputHTML(node.toHtml());
#                 NodeList attNameNodeList = attNameParser.parse(attributeName_filter);
#                 attName = attNameNodeList.elements().nextNode().toPlainTextString();
#
#                 attValueParser = new Parser();
#                 attValueParser.setEncoding("UTF-8");
#                 attValueParser.setInputHTML(node.toHtml());
#                 NodeList attValueNodeList = attValueParser.parse(attributeValue_filter);
#                 attValue = attValueNodeList.elements().nextNode().toPlainTextString();
#                 bw.write(attName.trim() + attValue.trim());
#                 bw.newLine();
#             }
#             // 重置Parser
#             this.getParser().reset();
#             String imgUrl = "";
#             String fileType ="";
#             // 迭代逐渐查找
#             nodeList=this.getParser().parse(img_filter);
#             it = nodeList.elements();
#             while (it.hasMoreNodes()) {
#                 Node node = (Node) it.nextNode();
#
#                 ImageTag imgNode = (ImageTag)node.getChildren().elements().nextNode();
#                 imgUrl = imgNode.getAttribute("src");
#                 fileType = imgUrl.trim().substring(imgUrl
#                         .lastIndexOf(".") + 1);
#                 //生成新的图片的文件名
#                 String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
#                 //imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
#                 //利用miorr目录下的图片生成的新的图片
#                 this.copyImage(imgUrl, new_iamge_file);
#                 bw.write(SEPARATOR + NEWLINE);
#                 bw.write(new_iamge_file + NEWLINE);
#             }
#
#
#         } catch(Exception e) {
#             e.printStackTrace();
#         } finally {
#             try{
#                 if (bw != null)
#                     bw.close();
#             }catch(IOException e){
#                 e.printStackTrace();
#             }
#         }
#
#     }
# }

编程综合