使用Java调用百度搜索（转）

wbj0110

浏览: 1638515 次
性别:
来自: 上海

最近访客更多访客>>

一往无前bhz

ninja2006

loginboot

u012363178

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

Java
搜索引擎
爬虫

使用Java调用百度搜索 Java 搜索引擎爬虫

search-demo托管于github

search-demo演示了如何利用Java来调用百度搜索和谷歌搜索，更多细节请到github上查看search-demo

自己没有搜索引擎，又想要大规模的数据源，怎么办？可以对百度搜索和谷歌搜索善加利用，以小搏大，站在巨人的肩膀上。有很多应用场景可以很巧妙地借助百度搜索和谷歌搜索来实现，比如网站的新闻采集、技术和品牌的新闻跟踪、知识库的收集、人机问答系统等。我之前做的一个准确率达百分之九十几的人机问答系统，其数据源的一部分就充分利用了百度搜索和谷歌搜索。在此处演示的技术的基础上，可以很容易地扩展到其他搜索引擎；也可以借鉴这里使用的NekoHTML+XPath或JSoup+CSSPath技术，轻松获取页面中自定义的内容。

实现方式一：NekoHTML+XPath

Java代码 


 
package org.apdplat.demo.search;  
  
import java.io.BufferedReader;  
import java.io.ByteArrayInputStream;  
import java.io.IOException;  
import java.io.InputStream;  
import java.io.InputStreamReader;  
import java.io.UnsupportedEncodingException;  
import java.net.URL;  
import java.util.ArrayList;  
import java.util.HashMap;  
import java.util.List;  
import java.util.Map;  
  
import org.cyberneko.html.parsers.DOMParser;  
import org.slf4j.Logger;  
import org.slf4j.LoggerFactory;  
import org.w3c.dom.Document;  
import org.w3c.dom.Node;  
import org.w3c.dom.NodeList;  
import org.xml.sax.InputSource;  
  
import com.sun.org.apache.xpath.internal.XPathAPI;  
import javax.xml.transform.TransformerException;  
import org.w3c.dom.DOMException;  
import org.xml.sax.SAXException;  
  
public class NekoHTMLBaiduSearcher implements Searcher{  
    private static final Logger LOG = LoggerFactory.getLogger(NekoHTMLBaiduSearcher.class);  
  
    public List<String> parse(String url, String xpathExpression) {  
        InputStream in = null;  
        try {  
            in = new URL(url).openStream();  
            return parse(in, xpathExpression);  
        } catch (Exception e) {  
            LOG.error("错误", e);  
        } finally {  
            if (in != null) {  
                try {  
                    in.close();  
                } catch (IOException e) {  
                    LOG.error("错误", e);  
                }  
            }  
        }  
        return null;  
    }  
  
    public List<String> parse(InputStream in, String xpathExpression) {  
        return parse(in, xpathExpression, "UTF-8");  
    }  
  
    public List<Map<String, String>> parseMore(InputStream in, String xpathExpression) {  
        return parseMore(in, xpathExpression, "UTF-8");  
    }  
  
    public List<Map<String, String>> parseMore(InputStream in, String xpathExpression, String encoding) {  
        DOMParser parser = new DOMParser();  
        List<Map<String, String>> list = new ArrayList<>();  
        try {  
            // 设置网页的默认编码  
            parser.setProperty(  
                    "http://cyberneko.org/html/properties/default-encoding",  
                    encoding);  
            /* 
             * The Xerces HTML DOM implementation does not support namespaces 
             * and cannot represent XHTML documents with namespace information. 
             * Therefore, in order to use the default HTML DOM implementation 
             * with NekoHTML's DOMParser to parse XHTML documents, you must turn 
             * off namespace processing. 
             */  
            parser.setFeature("http://xml.org/sax/features/namespaces", false);  
            parser.parse(new InputSource(new BufferedReader(new InputStreamReader(in, encoding))));  
            Document doc = parser.getDocument();  
            NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());  
            for (int i = 0; i < products.getLength(); i++) {  
                Node node = products.item(i);  
                String title = node.getTextContent();  
                Map<String, String> map = new HashMap<>();  
                map.put("title", title);  
                try {  
                    String href = node.getAttributes().getNamedItem("href").getTextContent();  
                    map.put("href", href);  
                } catch (Exception e) {  
                    LOG.error("提取链接失败",e);  
                }  
                list.add(map);  
            }  
        } catch (SAXException | IOException | TransformerException | DOMException e) {  
            LOG.error("错误", e);  
        }  
        return list;  
    }  
  
    public List<String> parse(InputStream in, String xpathExpression, String encoding) {  
        DOMParser parser = new DOMParser();  
        List<String> list = new ArrayList<>();  
        try {  
            // 设置网页的默认编码  
            parser.setProperty(  
                    "http://cyberneko.org/html/properties/default-encoding",  
                    encoding);  
            /* 
             * The Xerces HTML DOM implementation does not support namespaces 
             * and cannot represent XHTML documents with namespace information. 
             * Therefore, in order to use the default HTML DOM implementation 
             * with NekoHTML's DOMParser to parse XHTML documents, you must turn 
             * off namespace processing. 
             */  
            parser.setFeature("http://xml.org/sax/features/namespaces", false);  
            parser.parse(new InputSource(new BufferedReader(new InputStreamReader(in, encoding))));  
            Document doc = parser.getDocument();  
            NodeList products = XPathAPI.selectNodeList(doc, xpathExpression.toUpperCase());  
            for (int i = 0; i < products.getLength(); i++) {  
                Node node = products.item(i);  
                list.add(node.getTextContent());  
            }  
        } catch (SAXException | IOException | TransformerException | DOMException e) {  
            LOG.error("错误", e);  
        }  
        return list;  
    }  
  
    @Override  
    public List<Webpage> search(String url) {  
        InputStream in = null;  
        try {  
            in = new URL(url).openStream();  
            return search(in);  
        } catch (Exception e) {  
            LOG.error("错误", e);  
        } finally {  
            if (in != null) {  
                try {  
                    in.close();  
                } catch (IOException e) {  
                    LOG.error("错误", e);  
                }  
            }  
        }  
        return null;  
    }  
  
    public List<Webpage> search(InputStream in) {  
        //保证只读一次  
        byte[] datas = Tools.readAll(in);  
        if (LOG.isDebugEnabled()) {  
            try {  
                LOG.debug("内容：" + new String(datas, "UTF-8"));  
            } catch (UnsupportedEncodingException e) {  
                LOG.error("错误", e);  
            }  
        }  
  
        in = new ByteArrayInputStream(datas);  
  
        String totalXpathExpression = "//html/body/div/div/div/div[3]/p/span";  
        List<String> totals = parse(in, totalXpathExpression);  
        int total;  
        int len = 10;  
        if (totals != null && totals.size() == 1) {  
            String str = totals.get(0);  
            int start = 10;  
            if (str.indexOf("约") != -1) {  
                start = 11;  
            }  
            total = Integer.parseInt(str.substring(start).replace(",", "").replace("个", ""));  
            LOG.info("搜索结果数：" + total);  
        } else {  
            return null;  
        }  
        if (total < 1) {  
            return null;  
        }  
        if (total < 10) {  
            len = total;  
        }  
        List<Webpage> webpages = new ArrayList<>();  
        for (int i = 0; i < len; i++) {  
            String content = "";  
            String url = "";  
            String titleXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/h3/a";  
            String summaryXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/div[1]";  
            LOG.debug("titleXpathExpression:" + titleXpathExpression);  
            LOG.debug("summaryXpathExpression:" + summaryXpathExpression);  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<String> titles = parse(in, titleXpathExpression);  
  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<Map<String, String>> titleWithHrefs = parseMore(in, titleXpathExpression);  
            for (Map<String, String> titleWithHref : titleWithHrefs) {  
                String title = titleWithHref.get("title");  
                String href = titleWithHref.get("href");  
                LOG.debug(title + " " + titleWithHref.get("href"));  
                if (href != null) {  
                    content = Tools.getHTMLContent(href);  
                    url = href;  
                } else {  
                    LOG.info("页面正确提取失败");  
                }  
            }  
  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<String> summaries = parse(in, summaryXpathExpression);  
            //处理百度知道1  
            if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {  
                //重新构造输入流  
                in = new ByteArrayInputStream(datas);  
                String baiduZhidao1XpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]/div/div/p[2]";  
                LOG.debug("baiduZhidao1XpathExpression:" + baiduZhidao1XpathExpression);  
                summaries = parse(in, baiduZhidao1XpathExpression);  
            }  
            //处理百度知道2  
            if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {  
                //重新构造输入流  
                in = new ByteArrayInputStream(datas);  
                String baiduZhidao2XpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[2]";  
                LOG.debug("baiduZhidao2XpathExpression:" + baiduZhidao2XpathExpression);  
                summaries = parse(in, baiduZhidao2XpathExpression);  
            }  
            //处理百度文库  
            if (titles != null && titles.size() == 1 && (summaries == null || summaries.isEmpty())) {  
                //重新构造输入流  
                in = new ByteArrayInputStream(datas);  
                String baiduWenkuXpathExpression = "//html/body/div/div/div/div[3]/div[2]/table[" + (i + 1) + "]/tbody/tr/td/font[1]";  
                LOG.debug("baiduWenkuXpathExpression:" + baiduWenkuXpathExpression);  
                summaries = parse(in, baiduWenkuXpathExpression);  
            }  
  
            if (titles != null && titles.size() == 1 && summaries != null && summaries.size() == 1) {  
                Webpage webpage = new Webpage();  
                webpage.setTitle(titles.get(0));  
                webpage.setUrl(url);  
                webpage.setSummary(summaries.get(0));  
                webpage.setContent(content);  
                webpages.add(webpage);  
            } else {  
                LOG.error("获取搜索结果列表项出错:" + titles + " - " + summaries);  
            }  
        }  
        if(webpages.size() < 10){              
            //处理百度百科  
            String titleXpathExpression = "//html/body/div/div/div/div[3]/div[2]/div/h3/a";  
            String summaryXpathExpression = "//html/body/div/div/div/div[3]/div[2]/div/div/p";  
            LOG.debug("处理百度百科 titleXpathExpression:" + titleXpathExpression);  
            LOG.debug("处理百度百科 summaryXpathExpression:" + summaryXpathExpression);  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<String> titles = parse(in, titleXpathExpression);  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<Map<String, String>> titleWithHrefs = parseMore(in, titleXpathExpression);  
            String content = "";  
            String url = "";  
            for (Map<String, String> titleWithHref : titleWithHrefs) {  
                String title = titleWithHref.get("title");  
                String href = titleWithHref.get("href");  
                LOG.debug(title + " " + titleWithHref.get("href"));  
                if (href != null) {  
                    content = Tools.getHTMLContent(href);  
                    url = href;  
                } else {  
                    LOG.info("页面正确提取失败");  
                }  
            }  
            //重新构造输入流  
            in = new ByteArrayInputStream(datas);  
            List<String> summaries = parse(in, summaryXpathExpression);  
            if (titles != null && titles.size() == 1 && summaries != null && summaries.size() == 1) {  
                Webpage webpage = new Webpage();  
                webpage.setTitle(titles.get(0));  
                webpage.setUrl(url);  
                webpage.setSummary(summaries.get(0));  
                webpage.setContent(content);  
                webpages.add(webpage);  
            } else {  
                LOG.error("获取搜索结果列表项出错:" + titles + " - " + summaries);  
            }  
        }  
        if (webpages.isEmpty()) {  
            return null;  
        }  
        return webpages;  
    }  
  
    public static void main(String[] args) {  
        String url = "http://www.baidu.com/s?pn=0&wd=杨尚川";  
          
        Searcher searcher = new NekoHTMLBaiduSearcher();  
        List<Webpage> webpages = searcher.search(url);  
        if (webpages != null) {  
            int i = 1;  
            for (Webpage webpage : webpages) {  
                LOG.info("搜索结果 " + (i++) + " ：");  
                LOG.info("标题：" + webpage.getTitle());  
                LOG.info("URL：" + webpage.getUrl());  
                LOG.info("摘要：" + webpage.getSummary());  
                LOG.info("正文：" + webpage.getContent());  
                LOG.info("");  
            }  
        } else {  
            LOG.error("没有搜索到结果");  
        }  
    }  
}  

实现方式二：JSoup+CSSPath

Java代码 


 
package org.apdplat.demo.search;  
  
import java.io.IOException;  
import java.util.ArrayList;  
import java.util.List;  
import org.jsoup.Jsoup;  
import org.jsoup.nodes.Document;  
import org.jsoup.nodes.Element;  
  
import org.slf4j.Logger;  
import org.slf4j.LoggerFactory;  
  
public class JSoupBaiduSearcher implements Searcher{  
    private static final Logger LOG = LoggerFactory.getLogger(JSoupBaiduSearcher.class);  
  
    @Override  
    public List<Webpage> search(String url) {  
        List<Webpage> webpages = new ArrayList<>();  
        try {  
            Document document = Jsoup.connect(url).get();  
            String cssQuery = "html body div#out div#in div#wrapper div#container.container_s p#page span.nums";  
            LOG.debug("total cssQuery: " + cssQuery);  
            Element totalElement = document.select(cssQuery).first();  
            String totalText = totalElement.text();   
            LOG.info("搜索结果：" + totalText);  
            int start = 10;  
            if (totalText.indexOf("约") != -1) {  
                start = 11;  
            }  
            int total = Integer.parseInt(totalText.substring(start).replace(",", "").replace("个", ""));  
            LOG.info("搜索结果数：" + total);  
            int len = 10;  
            if (total < 1) {  
                return null;  
            }  
            if (total < 10) {  
                len = total;  
            }  
            for (int i = 0; i < len; i++) {  
                String titleCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left table#" + (i + 1) + ".result tbody tr td.c-default h3.t a";  
                String summaryCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left table#" + (i + 1) + ".result tbody tr td.c-default div.c-abstract";  
                LOG.debug("titleCssQuery:" + titleCssQuery);  
                LOG.debug("summaryCssQuery:" + summaryCssQuery);  
                Element titleElement = document.select(titleCssQuery).first();  
                String href = "";  
                String titleText = "";  
                if(titleElement != null){  
                    titleText = titleElement.text();  
                    href = titleElement.attr("href");  
                }else{  
                    //处理百度百科  
                    titleCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left div#1.result-op h3.t a";  
                    summaryCssQuery = "html body div#out div#in div#wrapper div#container.container_s div#content_left div#1.result-op div p";  
                    LOG.debug("处理百度百科 titleCssQuery:" + titleCssQuery);  
                    LOG.debug("处理百度百科 summaryCssQuery:" + summaryCssQuery);  
                    titleElement = document.select(titleCssQuery).first();  
                    if(titleElement != null){  
                        titleText = titleElement.text();  
                        href = titleElement.attr("href");  
                    }  
                }  
                LOG.debug(titleText);  
                Element summaryElement = document.select(summaryCssQuery).first();  
                //处理百度知道  
                if(summaryElement == null){  
                    summaryCssQuery = summaryCssQuery.replace("div.c-abstract","font");  
                    LOG.debug("处理百度知道 summaryCssQuery:" + summaryCssQuery);  
                    summaryElement = document.select(summaryCssQuery).first();  
                }  
                String summaryText = "";  
                if(summaryElement != null){  
                    summaryText = summaryElement.text();   
                }  
                LOG.debug(summaryText);                  
                  
                if (titleText != null && !"".equals(titleText.trim()) && summaryText != null && !"".equals(summaryText.trim())) {  
                    Webpage webpage = new Webpage();  
                    webpage.setTitle(titleText);  
                    webpage.setUrl(href);  
                    webpage.setSummary(summaryText);  
                    if (href != null) {  
                        String content = Tools.getHTMLContent(href);  
                        webpage.setContent(content);  
                    } else {  
                        LOG.info("页面正确提取失败");  
                    }  
                    webpages.add(webpage);  
                } else {  
                    LOG.error("获取搜索结果列表项出错:" + titleText + " - " + summaryText);  
                }  
            }  
              
              
        } catch (IOException ex) {  
            LOG.error("搜索出错",ex);  
        }  
        return webpages;  
    }  
  
    public static void main(String[] args) {  
        String url = "http://www.baidu.com/s?pn=0&wd=杨尚川";  
          
        Searcher searcher = new JSoupBaiduSearcher();  
        List<Webpage> webpages = searcher.search(url);  
        if (webpages != null) {  
            int i = 1;  
            for (Webpage webpage : webpages) {  
                LOG.info("搜索结果 " + (i++) + " ：");  
                LOG.info("标题：" + webpage.getTitle());  
                LOG.info("URL：" + webpage.getUrl());  
                LOG.info("摘要：" + webpage.getSummary());  
                LOG.info("正文：" + webpage.getContent());  
                LOG.info("");  
            }  
        } else {  
            LOG.error("没有搜索到结果");  
        }  
    }  
}  

search-demo.rar (9.5 KB)
描述: 2013-10-18
下载次数: 40

search-demo-master-2013-10-23.zip (13.4 KB)
描述: 2013-10-23
下载次数: 5

分享到：

Gora – 大数据持久化 | Tailrank 网站架构（转）

2014-03-17 17:27
浏览 1088
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用Java调用百度搜索（转）

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

使用Java调用百度搜索（转）

评论

发表评论

相关推荐

RESTful API 设计

ConcurrentModificationException and a HashMap

Java Class卸载与ClassLoader ,class热替换

java:找出占用CPU资源最多的那个线程(HOW TO)

(转)一次让人难以忘怀的排查频繁Full GC过程

(转)关于施用full gc频繁的分析及解决

(转)How to Monitor Java Garbage Collection

(转)Understanding Java Garbage Collection

(转)How to Tune Java Garbage Collection

高并发---限流

java jvm 参数 -Xms -Xmx -Xmn -Xss 调优总结

Java注解与拦截器

ExecutorCompletionService

java获得CPU使用率，内存使用率

CountDownLatch

Cron 表达式

maven 刷新

java多线程总结五：线程池的原理及实现

BlockingQueue

Java多线程-新特征-信号量Semaphore

最近访客更多访客>>