`
iluoxuan
  • 浏览: 581971 次
  • 性别: Icon_minigender_1
  • 来自: 北京
社区版块
存档分类
最新评论

定向网站爬虫---初级例子

 
阅读更多

1:url处理和html解析

 

package com.xiaoshuo.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xiaoshuo.to.Chapter;
import com.xiaoshuo.to.UrlTO;

/**
 * Downloads pages over HTTP and extracts crawl targets from them with jsoup.
 *
 * <p>Every extraction method fetches one page and returns the links it finds
 * through fixed CSS selectors, wrapped in {@link UrlTO} objects that carry the
 * depth value of the next crawl level.
 *
 * @author lijunqing
 */
public class PaserUrlUtil {

    /** Charset used to decode a response that does not declare one itself. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Pause before every page download, in milliseconds. */
    private static final long FETCH_DELAY_MILLIS = 5000 * 2;

    private final HttpClient httpClient = new DefaultHttpClient();

    /**
     * Downloads a page and returns its body as a string.
     *
     * @param url address of the page to download
     * @return the page body, or {@code null} when the response has no entity
     *         or the download failed (the failure is printed to stderr)
     * @throws Exception declared for callers; download failures are caught here
     */
    public String getHtmlStr(String url) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        String htmlStr = null;
        try {
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // Decode exactly once: honour the charset declared by the response and
                // fall back to GBK only when none is declared. Decoding first and then
                // round-tripping through ISO-8859-1 destroys every non-Latin character
                // as soon as the server does declare a charset.
                htmlStr = EntityUtils.toString(entity, DEFAULT_CHARSET);
            }
        } catch (Exception e) {
            // Best effort: report the failure and signal it to the caller with null.
            e.printStackTrace();
        }
        return htmlStr;
    }

    /**
     * Waits for the fixed fetch delay, downloads the page and parses it.
     *
     * @param url address of the page to download
     * @return the parsed document
     * @throws IllegalStateException if no HTML could be downloaded from {@code url}
     * @throws Exception if the wait is interrupted
     */
    public Document getDocument(String url) throws Exception {
        Thread.sleep(FETCH_DELAY_MILLIS);
        String html = getHtmlStr(url);
        if (html == null) {
            // Fail with a clear message instead of handing null to the parser.
            throw new IllegalStateException("No HTML could be fetched from " + url);
        }
        return Jsoup.parse(html);
    }

    /**
     * Collects the category links of the navigation list ({@code .navlist li a}).
     *
     * @param url address of the page containing the navigation list
     * @return one entry with depth value 1 per link; empty if none is found
     * @throws Exception if the page cannot be loaded
     */
    public List<UrlTO> getCategoryUrls(String url) throws Exception {
        Document doc = getDocument(url);
        List<UrlTO> urlList = new ArrayList<UrlTO>();
        Elements links = doc.select(".navlist").select("li").select("a");
        for (Element link : links) {
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(1);
            urlTO.setUrl(link.attr("href"));
            urlList.add(urlTO);
        }
        return urlList;
    }

    /**
     * Follows the "next page" links of a category and collects every page reached.
     *
     * <p>NOTE(review): the starting {@code categoryUrl} itself is not part of the
     * result; only the pages reached from it are.
     *
     * @param categoryUrl address of the first listing page of a category
     * @return one entry with depth value 2 per following page; empty if there is none
     * @throws Exception if a page cannot be loaded
     */
    public List<UrlTO> getBookUrls(String categoryUrl) throws Exception {
        System.out.println("bookUrls-处理进入 deptvalue-==1-");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        String nextUrl = getNextBookUrl(categoryUrl);
        while (nextUrl != null && !nextUrl.trim().equals("")) {
            System.out.println("bookUrls--" + nextUrl);
            // A fresh object per page: sharing one instance would leave every list
            // entry pointing at the last URL.
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(2);
            urlTO.setUrl(nextUrl);
            urlTOList.add(urlTO);
            nextUrl = getNextBookUrl(nextUrl);
        }
        return urlTOList;
    }

    /**
     * Finds the link that directly follows the {@code <strong>} element inside
     * {@code #pagelink}.
     *
     * @param categoryUrl address of a listing page
     * @return the {@code href} of that link, or {@code null} when the page has none
     * @throws Exception if the page cannot be loaded
     */
    public String getNextBookUrl(String categoryUrl) throws Exception {
        Document doc = getDocument(categoryUrl);
        // select() yields an empty result rather than null, so the absence of a
        // next link shows up as first() == null.
        Element nextLink = doc.select("#pagelink").select("strong +a").first();
        if (nextLink == null) {
            return null;
        }
        return nextLink.attr("href");
    }

    /**
     * Collects the book detail links of one listing page ({@code .grid tr}).
     *
     * <p>NOTE(review): the link is taken from the first {@code <a>} inside the first
     * {@code <td>} of each row - confirm against the site's markup. Rows without such
     * a link (for example header rows) are skipped.
     *
     * @param categoryUrl address of a listing page
     * @return one entry with depth value 3 per row that has a link
     * @throws Exception if the page cannot be loaded
     */
    public List<UrlTO> getDetailUrlList(String categoryUrl) throws Exception {
        Document doc = getDocument(categoryUrl);
        Elements rows = doc.select(".grid").select("tr");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        for (Element row : rows) {
            Element firstCell = row.select("td").first();
            if (firstCell == null) {
                continue;
            }
            Element link = firstCell.select("a").first();
            if (link == null) {
                continue;
            }
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(3);
            urlTO.setUrl(link.attr("href"));
            urlTOList.add(urlTO);
        }
        return urlTOList;
    }

    /**
     * Extracts the link found under {@code #bt_1} on a book detail page.
     *
     * @param detailUrl address of a book detail page
     * @return an entry with depth value 4, or {@code null} when the page has no such link
     * @throws Exception if the page cannot be loaded
     */
    public UrlTO getToReadUrl(String detailUrl) throws Exception {
        Document doc = getDocument(detailUrl);
        Element link = doc.select("#bt_1").select("a").first();
        if (link == null) {
            return null;
        }
        UrlTO urlTO = new UrlTO();
        urlTO.setDeptValue(4);
        urlTO.setUrl(link.attr("href"));
        return urlTO;
    }

    /**
     * Collects the chapter links of a chapter index page ({@code .list dd a}).
     *
     * @param detailUrl address of the chapter index page; each link's {@code href}
     *                  is appended to it to form the chapter address
     * @return one entry with depth value 5 per chapter link; empty if none is found
     * @throws Exception if the page cannot be loaded
     */
    public List<UrlTO> getChapterList(String detailUrl) throws Exception {
        Document doc = getDocument(detailUrl);
        Elements links = doc.select(".list").select("dd").select("a");
        List<UrlTO> urlList = new ArrayList<UrlTO>();
        for (Element link : links) {
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(5);
            urlTO.setUrl(detailUrl + link.attr("href"));
            urlList.add(urlTO);
        }
        return urlList;
    }

    /**
     * Reads one chapter page.
     *
     * @param chapterUrl address of the chapter page
     * @return a chapter whose name is the text of the {@code h1} elements and whose
     *         content is the text of the {@code .width} elements
     * @throws Exception if the page cannot be loaded
     */
    public Chapter getChapter(String chapterUrl) throws Exception {
        Document doc = getDocument(chapterUrl);
        Chapter chapter = new Chapter();
        String name = doc.select("h1").text();
        String content = doc.select(".width").text();
        chapter.setName(name);
        chapter.setContent(content);
        return chapter;
    }

}

 2:url实体类

package com.xiaoshuo.to;

/**
 * Holds one URL together with the depth value assigned to it.
 *
 * @author lijunqing
 */
public class UrlTO {

    private String url;

    private Integer deptValue;

    /** Returns the depth value, or {@code null} if none has been set. */
    public Integer getDeptValue() {
        return this.deptValue;
    }

    /** Stores the depth value. */
    public void setDeptValue(Integer deptValue) {
        this.deptValue = deptValue;
    }

    /** Returns the URL, or {@code null} if none has been set. */
    public String getUrl() {
        return this.url;
    }

    /** Stores the URL. */
    public void setUrl(String url) {
        this.url = url;
    }

    /** Renders the entry as {@code dept=<depth>--url--<url>}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("dept=");
        text.append(deptValue).append("--url--").append(url);
        return text.toString();
    }

}

 3:队列类

package com.xiaoshuo.url;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import com.xiaoshuo.to.UrlTO;

/**
 * Keeps track of the URLs already visited and of the entries still waiting to
 * be visited.
 *
 * <p>All state lives in static, unsynchronized collections shared by every caller.
 *
 * @author lijunqing
 */
public class LinkQueue {

    // URLs that have already been visited
    private static final Set<Object> visitedUrl = new HashSet<Object>();

    // Entries that have not been visited yet, in the order they were added
    private static final Queue<Object> unVisitedUrl = new LinkedList<Object>();

    /** Returns the live queue of entries that have not been visited yet. */
    public static Queue<Object> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    /** Removes {@code url} from the visited set. */
    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    /** Removes and returns the oldest waiting entry, or {@code null} if none is waiting. */
    public static Object unVisitedPoll() {
        return unVisitedUrl.poll();
    }

    /** Records {@code url} as visited. */
    public static void addVisitedUrl(String url) {
        System.out.println("已经访问的url--" + url);
        visitedUrl.add(url);
    }

    /**
     * Queues an entry unless it is {@code null}, its URL is {@code null} or blank,
     * or its URL has already been visited.
     *
     * @param url entry to queue
     */
    public static void addUnVisitedUrl(UrlTO url) {
        if (url == null) {
            return;
        }
        String address = url.getUrl();
        // An entry without a URL is ignored instead of failing on trim().
        if (address == null || address.trim().equals("")) {
            return;
        }
        if (visitedUrl.contains(address)) {
            return;
        }
        System.out.println("想队列中添加新的url" + address);
        unVisitedUrl.offer(url);
    }

    /** Returns how many URLs have been recorded as visited. */
    public static Integer getVisitedUrlNum() {
        return visitedUrl.size();
    }

    /** Returns {@code true} when no entry is waiting to be visited. */
    public static boolean unVisitedUrlEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
 

4:crawler爬虫类

package com.xiaoshuo.service;

import java.util.ArrayList;
import java.util.List;

import org.junit.Test;

import com.xiaoshuo.to.UrlTO;
import com.xiaoshuo.url.LinkQueue;
import com.xiaoshuo.util.PaserUrlUtil;

/**
 * Breadth-first crawler: entries are taken from {@link LinkQueue} in the order
 * they were added, and the links found on each page are appended to the queue.
 *
 * @author lijunqing
 */
public class Crawler {

    /** Seed used by {@link #crawlerByBSF()}. */
    private static final String DEFAULT_SEED_URL = "http://www.shuoshuo520.com/";

    PaserUrlUtil paseUrlUtil = new PaserUrlUtil();

    /**
     * Queues the seed URL with depth value 0.
     *
     * @param url seed URL to start crawling from
     */
    public void initCrawlerBySeed(String url) {
        UrlTO urlTO = new UrlTO();
        urlTO.setDeptValue(0);
        urlTO.setUrl(url);
        LinkQueue.addUnVisitedUrl(urlTO);
        System.out.println("UrlTO-----" + urlTO);
    }

    /**
     * Runs the breadth-first crawl from the default seed URL.
     *
     * @throws Exception if a page cannot be processed
     */
    public void crawlerByBSF() throws Exception {
        crawlerByBSF(DEFAULT_SEED_URL);
    }

    /**
     * Runs the breadth-first crawl from the given seed URL until the queue of
     * unvisited entries is empty.
     *
     * @param seedUrl URL to start crawling from
     * @throws Exception if a page cannot be processed
     */
    public void crawlerByBSF(String seedUrl) throws Exception {
        // Put the seed into the queue.
        initCrawlerBySeed(seedUrl);
        System.out.println("feeds-----" + seedUrl);
        while (!LinkQueue.unVisitedUrlEmpty()) {
            UrlTO visitUrl = (UrlTO) LinkQueue.unVisitedPoll();
            if (visitUrl == null) {
                continue;
            }
            Integer deptValue = visitUrl.getDeptValue();
            String nextUrl = visitUrl.getUrl();

            // Mark the URL as visited before processing it.
            LinkQueue.addVisitedUrl(nextUrl);
            System.out.println("正在处理的url实体--deptValue--" + deptValue + "--url--" + nextUrl);

            for (UrlTO urlTO : extractLinks(deptValue, nextUrl)) {
                LinkQueue.addUnVisitedUrl(urlTO);
            }
        }
    }

    /**
     * Picks the extraction that matches the depth value of a page and runs it.
     *
     * @return the entries found for the next level; never {@code null}, and empty
     *         for depth 5, an unknown depth or a missing depth
     */
    private List<UrlTO> extractLinks(Integer deptValue, String url) throws Exception {
        List<UrlTO> found = new ArrayList<UrlTO>();
        if (deptValue == null) {
            // No depth value: nothing to extract, and unboxing it would fail.
            return found;
        }
        switch (deptValue.intValue()) {
            case 0:
                return paseUrlUtil.getCategoryUrls(url);
            case 1:
                return paseUrlUtil.getBookUrls(url);
            case 2:
                return paseUrlUtil.getDetailUrlList(url);
            case 3:
                found.add(paseUrlUtil.getToReadUrl(url));
                return found;
            case 4:
                return paseUrlUtil.getChapterList(url);
            default:
                // Depth 5 is the last level: nothing further is extracted from it.
                return found;
        }
    }
}
 

5:其实原理都差不多,爬虫需要针对目标网站进行定制。我的意图是获取该网站的数据并直接插入到数据库中,然后建立索引,所以我把每个页面的处理结果封装成对象,再插入到数据库中。

6:爬虫的html解析也可以用正则表达式;可以把所有的解析方法合并重写为一个方法,通过配置文件传递表达式或参数,从而实现对其他网站的数据爬取。

分享到:
评论
1 楼 春天好 2016-07-23  
博主写的很好,赞一个,多谢分享 *(^-^*)
分享一个免费好用的云端爬虫开发平台
http://www.shenjianshou.cn/

相关推荐

Global site tag (gtag.js) - Google Analytics