Targeted Site Crawler --- A Basic Example

 
1: URL handling and HTML parsing

 

package com.xiaoshuo.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.xiaoshuo.to.Chapter;
import com.xiaoshuo.to.UrlTO;

/**
 * Fetches pages and parses the HTML into the crawler's value objects
 * @author lijunqing
 *
 */
public class PaserUrlUtil {
    
    private HttpClient httpClient=new DefaultHttpClient();
    
    /**
     * Fetch the HTML of a page as a string
     * @param url
     * @return
     * @throws Exception
     */
    public String getHtmlStr(String url) throws Exception {
        HttpGet httpGet=new HttpGet(url);
        HttpResponse response;
        String htmlStr=null;
        try {
            response=httpClient.execute(httpGet);
            HttpEntity entity=response.getEntity();
            if(entity != null) {
                // decode the body as GBK (the site's encoding) to avoid mojibake;
                // the charset argument is the fallback used when the response
                // declares no charset of its own
                htmlStr=EntityUtils.toString(entity, "gbk");
            }
        } catch(Exception e) {
            e.printStackTrace();
        }
        return htmlStr;
    }
    
    /**
     * Fetch a URL and parse it into a jsoup Document (throttled)
     * @param url
     * @return
     * @throws Exception
     */
    public Document getDocument(String url) throws Exception{
        Thread.sleep(5000*2); // sleep() is static on Thread; pause ten seconds between requests to throttle the crawl
        return Jsoup.parse(getHtmlStr(url));
    }
    
    /**
     * Collect the category URLs from the site navigation
     * @return
     * @throws Exception 
     */
    public List<UrlTO> getCategoryUrls(String url) throws Exception{
        Document doc = getDocument(url);
        List<UrlTO> urlList = new ArrayList<UrlTO>();
        Elements elements = doc.select(".navlist").select("li").select("a");
        String categoryUrl= null;
        UrlTO urlTO=null;
        for(Element element:elements){
            categoryUrl = element.attr("href");
            urlTO = new UrlTO();
            urlTO.setDeptValue(1);
            urlTO.setUrl(categoryUrl);
            urlList.add(urlTO);
        }
        return urlList;
    }
    
    /**
     * Collect every book-listing page URL under a category by following the pagination links
     * @param categoryUrl
     * @return
     * @throws Exception 
     */
    public List<UrlTO> getBookUrls(String categoryUrl) throws Exception{
        System.out.println("getBookUrls -- processing category at deptValue == 1");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        String nextUrl = getNextBookUrl(categoryUrl);
        while(nextUrl != null && !nextUrl.trim().equals("")){
            System.out.println("bookUrls--"+nextUrl);
            // a fresh UrlTO per page; sharing one instance would leave every
            // entry in the list pointing at the last URL
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(2);
            urlTO.setUrl(nextUrl);
            urlTOList.add(urlTO);
            nextUrl = getNextBookUrl(nextUrl);
        }
        return urlTOList;
    }
    
    /**
     * Find the next pagination link on a listing page
     * @param categoryUrl
     * @return
     * @throws Exception
     */
    public String getNextBookUrl(String categoryUrl) throws Exception{
        Document doc = getDocument(categoryUrl);
        Elements elements = doc.select("#pagelink").select("strong +a");
        if(elements == null){
            return null;
        }
        return elements.first().attr("href");
    }
    
    /**
     * Collect the book detail-page URLs from a listing page
     * @param categoryUrl
     * @return
     * @throws Exception
     */
    public List<UrlTO> getDetailUrlList(String categoryUrl) throws Exception{
        Document doc = getDocument(categoryUrl);
        Elements elements = doc.select(".grid").select("tr");
        List<UrlTO> urlTOList = new ArrayList<UrlTO>();
        for(Element element:elements){
            // the link sits inside the first cell; a bare <td> carries no href attribute
            Element link = element.select("td a").first();
            if(link == null){ // skip header rows and empty rows
                continue;
            }
            UrlTO urlTO = new UrlTO(); // fresh instance per row
            urlTO.setDeptValue(3);
            urlTO.setUrl(link.attr("href"));
            urlTOList.add(urlTO);
        }
        return urlTOList;
    }
    
    /**
     * Resolve the "start reading" link on a book's detail page
     * @param detailUrl
     * @return
     * @throws Exception
     */
    public UrlTO getToReadUrl(String detailUrl) throws Exception{
        Document doc = getDocument(detailUrl);
        UrlTO urlTO = new UrlTO();
        String toReadUrl=doc.select("#bt_1").select("a").first().attr("href");
        urlTO.setDeptValue(4);
        urlTO.setUrl(toReadUrl);
        return urlTO;
    }
    
    /**
     * Collect the chapter URLs of a book
     * @param url
     * @return
     * @throws Exception
     */
    public List<UrlTO> getChapterList(String detailUrl) throws Exception {
        Document doc= getDocument(detailUrl);
        Elements elements=doc.select(".list").select("dd").select("a");
        List<UrlTO> urlList=new ArrayList<UrlTO>();
        for(Element element: elements) {
            // chapter hrefs are relative, so prefix them with the book's URL
            String chapterUrl = detailUrl + element.attr("href");
            UrlTO urlTO = new UrlTO(); // fresh instance per chapter
            urlTO.setDeptValue(5);
            urlTO.setUrl(chapterUrl);
            urlList.add(urlTO);
        }
        return urlList;
    }
    
    /**
     * Parse a chapter page into a Chapter object (title and body text)
     * @param chapterUrl
     * @return
     * @throws Exception
     */
    public Chapter getChapter(String chapterUrl) throws Exception {
        Document doc=getDocument(chapterUrl);
        Chapter chapter=new Chapter();
        String name=doc.select("h1").text();
        String content=doc.select(".width").text();
        chapter.setName(name);
        chapter.setContent(content);
        return chapter;
    }
    
}
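
As a quick sanity check, the parser can be exercised on its own. This is a minimal sketch: PaserUrlUtilDemo is a name I'm assuming, and the selectors above only match pages from http://www.shuoshuo520.com/, so the crawl has to start there.

package com.xiaoshuo.util;

import java.util.List;

import com.xiaoshuo.to.UrlTO;

public class PaserUrlUtilDemo {

    public static void main(String[] args) throws Exception {
        PaserUrlUtil util = new PaserUrlUtil();
        // fetch the home page and print the category links (deptValue 1)
        List<UrlTO> categories = util.getCategoryUrls("http://www.shuoshuo520.com/");
        for(UrlTO to: categories) {
            System.out.println(to); // prints lines like dept=1--url--...
        }
    }
}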

2: URL entity class

package com.xiaoshuo.to;

/**
 * Value object holding a URL and its depth in the crawl
 * @author lijunqing
 *
 */
public class UrlTO {

    private Integer deptValue; // depth of this URL in the crawl (0 = seed page)

    private String url;

    public Integer getDeptValue() {
        return deptValue;
    }

    public void setDeptValue(Integer deptValue) {
        this.deptValue=deptValue;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url=url;
    }
    
    @Override
    public String toString(){
       return "dept="+deptValue+"--url--"+url;
    }

}

3: Queue class

package com.xiaoshuo.url;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import com.xiaoshuo.to.UrlTO;

/**
 * Tracks visited URLs and the queue of URLs still to be visited
 * @author lijunqing
 */
public class LinkQueue {

    // URLs that have already been visited
    private static Set<String> visitedUrl=new HashSet<String>();

    // URLs waiting to be visited
    private static Queue<UrlTO> unVisitedUrl=new LinkedList<UrlTO>();

    public static Queue<UrlTO> getUnVisitedUrl() {
        return unVisitedUrl;
    }

    public static void removeVisitedUrl(String url) {
        visitedUrl.remove(url);
    }

    public static UrlTO unVisitedPoll() {
        return unVisitedUrl.poll();
    }
    
    public static void addVisitedUrl(String url){
        System.out.println("已经访问的url--"+url);
        visitedUrl.add(url);
    }

    public static void addUnVisitedUrl(UrlTO url) {
        // only enqueue non-empty URLs that have not been visited yet
        if(url != null && !url.getUrl().trim().equals("") && !visitedUrl.contains(url.getUrl())){
            System.out.println("Adding new URL to the queue: "+url.getUrl());
            unVisitedUrl.offer(url);
        }
    }

    public static Integer getVisitedUrlNum() {
        return visitedUrl.size();
    }

    public static  boolean unVisitedUrlEmpty() {
        return unVisitedUrl.isEmpty();
    }
}
 

4: Crawler class

package com.xiaoshuo.service;

import java.util.ArrayList;
import java.util.List;

import com.xiaoshuo.to.UrlTO;
import com.xiaoshuo.url.LinkQueue;
import com.xiaoshuo.util.PaserUrlUtil;

/**
 * Breadth-first crawler
 * @author lijunqing
 *
 */
public class Crawler {
    
    PaserUrlUtil paseUrlUtil = new PaserUrlUtil();
    
    /**
     * Seed the queue with the start URL
     * @param url
     */
    public void initCrawlerBySeed(String url){
        UrlTO urlTO = new UrlTO();
        urlTO.setDeptValue(0);
        urlTO.setUrl(url);
        LinkQueue.addUnVisitedUrl(urlTO);
        System.out.println("UrlTO-----"+urlTO);
    }
    
    /**
     * Breadth-first crawl of the site
     * @throws Exception
     */
    public void crawlerByBFS() throws Exception{
        // seed URL
        String url = "http://www.shuoshuo520.com/";
        // enqueue the seed
        initCrawlerBySeed(url);
        System.out.println("seed-----"+url);
        while(!LinkQueue.unVisitedUrlEmpty()){
            UrlTO visitUrl = LinkQueue.unVisitedPoll();
            if(visitUrl == null)
                continue;

            List<UrlTO> unVisitUrlList = new ArrayList<UrlTO>();
            Integer deptValue = visitUrl.getDeptValue();
            String nextUrl = visitUrl.getUrl();

            // mark the URL as visited before expanding it
            LinkQueue.addVisitedUrl(nextUrl);
            System.out.println("Processing URL entity -- deptValue="+deptValue+" -- url="+nextUrl);

            if(deptValue == 0){
                unVisitUrlList = paseUrlUtil.getCategoryUrls(nextUrl);
            }else if(deptValue == 1){
                unVisitUrlList = paseUrlUtil.getBookUrls(nextUrl);
            }else if(deptValue == 2){
                unVisitUrlList = paseUrlUtil.getDetailUrlList(nextUrl);
            }else if(deptValue == 3){
                unVisitUrlList.add(paseUrlUtil.getToReadUrl(nextUrl));
            }else if(deptValue == 4){
                unVisitUrlList = paseUrlUtil.getChapterList(nextUrl);
            }else if(deptValue == 5){
                // deepest level: parse the chapter itself; persisting it
                // (e.g. to a database, see point 5 below) would happen here
                paseUrlUtil.getChapter(nextUrl);
            }

            for(UrlTO urlTO: unVisitUrlList){
                LinkQueue.addUnVisitedUrl(urlTO);
            }
        }
    }
}
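
To run the crawl end to end, a plain main method is enough. A minimal sketch (CrawlerMain is an assumed name, not part of the original project; a JUnit @Test method would work just as well):

package com.xiaoshuo.service;

public class CrawlerMain {

    public static void main(String[] args) throws Exception {
        // runs until the unvisited queue is drained; with the ten-second
        // throttle in getDocument this takes a long time on a real site
        new Crawler().crawlerByBFS();
    }
}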
 

5: The underlying principle is always much the same, but a crawler has to be tailored to its target site. My goal here is to take the site's data, insert it straight into a database, and then build an index over it, which is why each processed page is wrapped in an object that can be written to the database.
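
For that persistence step, a plain JDBC insert per parsed chapter would do. The sketch below is an assumption rather than the project's actual code: the chapter table, its columns, the connection settings, and the getName/getContent getters on Chapter are all hypothetical.

package com.xiaoshuo.dao;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

import com.xiaoshuo.to.Chapter;

public class ChapterDao {

    public void save(Chapter chapter) throws Exception {
        // placeholder connection settings; adjust to the real environment
        Connection conn = DriverManager.getConnection(
            "jdbc:mysql://localhost:3306/xiaoshuo?useUnicode=true&characterEncoding=utf8",
            "root", "password");
        try {
            PreparedStatement ps = conn.prepareStatement(
                "INSERT INTO chapter(name, content) VALUES(?, ?)");
            ps.setString(1, chapter.getName());
            ps.setString(2, chapter.getContent());
            ps.executeUpdate();
            ps.close();
        } finally {
            conn.close();
        }
    }
}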

6: The HTML parsing could also be done with regular expressions, and all of the parsing methods above could be folded into one generic method that takes its expressions or parameters from a configuration file, so the same crawler could be pointed at other sites' data.
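
A rough sketch of that idea, using CSS selectors rather than regular expressions (the ConfigurableExtractor class, the properties file, and its dept.N keys are all assumptions): one generic method reads the selector for each depth level from a per-site configuration file, so pointing the crawler at a new site only needs a new config.

package com.xiaoshuo.util;

import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import com.xiaoshuo.to.UrlTO;

public class ConfigurableExtractor {

    private Properties selectors = new Properties();

    public ConfigurableExtractor(String configPath) throws Exception {
        // e.g. a site.properties file with lines like: dept.0=.navlist li a
        selectors.load(new FileInputStream(configPath));
    }

    // one generic method replaces getCategoryUrls/getBookUrls/...: the
    // selector for each depth level comes from the configuration file
    public List<UrlTO> extract(Document doc, int deptValue) {
        List<UrlTO> result = new ArrayList<UrlTO>();
        String selector = selectors.getProperty("dept." + deptValue);
        if (selector == null) {
            return result; // no rule configured for this level
        }
        for (Element link : doc.select(selector)) {
            UrlTO urlTO = new UrlTO();
            urlTO.setDeptValue(deptValue + 1);
            urlTO.setUrl(link.attr("href"));
            result.add(urlTO);
        }
        return result;
    }
}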
