jsoup解析网页

hzywy

浏览: 170329 次
性别:
来自: 长沙

最近访客更多访客>>

jaymsimusic

chewp_ibm

llx156158574

testjiang3

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

java web 开发

jsoup

jsoup功能很强大，我用它来解析网页很轻松。虽然我只用了它的一小部分功能，但已经足够了。下面是我解析巨鲸音乐网（top100.cn）专辑列表的一个小示例：

package top100.bis;

import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import top100.bean.AlbumInfo;

/**
 * Downloads album information from http://www.top100.cn album list pages
 * and converts each album box on a page into an {@link AlbumInfo} bean.
 *
 * <p>All members are static; the private constructor prevents instantiation.
 *
 * @author hz
 */
public class Top100AlbumInfoDown {
    /** Site root, used as a fallback prefix for album detail links. */
    private static final String BASE_PATH = "http://www.top100.cn";

    /** CSS class carried by every album box on a list page. */
    private static final String ALBUM_BOX_CLASS = "Listen_allsingerbox";

    /** Name of the query parameter that holds the album area, including the '='. */
    private static final String AREA_PARAM = "area=";

    /** Connect/read timeout for a single page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Tags stripped from a page before the album boxes are extracted. */
    private static final String[] NOISE_TAGS = {"script", "form", "meta", "style", "iframe", "font"};

    /**
     * Not instantiable: the class only exposes static helpers.
     */
    private Top100AlbumInfoDown() {
    }

    /**
     * Fetches one album list page and extracts every album box found on it.
     *
     * <p>Only one class lookup is needed: {@code getElementsByClass} takes a single
     * class name, and boxes declared as {@code "Listen_allsingerbox top100_bgf2"}
     * also carry the {@code Listen_allsingerbox} class, so they are already matched.
     *
     * @param urlstr album list address including the paging parameter, e.g.
     *               http://www.top100.cn/album/index.php?area=mainland&pages=1
     * @return the albums found on the page; empty (never {@code null}) when the page
     *         cannot be fetched or contains no usable album box
     */
    private static List<AlbumInfo> parser(String urlstr) {
        List<AlbumInfo> infoList = new ArrayList<AlbumInfo>();
        System.out.println("当前请求的url是"+urlstr);

        Document doc;
        try {
            // An explicit timeout matters here: a stalled request would block the whole crawl.
            doc = Jsoup.connect(urlstr).timeout(TIMEOUT_MILLIS).get();
        } catch (IOException e) {
            // Best effort: report the failing page and let the caller continue with the next one.
            System.err.println("Failed to fetch album list page: " + urlstr);
            e.printStackTrace();
            return infoList;
        }

        wash(doc);
        for (Element infoDiv : doc.getElementsByClass(ALBUM_BOX_CLASS)) {
            AlbumInfo albumInfo = getAlbumInfo(urlstr, infoDiv);
            if (albumInfo != null) {
                infoList.add(albumInfo);
            }
        }
        return infoList;
    }

    /**
     * Builds an {@link AlbumInfo} from a single album box.
     *
     * <p>The box is expected to contain at least two {@code <a>} elements: the first
     * wraps the cover image (an {@code <img>} whose {@code src} ends in ".jpg"), the
     * second links to the album and carries its name in the {@code title} attribute.
     * The first {@code <p>} of the box, if any, supplies the abstract.
     *
     * @param url     address of the list page the box came from; the album area is read from it
     * @param infoDiv the album box element
     * @return the populated bean, or {@code null} when the box does not have the
     *         expected links/cover image and therefore cannot be converted
     */
    private static AlbumInfo getAlbumInfo(String url, Element infoDiv) {
        Elements links = infoDiv.getElementsByTag("a");
        if (links.size() < 2) {
            return null;
        }
        Element image = links.get(0).select("img[src$=.jpg]").first();
        if (image == null) {
            return null;
        }
        Element songLink = links.get(1);

        AlbumInfo albumInfo = new AlbumInfo();
        albumInfo.setAlbumImageUrl(image.attr("src"));
        albumInfo.setAlbumSongLinkUrl(resolveSongUrl(songLink));
        albumInfo.setAlbumName(songLink.attr("title").trim());

        // text() yields the tag-free, entity-decoded content; a missing <p> gives an empty abstract.
        Element abstractParagraph = infoDiv.getElementsByTag("p").first();
        albumInfo.setAlbumAbstruct(abstractParagraph == null ? "" : abstractParagraph.text());

        albumInfo.setAlbumArea(extractArea(url));
        return albumInfo;
    }

    /**
     * Returns the absolute address of an album link.
     *
     * @param songLink the {@code <a>} element pointing at the album
     * @return the href resolved against the document's base URI, or the site root
     *         followed by the raw href when no absolute URL can be resolved
     */
    private static String resolveSongUrl(Element songLink) {
        String absolute = songLink.absUrl("href");
        if (absolute.length() > 0) {
            return absolute;
        }
        return BASE_PATH + songLink.attr("href");
    }

    /**
     * Extracts the album area from a list page address.
     *
     * <p>The value of the {@code area} query parameter is preferred. When the address
     * has no such parameter, the text between the first '=' and the last '&amp;' is used.
     *
     * @param url list page address, e.g. ...index.php?area=mainland&pages=1
     * @return the area (e.g. "mainland"), or an empty string when none can be determined
     */
    private static String extractArea(String url) {
        int queryStart = url.indexOf('?');
        String query = queryStart >= 0 ? url.substring(queryStart + 1) : url;
        for (String pair : query.split("&")) {
            if (pair.startsWith(AREA_PARAM)) {
                return pair.substring(AREA_PARAM.length());
            }
        }

        int start = url.indexOf('=');
        int end = url.lastIndexOf('&');
        if (start >= 0 && end > start) {
            return url.substring(start + 1, end);
        }
        return "";
    }

    /**
     * Crawls the paged album list behind {@code url} and returns the albums found,
     * without duplicates and in the order in which they were first encountered.
     *
     * <p>"&amp;pages=N" is appended to {@code url} for each page, so {@code url} must
     * already contain a query string.
     *
     * @param url album list address without the paging parameter, e.g.
     *            http://www.top100.cn/album/index.php?area=mainland
     * @return the distinct albums of all fetched pages; never {@code null}
     */
    public static List<AlbumInfo> getAlbumInfoByUrl(String url) {
        // NOTE(review): de-duplication relies on AlbumInfo.equals/hashCode, which are
        // declared outside this file - confirm they are implemented.
        Set<AlbumInfo> unique = new LinkedHashSet<AlbumInfo>();
        // NOTE(review): the upper bound is exclusive, so page number SysConstant.PAGE
        // itself is never requested - confirm this is intended.
        for (int i = 1; i < SysConstant.PAGE; i++) {
            String site = url + "&pages=" + i; // list address with the page number appended
            unique.addAll(parser(site));
        }
        return new ArrayList<AlbumInfo>(unique);
    }

    /**
     * Removes elements that are irrelevant for extraction (scripts, forms, meta
     * tags, styles, iframes and font tags) from the document, in place.
     *
     * @param doc the document to clean
     */
    private static void wash(Document doc) {
        for (String tag : NOISE_TAGS) {
            doc.select(tag).remove();
        }
    }

    /**
     * Demo entry point: crawls the mainland album list and prints each album's
     * name and link.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://www.top100.cn/album/index.php?area=mainland";
        List<AlbumInfo> list = getAlbumInfoByUrl(url);
        for (AlbumInfo tempInfo : list) {
            System.out.println(tempInfo.getAlbumName()+"||"+tempInfo.getAlbumSongLinkUrl());
        }
    }
}

分享到：

springmvc上传 | fileupload多文件上传

2012-03-31 09:24
浏览 4481
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论