【转】通过搜索关键字和页码进入百度 -

chenlong_1988
浏览: 187316 次
性别:
最近访客更多访客>>

sr_www
eman
qq_33915826
zjy_369
博主相关

博客
微博
相册
留言
关于我
文章分类

社区版块

存档分类

【转】通过搜索关键字和页码进入百度

      阅读更多
    
通过搜索关键字和页码进入百度，例如“http://www.baidu.com/s?wd=搜索关键字&pn=偏移量”（这里 pn 的计算公式是：10*（页码-1））
找到真正有用的那一行，把信息读出来，

按照每一个项的特点把他们切开，例如URL前面都有“class=f”……
然后存起来
下面是代码：
 
package org.qisou;
 
import java.io.*;
import java.net.*;
/*

 * 这是一个获取百度的类，它可以用一个搜索关键字来初始化，然后将抓取的结果存到SearchItem的对象数组里

 */

public class CatchBaidu {
 
    /**

     * @param args

     * strKey: 搜索关键字

     */

    private String strKey;

    private String[] info;

    private String source;

    private SearchItem[] ItemList;

    /*

     * 构造函数，使用搜索关键字构造

     */

    public CatchBaidu(String strKey,int pages){

        this.strKey=strKey;

        pages=(pages-1)*10;

        this.ItemList = new SearchItem[10];

        this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+pages;

        this.source=Socket(this.source);

        info = new String[40];

        this.GetBaidu();

    }

    public CatchBaidu(String strKey){

        this.strKey=strKey;

        this.ItemList = new SearchItem[10];

        this.source="http://www.baidu.com/s?wd="+this.strKey+"&tn=cfish828_pg&pn="+1;

        this.source=Socket(this.source);

        info = new String[40];

        this.GetBaidu();

    }

    /*

     * 使用socket获取制定baidu页面，并对页面进行初级筛选，找出有用的一行返回

     */

    public String Socket(String strPage){

        String strServer="www.baidu.com";

        String s=null;

        try {

            String hostname = strServer;

            int port = 80;

            InetAddress addr = InetAddress.getByName(hostname);

            Socket socket = new Socket(addr, port); //建立一个Socket
 
            //发送命令

            BufferedWriter wr = new BufferedWriter(new OutputStreamWriter(socket.getOutputStream(), "UTF8"));

            wr.write("GET " + strPage + " HTTP/1.0 ");

            wr.write("HOST:" + strServer + " ");

            wr.write(" ");

            wr.flush();
 
            //接收返回的结果

            BufferedReader rd = new BufferedReader(new InputStreamReader(socket.getInputStream()));

            String line;

            while (true) {

                line = rd.readLine();

                if(line.length()>=75){

                    if(line.startsWith("<table border="0" cellpadding="0" cellspacing="0"><tr><td class=f><a href="")){

                        s=line;

                        break;

                    }

                }

            }

            wr.close();

            rd.close();

        } 

        catch (Exception e) {

            System.out.println(e.toString());

        }

        return s;

    }//获取百度有用的字段；

    public String ClearOnce(String s){

        int first;

        int last;

        first = s.indexOf("<");

        last = s.indexOf(">");

        StringBuilder builder = new StringBuilder(s);

        builder.delete(first,last+1);

        return builder.toString();

    }//清除一次<>

    public String Clear(String s){

        while(s.indexOf(">")>0&s.indexOf(">")<s.length()){

            s=this.ClearOnce(s);

        }

        return s;

    }//清除所有的<>

    public String ClearK(String s){

        return s.substring(0,s.indexOf("- "));

    }//去掉百度快照

    public String GetUrl(String s){

        int first;

        int lest;

        if((first=s.indexOf("class=f"))!=-1){

            lest=s.indexOf("" target");

            s=s.substring(first+17,lest);

            return s;

        }else

        return "";

    }//获取URL

    public void GetBaidu(){

        int br;

        for(int i=0;i<40;i++){

            br=this.source.indexOf("");

            info[i]=this.source.substring(0,br);

            this.source=this.source.substring(br+4,this.source.length());

        }//将数据进行分段，每四段代表一个信息

        for(int i=0;i<40;i++){

            if((i+1)%4==1){

                info[i+3]=GetUrl(info[i]);

            }

            info[i]=Clear(info[i]);

            if((i+1)%4==3){

                info[i]=ClearK(info[i]);

            }

            //System.out.println(st[i]);

        }//将信息整理，从新存储，顺序为 标题、简介、页面信息、URL

        for(int i=0;i<10;i++){

            int j=i*4;

            SearchItem item = new SearchItem(info[j],info[j+3],info[j+1],info[j+2]);

            this.ItemList[i]=item;

        }//将每条信息存入对象数组

    }//把信息分段存入数组

    public SearchItem[] GetBaiduItemList(){

        return ItemList;

    }
 
    public static void main(String[] args){

        //System.out.print("QiSou.cn Search: ");

        //KeyboardInput input = new KeyboardInput();

        //String search = input.readString();

        CatchBaidu obj = new CatchBaidu("forest",4);

        SearchItem[] itemlist = obj.GetBaiduItemList();

        for(int i=0;i<10;i++){

            itemlist[i].print();

        }

    }
 
}
 
.
 
package org.qisou;
 
/**
 * One search hit: a title, the link address, a synopsis and a line of page
 * information.
 */
public class SearchItem {

    /** Title of the hit. */
    private String title;

    /** Link address of the hit. */
    private String URL;

    /** Short description of the hit. */
    private String synopsis;

    /** Page information of the hit. */
    private String info;

    /** Creates an item whose four fields are all null. */
    public SearchItem() {
    }

    /**
     * Creates a fully populated item.
     *
     * @param title    title of the hit
     * @param URL      link address
     * @param synopsis short description
     * @param info     page information
     */
    public SearchItem(String title, String URL, String synopsis, String info) {
        this.title = title;
        this.URL = URL;
        this.synopsis = synopsis;
        this.info = info;
    }

    /** @return the title */
    public String GetTitle() {
        return title;
    }

    /** @return the link address */
    public String GetURL() {
        return URL;
    }

    /** @return the synopsis */
    public String GetSynopsis() {
        return synopsis;
    }

    /** @return the page information */
    public String GetInfo() {
        return info;
    }

    /**
     * Writes the item to standard output, one field per line, in the order
     * URL, title, synopsis, page information.
     */
    public void print() {
        String[] lines = { GetURL(), GetTitle(), GetSynopsis(), GetInfo() };
        for (String line : lines) {
            System.out.println(line);
        }
    }

    /** Small demo: builds a sample item and prints it. */
    public static void main(String[] args) {
        new SearchItem("title", "URL", "synopsis", "info").print();
    }

}
分享到：
      
      【转】nekohtml+xpath范例，及注意事项
      |
      Android中的HTTP通信
    
    2012-11-14 14:51
浏览 966
评论(0)
分类:编程语言
查看更多
    
    评论
    
发表评论   您还没有登录,请您登录后再发表评论
最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

【转】通过搜索关键和页码字进入百度

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

【转】通过搜索关键和页码字进入百度

评论

发表评论

相关推荐

最近访客更多访客>>