Java版PageRank及网站收录情况查询代码收藏

lzj0470
浏览: 1283850 次
性别:
来自: 深圳
最近访客更多访客>>

gljhh
hedgehog12
chen88358323
wyx065747
博主相关

博客
微博
相册
留言
关于我
文章分类

社区版块

存档分类

博客分类：
网络爬虫
Java Google C C#C++
在Google这个由10的100次方得名的站点中，各种评估网站的算法层出不穷，而PageRank即是其中之一。

  Google的PageRank根据网站的外部链接和内部链接的数量和质量来衡量网站的价值。PageRank背后的概念是，每个到页面的链接都是对该页面的一次投票，被链接的越多，就意味着被其他网站投票越多。这个就是所谓的“链接流行度”——衡量多少人愿意将他们的网站和你的网站挂钩。PageRank这个概念引自学术中一篇论文的被引述的频度——即被别人引述的次数越多，一般判断这篇论文的权威性就越高。

  通常情况下，原创内容越多的站点，PageRank越容易提升，反之则相对比较困难，PageRank最大上限值为10。在Google的评估中，能上10的网站真可谓凤毛麟角，即使算上Google，能成就PageRank 10这“伟业”者，放眼全球也不足40家。一般来说，个人站点评估值达到4即算办得不错，商业网站到6以上便算步入正轨了。

  网上虽然有不少现成的查询器及源码，但是光用别人的毕竟不符合程序员风格，所以今天自己用Java重造轮子又写了个PageRank查询实现，捎带着把一些常用搜索引擎的网站链接及反向链接查询也加上了。

  源码如下：

   GooglePageRank.java

   1. package org.loon.test;
   2.
   3. import java.io.IOException;
   4. import java.util.Random;
   5. import java.util.regex.Matcher;
   6. import java.util.regex.Pattern;
   7.
   8. /**
   9.  * Copyright 2008
  10.  * 
  11.  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  12.  * use this file except in compliance with the License. You may obtain a copy of
  13.  * the License at
  14.  * 
  15.  * http://www.apache.org/licenses/LICENSE-2.0
  16.  * 
  17.  * Unless required by applicable law or agreed to in writing, software
  18.  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  19.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  20.  * License for the specific language governing permissions and limitations under
  21.  * the License.
  22.  * 
  23.  * @project loonframework
  24.  * @author chenpeng
  25.  * @email：ceponline@yahoo.com.cn
  26.  * @version 0.1
  27.  */
  28. public class GooglePageRank {
  29.
  30.     // google pagerank服务器ip地址列表（最近google小气了很多，反复查询一个封ip）
  31.     final static String[] GoogleServiceIP = new String[] { "64.233.161.100",
  32.             "64.233.161.101", "64.233.183.91", "64.233.189.44", "66.102.1.103",
  33.             "66.102.9.115", "66.249.89.83", "66.249.91.99", "66.249.93.190" };
  34.
  35.     // google用识别标记
  36.     final static private int GOOGLE_MAGIC = 0xE6359A60;
  37.
  38.     // ch数值混合器
  39.     private class CHMix {
  40.
  41.         int a;
  42.
  43.         int b;
  44.
  45.         int c;
  46.
  47.         public CHMix() {
  48.             this(0, 0, 0);
  49.         }
  50.
  51.         public CHMix(int a, int b, int c) {
  52.             this.a = a;
  53.             this.b = b;
  54.             this.c = c;
  55.         }
  56.     }
  57.
  58.     /**
  59.      * 按google要求混合成ch数据
  60.      * 
  61.      * @param mix
  62.      */
  63.     private static void mix(final CHMix mix) {
  64.         mix.a -= mix.b;
  65.         mix.a -= mix.c;
  66.         mix.a ^= mix.c >> 13;
  67.         mix.b -= mix.c;
  68.         mix.b -= mix.a;
  69.         mix.b ^= mix.a << 8;
  70.         mix.c -= mix.a;
  71.         mix.c -= mix.b;
  72.         mix.c ^= mix.b >> 13;
  73.         mix.a -= mix.b;
  74.         mix.a -= mix.c;
  75.         mix.a ^= mix.c >> 12;
  76.         mix.b -= mix.c;
  77.         mix.b -= mix.a;
  78.         mix.b ^= mix.a << 16;
  79.         mix.c -= mix.a;
  80.         mix.c -= mix.b;
  81.         mix.c ^= mix.b >> 5;
  82.         mix.a -= mix.b;
  83.         mix.a -= mix.c;
  84.         mix.a ^= mix.c >> 3;
  85.         mix.b -= mix.c;
  86.         mix.b -= mix.a;
  87.         mix.b ^= mix.a << 10;
  88.         mix.c -= mix.a;
  89.         mix.c -= mix.b;
  90.         mix.c ^= mix.b >> 15;
  91.     }
  92.
  93.     /**
  94.      * 获得ch数值混合器
  95.      * 
  96.      * @return
  97.      */
  98.     public static CHMix getInnerCHMix() {
  99.         return new GooglePageRank().new CHMix();
 100.     }
 101.
 102.     /**
 103.      * 通过url获得googlech(google数据库针对页面的全球唯一标识)
 104.      * 
 105.      * @param url
 106.      * @return
 107.      */
 108.     public static String GoogleCH(final String url) {
 109.         // 格式化为google要求的info:url模式
 110.         String nUrl = String.format("info:%s", new Object[] { url });
 111.         // 获得新url字符串格式
 112.         char[] urls = nUrl.toCharArray();
 113.         // 获得新url长度
 114.         int length = urls.length;
 115.
 116.         // 获得一个ch数值混合器
 117.         CHMix chMix = GooglePageRank.getInnerCHMix();
 118.         // 为c注入google识别标识
 119.         chMix.c = GOOGLE_MAGIC;
 120.
 121.         // 为a、b项注入google要求的初始标识
 122.         chMix.a = chMix.b = 0x9E3779B9;
 123.
 124.         int k = 0;
 125.
 126.         int len = length;
 127.
 128.         while (len >= 12) {
 129.
 130.             chMix.a += (int) (urls[k + 0] + (urls[k + 1] << 8)
 131.                     + (urls[k + 2] << 16) + (urls[k + 3] << 24));
 132.             chMix.b += (int) (urls[k + 4] + (urls[k + 5] << 8)
 133.                     + (urls[k + 6] << 16) + (urls[k + 7] << 24));
 134.             chMix.c += (int) (urls[k + 8] + (urls[k + 9] << 8)
 135.                     + (urls[k + 10] << 16) + (urls[k + 11] << 24));
 136.             // 获得混合运算后的数据
 137.             GooglePageRank.mix(chMix);
 138.             k += 12;
 139.             len -= 12;
 140.         }
 141.         chMix.c += length;
 142.
 143.         // 产生googlech的11位标识
 144.         switch (len) {
 145.         case 11:
 146.             chMix.c += (int) (urls[k + 10] << 24);
 147.         case 10:
 148.             chMix.c += (int) (urls[k + 9] << 16);
 149.         case 9:
 150.             chMix.c += (int) (urls[k + 8] << 8);
 151.         case 8:
 152.             chMix.b += (int) (urls[k + 7] << 24);
 153.         case 7:
 154.             chMix.b += (int) (urls[k + 6] << 16);
 155.         case 6:
 156.             chMix.b += (int) (urls[k + 5] << 8);
 157.         case 5:
 158.             chMix.b += (int) (urls[k + 4]);
 159.         case 4:
 160.             chMix.a += (int) (urls[k + 3] << 24);
 161.         case 3:
 162.             chMix.a += (int) (urls[k + 2] << 16);
 163.         case 2:
 164.             chMix.a += (int) (urls[k + 1] << 8);
 165.         case 1:
 166.             chMix.a += (int) (urls[k + 0]);
 167.             break;
 168.         default:
 169.             break;
 170.         }
 171.         // 获得混合运算后的数据
 172.         GooglePageRank.mix(chMix);
 173.         // 获得未修订的CH
 174.         String tch = String.valueOf(chMix.c);
 175.         // 矫正差值后反馈正确CH
 176.         return String
 177.                 .format("6%s", new Object[] { tch.length() < 10 ? ("-" + tch)
 178.                         .intern() : tch });
 179.     }
 180.
 181.     /**
 182.      * 正则匹配pagerank结果
 183.      * 
 184.      * @param value
 185.      * @return
 186.      */
 187.     private static String MatchRank(final String value) {
 188.         Pattern pattern = Pattern.compile("Rank_1:[0-9]:([0-9]+)");
 189.         Matcher matcher = pattern.matcher(value);
 190.         if (matcher.find()) {
 191.             return matcher.group(1);
 192.         }
 193.         return "0";
 194.     }
 195.
 196.     /**
 197.      * 获得指定页面的google pagerank值
 198.      * 
 199.      * @param url
 200.      * @return
 201.      */
 202.     public static String GooglePR(final String url) {
 203.         String rip = GoogleServiceIP[new Random()
 204.                 .nextInt(GoogleServiceIP.length)];
 205.         return GooglePR(url, rip);
 206.     }
 207.
 208.     /**
 209.      * 以指定的google服务器获得指定页面的google pagerank值
 210.      * 
 211.      * @param url
 212.      * @param ip
 213.      * @return
 214.      */
 215.     public static String GooglePR(final String url, final String ip) {
 216.         // 产生查询用唯一标识
 217.         String checksum = GoogleCH(url);
 218.         // 产生查询用url
 219.         String queryUrl = String
 220.                 .format(
 221.                         "http://%s/search?client=navclient-auto&ch=%s&features=Rank&q=info:%s",
 222.                         new Object[] { ip, checksum, url });
 223.
 224.         String response;
 225.         try {
 226.             response = SimpleWebClient.getRequestHttp(queryUrl);
 227.         } catch (IOException e) {
 228.             response = "";
 229.         }
 230.         if (response.length() == 0) {
 231.             return "0";
 232.         } else {
 233.             return GooglePageRank.MatchRank(response);
 234.         }
 235.     }
 236.
 237. }
238.


SimpleWebClient.java

   1. package org.loon.test;
   2.
   3. import java.io.BufferedInputStream;
   4. import java.io.ByteArrayOutputStream;
   5. import java.io.IOException;
   6. import java.io.InputStream;
   7. import java.io.InputStreamReader;
   8. import java.io.OutputStreamWriter;
   9. import java.net.HttpURLConnection;
  10. import java.net.URL;
  11. import java.util.HashMap;
  12. import java.util.Iterator;
  13. import java.util.Map;
  14. import java.util.Set;
  15. import java.util.Map.Entry;
  16.
  17. import sun.misc.BASE64Encoder;
  18.
  19. /**
  20.  * Copyright 2008
  21.  * 
  22.  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
  23.  * use this file except in compliance with the License. You may obtain a copy of
  24.  * the License at
  25.  * 
  26.  * http://www.apache.org/licenses/LICENSE-2.0
  27.  * 
  28.  * Unless required by applicable law or agreed to in writing, software
  29.  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  30.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  31.  * License for the specific language governing permissions and limitations under
  32.  * the License.
  33.  * 
  34.  * @project loonframework
  35.  * @author chenpeng
  36.  * @email：ceponline@yahoo.com.cn
  37.  * @version 0.1
  38.  */
  /**
   * Minimal HTTP client built on {@link HttpURLConnection}: sends one request,
   * follows redirects manually, and returns the response body as a string.
   */
  public class SimpleWebClient {

      /** Upper bound on followed redirects, so a redirect loop cannot hang. */
      private static final int MAX_REDIRECTS = 10;

      /** Chunk size used when draining a response body. */
      private static final int BUFFER_SIZE = 4096;

      /**
       * Sends a GET request and decodes the response as UTF-8.
       *
       * @param urlString target URL; a missing scheme defaults to {@code http}
       * @return the response body
       * @throws IOException if the request fails
       */
      public static String getRequestHttp(String urlString) throws IOException {
          return getRequestHttp(urlString, "utf-8");
      }

      /**
       * Sends a GET request with a 5000 ms timeout.
       *
       * @param urlString target URL; a missing scheme defaults to {@code http}
       * @param encoding charset name used to decode the response, or
       *        {@code null} for the platform default
       * @return the response body
       * @throws IOException if the request fails
       */
      public static String getRequestHttp(String urlString, String encoding)
              throws IOException {
          return getRequestHttp(urlString, encoding, null, 5000);
      }

      /**
       * Sends a request and returns the response body.
       *
       * <p>Entries of {@code parameter} with the keys {@code user},
       * {@code pass}, {@code method} and {@code post} configure HTTP Basic
       * credentials, the request method and the request body; every other
       * entry is sent as a request header. Responses with status 301, 302, 303
       * or 307 are followed (the same request is re-sent to the new location)
       * up to {@value #MAX_REDIRECTS} times. For statuses other than 200/201
       * the error stream is returned, or an empty string if there is none.
       *
       * @param urlString target URL; a missing scheme defaults to {@code http}
       * @param encoding charset name used to decode the response, or
       *        {@code null} for the platform default
       * @param parameter optional settings and extra headers (String keys and
       *        values); may be {@code null}
       * @param timeout connect and read timeout in milliseconds; values
       *        {@code <= 0} leave the connection defaults untouched
       * @return the response body
       * @throws IOException if the request fails, a redirect has no usable
       *         target, or too many redirects occur
       */
      public static String getRequestHttp(final String urlString,
              final String encoding, final Map parameter, final int timeout)
              throws IOException {

          String user = null;
          String password = null;
          String method = "GET";
          String post = null;
          final Map<String, String> headers = new HashMap<String, String>();

          if (parameter != null) {
              for (Object item : parameter.entrySet()) {
                  final Entry<?, ?> entry = (Entry<?, ?>) item;
                  final String key = (String) entry.getKey();
                  final String value = (String) entry.getValue();
                  if ("user".equals(key)) {
                      user = value;
                  } else if ("pass".equals(key)) {
                      password = value;
                  } else if ("method".equals(key)) {
                      method = value;
                  } else if ("post".equals(key)) {
                      post = value;
                  } else {
                      headers.put(key, value);
                  }
              }
          }

          String authorization = null;
          if (user != null && password != null) {
              // The basic encoder never inserts line breaks, which would
              // corrupt the header for long credentials.
              final byte[] credentials = (user + ":" + password)
                      .getBytes("UTF-8");
              authorization = "Basic "
                      + java.util.Base64.getEncoder().encodeToString(credentials);
          }

          URL url = new URL(normalizeUrl(urlString));

          for (int redirects = 0; redirects <= MAX_REDIRECTS; redirects++) {
              final String protocol = url.getProtocol();
              if (!"http".equalsIgnoreCase(protocol)
                      && !"https".equalsIgnoreCase(protocol)) {
                  throw new IOException("Unsupported protocol: " + url);
              }

              final HttpURLConnection connection = (HttpURLConnection) url
                      .openConnection();
              try {
                  if (authorization != null) {
                      connection.setRequestProperty("Authorization",
                              authorization);
                  }
                  connection.setDoInput(true);
                  connection.setUseCaches(false);
                  // Redirects are handled by this loop, not by the connection.
                  connection.setInstanceFollowRedirects(false);
                  connection.setRequestMethod(method);
                  if (timeout > 0) {
                      connection.setConnectTimeout(timeout);
                      connection.setReadTimeout(timeout);
                  }
                  // Default browser-like headers; custom headers may override.
                  connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0;)");
                  connection.setRequestProperty("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/msword, application/vnd.ms-excel, application/vnd.ms-powerpoint, */*");
                  for (Entry<String, String> header : headers.entrySet()) {
                      connection.setRequestProperty(header.getKey(),
                              header.getValue());
                  }

                  if (post != null) {
                      connection.setDoOutput(true);
                      final OutputStreamWriter writer = new OutputStreamWriter(
                              connection.getOutputStream(), "UTF-8");
                      try {
                          writer.write(post);
                          writer.flush();
                      } finally {
                          writer.close();
                      }
                  }

                  final int responseCode = connection.getResponseCode();

                  if (isRedirect(responseCode)) {
                      final String location = connection
                              .getHeaderField("Location");
                      if (location == null) {
                          throw new IOException("HTTP " + responseCode
                                  + " without a Location header from " + url);
                      }
                      // Resolve against the current URL so relative targets work.
                      url = new URL(url, location);
                      continue;
                  }

                  final InputStream body;
                  if (responseCode == HttpURLConnection.HTTP_OK
                          || responseCode == HttpURLConnection.HTTP_CREATED) {
                      body = connection.getInputStream();
                  } else {
                      body = connection.getErrorStream();
                  }
                  // getErrorStream() is null when the server sent no body.
                  return body == null ? "" : read(body, encoding);
              } finally {
                  connection.disconnect();
              }
          }

          throw new IOException("More than " + MAX_REDIRECTS
                  + " redirects while requesting " + urlString);
      }

      /**
       * Ensures the URL carries a scheme, defaulting to {@code http}.
       *
       * @param urlString URL as given by the caller
       * @return the URL unchanged if it starts with {@code http://} or
       *         {@code https://}, otherwise the URL with an {@code http}
       *         scheme prepended
       */
      private static String normalizeUrl(final String urlString) {
          if (urlString.startsWith("http://")
                  || urlString.startsWith("https://")) {
              return urlString;
          }
          if (urlString.startsWith("//")) {
              return "http:" + urlString;
          }
          return "http://" + urlString;
      }

      /**
       * Tells whether a status code is a redirect this client follows.
       *
       * @param responseCode HTTP status code
       * @return {@code true} for 301, 302, 303 and 307
       */
      private static boolean isRedirect(final int responseCode) {
          return responseCode == HttpURLConnection.HTTP_MOVED_PERM
                  || responseCode == HttpURLConnection.HTTP_MOVED_TEMP
                  || responseCode == HttpURLConnection.HTTP_SEE_OTHER
                  || responseCode == 307;
      }

      /**
       * Drains a stream into a string and closes the stream.
       *
       * @param in stream to read; closed before returning, even on failure
       * @param encoding charset name, or {@code null} for the platform default
       * @return the decoded content
       * @throws IOException if reading fails or the charset is not supported
       */
      private static String read(final InputStream in, final String encoding)
              throws IOException {
          try {
              final ByteArrayOutputStream collected = new ByteArrayOutputStream();
              final byte[] chunk = new byte[BUFFER_SIZE];
              int count;
              while ((count = in.read(chunk)) != -1) {
                  collected.write(chunk, 0, count);
              }
              return encoding != null ? collected.toString(encoding)
                      : collected.toString();
          } finally {
              in.close();
          }
      }
  }
222.


  WebAppraise.java

   1. package org.loon.test;
   2.
   3. import java.io.IOException;
   4.
   5. /**
   6.  * Copyright 2008
   7.  * 
   8.  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
   9.  * use this file except in compliance with the License. You may obtain a copy of
  10.  * the License at
  11.  * 
  12.  * http://www.apache.org/licenses/LICENSE-2.0
  13.  * 
  14.  * Unless required by applicable law or agreed to in writing, software
  15.  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  16.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  17.  * License for the specific language governing permissions and limitations under
  18.  * the License.
  19.  * 
  20.  * @project loonframework
  21.  * @author chenpeng
  22.  * @email：ceponline@yahoo.com.cn
  23.  * @version 0.1
  24.  */
  25. public class WebAppraise {
  26.
  27.     private String googleSum;
  28.
  29.     private String baiduSum;
  30.
  31.     private String msnSum;
  32.
  33.     private String altaVistaSum;
  34.
  35.     private String allTheWebSum;
  36.
  37.     private String yahooSum;
  38.
  39.     private String testURL;
  40.
  41.     public WebAppraise(final String url) {
  42.
  43.         if (url != null && !"".equals(url)) {
  44.             this.testURL = url.trim();
  45.             if (this.testURL.startsWith("http://")) {
  46.                 this.testURL = this.testURL.substring(7);
  47.             }
  48.             if (this.testURL.startsWith("https://")) {
  49.                 this.testURL = this.testURL.substring(8);
  50.             }
  51.         } else {
  52.             throw new RuntimeException("url is NULL!");
  53.         }
  54.
  55.     }
  56.
  57.     /**
  58.      * 分析指定链接结果，并返回整型数值
  59.      * 
  60.      * @param searchURL
  61.      * @param anchor
  62.      * @param trail
  63.      * @return
  64.      */
  65.     private static int getLinks(final String searchURL, final String anchor,
  66.             final String trail) {
  67.         int count = 0;
  68.         String serverResponse;
  69.
  70.         try {
  71.             // 我国特色……
  72.             if (searchURL.startsWith("http://www.baidu.com")) {
  73.                 // 永不离休的gb2312同志(-_-||)
  74.                 serverResponse = SimpleWebClient.getRequestHttp(searchURL,
  75.                         "gb2312");
  76.             } else {
  77.                 serverResponse = SimpleWebClient.getRequestHttp(searchURL);
  78.             }
  79.         } catch (IOException e) {
  80.             serverResponse = e.getMessage();
  81.         }
  82.
  83.         int pos = serverResponse.indexOf(anchor);
  84.         if (pos > 1) {
  85.             serverResponse = serverResponse.substring(pos + anchor.length());
  86.             pos = serverResponse.indexOf(trail);
  87.             String value = serverResponse.substring(0, pos).trim();
  88.             value = value.replace(",", "");
  89.             value = value.replace(".", "");
  90.             count = Integer.parseInt(value);
  91.         }
  92.         return count;
  93.     }
  94.
  95.     public String getAllTheWebSite() {
  96.         return getAllTheWebSite(false);
  97.     }
  98.
  99.     public String getAllTheWebSite(boolean isDomain) {
 100.         try {
 101.             String allTheWeb;
 102.             if (isDomain) {
 103.                 allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf8&rys=0&itag=crv&_sb_lang=any&q=linkdomain%3A"
 104.                         + this.testURL;
 105.             } else {
 106.                 allTheWeb = "http://www.alltheweb.com/search?cat=web&cs=utf-8&q=link%3Ahttp%3A%2F%2F"
 107.                         + this.testURL + "&_sb_lang=any";
 108.             }
 109.             allTheWebSum = ""
 110.                     + getLinks(allTheWeb, "<span class=\"ofSoMany\">",
 111.                             "</span>");
 112.         } catch (Exception ex) {
 113.             allTheWebSum = ex.getMessage();
 114.         }
 115.         return allTheWebSum;
 116.     }
 117.
 118.     public String getAltaVistaSite() {
 119.         return getAltaVistaSite(false);
 120.     }
 121.
 122.     public String getAltaVistaSite(boolean isDomain) {
 123.         try {
 124.             String altaVista;
 125.             if (isDomain) {
 126.                 altaVista = "http://www.altavista.com/web/results?itag=ody&q=link%3A"
 127.                         + this.testURL + "&kgs=0&kls=0";
 128.             } else {
 129.                 altaVista = "http://www.altavista.com/web/results?itag=ody&kgs=0&kls=0&q=site%3A"
 130.                         + this.testURL;
 131.             }
 132.             altaVistaSum = "" + getLinks(altaVista, "AltaVista found ", " ");
 133.         } catch (Exception ex) {
 134.             altaVistaSum = ex.getMessage();
 135.         }
 136.         return altaVistaSum;
 137.     }
 138.
 139.     public String getGooglePR() {
 140.         return GooglePageRank.GooglePR(this.testURL);
 141.     }
 142.
 143.     public String getGoogleSite() {
 144.         return getGoogleSite(false);
 145.     }
 146.
 147.     public String getGoogleSite(final boolean isDomian) {
 148.         try {
 149.             String google;
 150.             // 反向链接
 151.             if (isDomian) {
 152.                 google = "http://www.google.com/search?hl=en&q=link%3A"
 153.                         + this.testURL;
 154.             } else {
 155.                 google = "http://www.google.com/search?hl=en&q=site%3A"
 156.                         + this.testURL + "&btnG=Google+Search&aq=f&oq=";
 157.             }
 158.             googleSum = "" + getLinks(google, "about <b>", "</b>");
 159.         } catch (Exception ex) {
 160.             googleSum = ex.getMessage();
 161.         }
 162.         return googleSum;
 163.     }
 164.
 165.     public String getBaiduSite() {
 166.         return getBaiduSite(false);
 167.     }
 168.
 169.     public String getBaiduSite(final boolean isDomian) {
 170.         try {
 171.             String baidu;
 172.             if (isDomian) {
 173.                 baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
 174.                         + "&cl=3";
 175.             } else {
 176.                 baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
 177.             }
 178.             baiduSum = "" + getLinks(baidu, "找到相关网页", "篇");
 179.         } catch (Exception ex) {
 180.             String baidu;
 181.             if (isDomian) {
 182.                 baidu = "http://www.baidu.com/s?wd=domain%3A" + this.testURL
 183.                         + "&cl=3";
 184.             } else {
 185.                 baidu = "http://www.baidu.com/s?wd=site%3A" + this.testURL;
 186.             }
 187.             baiduSum = "" + getLinks(baidu, "找到相关网页约", "篇");
 188.         }
 189.         return baiduSum;
 190.     }
 191.
 192.     public String getYahooSite() {
 193.         return getYahooSite(false);
 194.     }
 195.
 196.     public String getYahooSite(final boolean isDomian) {
 197.         try {
 198.             String yahoo;
 199.             if (isDomian) {
 200.                 yahoo = "http://sitemap.cn.yahoo.com/search?p=" + this.testURL
 201.                         + "&bwm=i";
 202.                 yahooSum = "" + getLinks(yahoo, "<strong>", "</strong>");
 203.             } else {
 204.                 yahoo = "http://www.yahoo.cn/s?p=site%3A" + this.testURL
 205.                         + "&pid=hp&v=web";
 206.                 yahooSum = "" + getLinks(yahoo, "找到相关网页约", "条");
 207.             }
 208.
 209.         } catch (Exception ex) {
 210.             yahooSum = ex.getMessage();
 211.         }
 212.         return yahooSum;
 213.     }
 214.
 215.     public String getMsnSite() {
 216.         return getMsnSite(false);
 217.     }
 218.
 219.     public String getMsnSite(boolean isDomain) {
 220.         try {
 221.             String msn;
 222.             if (isDomain) {
 223.                 msn = "http://cnweb.search.live.com/results.aspx?q=link%3A"
 224.                         + this.testURL + "&mkt=zh-cn&scope=&FORM=LIVSO";
 225.             } else {
 226.                 msn = "http://cnweb.search.live.com/results.aspx?q=site%3A"
 227.                         + this.testURL + "&go=&form=QBRE";
 228.             }
 229.             msnSum = "" + getLinks(msn, "共", "条搜索结果");
 230.         } catch (Exception ex) {
 231.             msnSum = ex.getMessage();
 232.         }
 233.         return msnSum;
 234.     }
 235.
 236.     public String getTestURL() {
 237.         return testURL;
 238.     }
 239.
 240. }
241.


 Test.java

   1. package org.loon.test;
   2.
   3. /**
   4.  * Copyright 2008
   5.  * 
   6.  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
   7.  * use this file except in compliance with the License. You may obtain a copy of
   8.  * the License at
   9.  * 
  10.  * http://www.apache.org/licenses/LICENSE-2.0
  11.  * 
  12.  * Unless required by applicable law or agreed to in writing, software
  13.  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  14.  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  15.  * License for the specific language governing permissions and limitations under
  16.  * the License.
  17.  * 
  18.  * @project loonframework
  19.  * @author chenpeng
  20.  * @email：ceponline@yahoo.com.cn
  21.  * @version 0.1
  22.  */
  23. public class Test {
  24.
  25.     public static void main(String[] args) {
  26.
  27.         WebAppraise appraise = new WebAppraise("http://blog.csdn.net/cping1982");
  28.
  29.          System.out.println("GooglePagerRank值：" + appraise.getGooglePR());
  30.          System.out.println("google收录：" + appraise.getGoogleSite());
  31.          System.out.println("google反向收录：" + appraise.getGoogleSite(true));
  32.          System.out.println("yahoo收录：" + appraise.getYahooSite());
  33.          System.out.println("yahoo反向收录：" + appraise.getYahooSite(true));
  34.          System.out.println("baidu收录：" + appraise.getBaiduSite());
  35.          System.out.println("baidu反向收录：" + appraise.getBaiduSite(true));
  36.          System.out.println("msn收录：" + appraise.getMsnSite());
  37.          System.out.println("msn反向收录：" + appraise.getMsnSite(true));
  38.          System.out.println("AllTheWeb收录：" + appraise.getAllTheWebSite());
  39.          System.out.println("AllTheWeb反向收录：" + appraise.getAllTheWebSite(true));
  40.          System.out.println("AltaVista收录：" + appraise.getAltaVistaSite());
  41.          System.out.println("AltaVista反向收录：" + appraise.getAltaVistaSite(true));
  42.          
  43.     }
  44. }
分享到：
Web压力测试工具 | 爬虫设计要点
2009-02-09 13:55
浏览 7369
评论(1)
查看更多
1 楼 SpreadDiaries 2011-11-17
老兄这个生成ch值没用啊？跑了不行。
发表评论

您还没有登录,请您登录后再发表评论
最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java版PageRank及网站收录情况查询代码收藏

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

Java版PageRank及网站收录情况查询代码收藏

评论

发表评论

相关推荐

辨别百度蜘蛛 Google蜘蛛的真伪

ttttttttt

loiyspider网络爬虫

Robots.txt 协议标准

韩文网站 编码方式euc-kr

爬虫设计要点

最近访客更多访客>>

韩文网站编码方式euc-kr