`
neeleon
  • 浏览: 187629 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

Java 采集 CSDN 论坛源码

    博客分类:
  • java
阅读更多
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page and extracts parts of it (title, forum posts, links,
 * scripts, style sheets) using regular expressions.
 *
 * <p>The replacement markers used by {@link #getTiezi(String)} are held in a
 * mutable static field (see {@link #getBjs()} / {@link #setBjs(String[])}),
 * so they are shared by every instance.
 */
public class WebContent {

    /** Separator between the text to find and its replacement inside a marker entry. */
    private static final String MARKER_SEPARATOR = "FGF";

    /** Alternative separator, accepted for entries that do not contain {@link #MARKER_SEPARATOR}. */
    private static final String LEGACY_MARKER_SEPARATOR = "NLLD76";

    // NOTE(review): CANON_EQ is kept on the title pattern; the pattern itself is
    // ASCII-only, so the flag is probably unnecessary - confirm before removing.
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>.*?</title>", Pattern.CANON_EQ);

    /** Matches from the "msgfont" class name up to the next closing div. */
    private static final Pattern POST_PATTERN = Pattern.compile("msgfont.*?</div>");

    /** Topic links of new posts; dots are escaped so they only match a literal '.'. */
    private static final Pattern NEW_TOPIC_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net/u.*?\\.html");

    /** Topic links of historical posts; dots are escaped so they only match a literal '.'. */
    private static final Pattern HISTORY_TOPIC_LINK_PATTERN =
            Pattern.compile("http://topic\\.csdn\\.net.*?\\.html");

    /**
     * A complete anchor element: an opening {@code <a ...>} tag carrying an
     * {@code href} attribute (double-quoted, single-quoted or unquoted), its
     * content, and the closing {@code </a>}.
     */
    private static final Pattern ANCHOR_PATTERN = Pattern.compile(
            "<a\\s[^>]*href\\s*=\\s*(?:\"[^\"]*\"|'[^']*'|[^\\s>]*)[^>]*>.*?</a>",
            Pattern.DOTALL);

    private static final Pattern SCRIPT_PATTERN =
            Pattern.compile("<script.*?</script>", Pattern.DOTALL);

    private static final Pattern STYLE_PATTERN =
            Pattern.compile("<style.*?</style>", Pattern.DOTALL);

    /** Any tag-like span; '.' does not match line terminators here. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /**
     * Replacement markers. Each entry is "find" + {@link #MARKER_SEPARATOR} + "replacement";
     * the defaults turn the HTML entities for '&lt;', '&gt;' and '"' back into characters.
     */
    private static String[] bjs = {"&lt;FGF<", "&gt;FGF>", "&quot;FGF\""};

    /**
     * Reads the whole content of a web page.
     *
     * <p>The page is decoded as UTF-8 and its lines are concatenated without
     * line terminators, so the result is a single line of text.
     *
     * @param htmlurl address of the page to download
     * @return the page content with line terminators removed
     * @throws Exception if the URL is malformed or the page cannot be read
     */
    public String getOneHtml(String htmlurl) throws Exception {
        StringBuilder page = new StringBuilder();
        URL url = new URL(htmlurl);
        // try-with-resources closes the reader (and the underlying stream) even when reading fails.
        try (BufferedReader in =
                new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line);
            }
        }
        return page.toString();
    }

    /**
     * Extracts the page title.
     *
     * @param s     page content
     * @param isnew not used by this method
     * @return the text of every {@code <title>} element found, concatenated,
     *         with the tags removed; empty if there is none
     */
    public String getTitle(String s, boolean isnew) {
        StringBuilder title = new StringBuilder();
        for (String element : findAll(TITLE_PATTERN, s)) {
            title.append(element);
        }
        return outTag(title.toString());
    }

    /**
     * Extracts the post body and its replies from a new-style topic page.
     *
     * <p>For each "msgfont" block, line-break tags become CRLF, non-breaking
     * space entities are dropped, remaining tags are stripped, and finally the
     * configured replacement markers are applied.
     *
     * @param s page content
     * @return one cleaned-up text per matched block, in page order
     */
    public String[] getTiezi(String s) {
        List<String> fragments = findAll(POST_PATTERN, s);
        String[] posts = new String[fragments.size()];
        for (int i = 0; i < posts.length; i++) {
            String text = fragments.get(i)
                    .replace("msgfont\">", "")
                    .replace("<br />", "\r\n")
                    .replace("<br/>", "\r\n")
                    .replace("&nbsp;", "");
            // Tags are stripped before the entities are decoded, so a decoded '<' survives.
            posts[i] = replaceByBj(bjs, outTag(text));
        }
        return posts;
    }

    /**
     * Applies a list of replacement markers to a text.
     *
     * <p>Each marker is "find" + separator + "replacement". The find part is
     * used as a regular expression and the replacement part as a regex
     * replacement string, as in {@link String#replaceAll(String, String)}.
     *
     * @param bjs   marker entries; {@code null} means no replacements
     * @param nrstr text to process; {@code null} is returned unchanged
     * @return the text after all markers have been applied in order
     * @throws IllegalArgumentException if an entry is {@code null} or contains no separator
     */
    public static String replaceByBj(String[] bjs, String nrstr) {
        if (bjs == null || nrstr == null) {
            return nrstr;
        }
        String result = nrstr;
        for (String entry : bjs) {
            String[] parts = splitMarker(entry);
            result = result.replaceAll(parts[0], parts[1]);
        }
        return result;
    }

    /** Splits a marker entry into its find part and its replacement part. */
    private static String[] splitMarker(String entry) {
        if (entry == null) {
            throw new IllegalArgumentException("Marker entry must not be null");
        }
        String separator =
                entry.contains(MARKER_SEPARATOR) ? MARKER_SEPARATOR : LEGACY_MARKER_SEPARATOR;
        int at = entry.indexOf(separator);
        if (at < 0) {
            throw new IllegalArgumentException(
                    "Marker entry has no \"" + MARKER_SEPARATOR + "\" separator: " + entry);
        }
        return new String[] {entry.substring(0, at), entry.substring(at + separator.length())};
    }

    /**
     * @return the replacement markers currently in use (the internal array, not a copy)
     */
    public static String[] getBjs() {
        return bjs;
    }

    /**
     * Replaces the markers used by {@link #getTiezi(String)}.
     *
     * @param bjs new marker entries, each "find" + "FGF" + "replacement"
     */
    public static void setBjs(String[] bjs) {
        WebContent.bjs = bjs;
    }

    /**
     * Extracts CSDN topic links from a page.
     *
     * @param s     page content
     * @param isnew {@code true} to collect links of new posts (path starting with "/u"),
     *              {@code false} to delegate to {@link #getCsdnHisLink(String)}
     * @return the matched links, in page order
     */
    public String[] getCsdnLink(String s, boolean isnew) {
        if (!isnew) {
            return getCsdnHisLink(s);
        }
        return findAll(NEW_TOPIC_LINK_PATTERN, s).toArray(new String[0]);
    }

    /**
     * Extracts CSDN topic links of historical posts from a page.
     *
     * @param s page content
     * @return the matched links, in page order
     */
    public String[] getCsdnHisLink(String s) {
        return findAll(HISTORY_TOPIC_LINK_PATTERN, s).toArray(new String[0]);
    }

    /**
     * Extracts anchor elements that carry an {@code href} attribute.
     *
     * @param s page content
     * @return each matched element, from {@code <a} through {@code </a>}, in page order
     */
    public List<String> getLink(String s) {
        return findAll(ANCHOR_PATTERN, s);
    }

    /**
     * Extracts script elements.
     *
     * @param s page content
     * @return each matched element, from {@code <script} through {@code </script>}
     */
    public List<String> getScript(String s) {
        return findAll(SCRIPT_PATTERN, s);
    }

    /**
     * Extracts style elements.
     *
     * @param s page content
     * @return each matched element, from {@code <style} through {@code </style>}
     */
    public List<String> getCSS(String s) {
        return findAll(STYLE_PATTERN, s);
    }

    /**
     * Removes tags from a text.
     *
     * @param s text that may contain tags
     * @return the text with every {@code <...>} span removed
     */
    public String outTag(String s) {
        return TAG_PATTERN.matcher(s).replaceAll("");
    }

    /** Collects every match of the pattern in the text, in order of occurrence. */
    private static List<String> findAll(Pattern pattern, String text) {
        List<String> matches = new ArrayList<>();
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }

    /**
     * Downloads one page of the CSDN Java forum topic list and prints every
     * post text extracted from it.
     */
    public static void main(String[] args) {
        WebContent w = new WebContent();
        String url = "http://forum.csdn.net/PointForum/Forum/BFTopicList.aspx?Alias=Java&ListType=UnClosedList&page=1";
        try {
            String s = w.getOneHtml(url);
            String[] posts = w.getTiezi(s);
            for (int i = 0; i < posts.length; i++) {
                System.out.println(posts[i]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics