java抓取新闻

浏览 3965 次

锁定老帖子主题：java抓取新闻精华帖 (0) :: 良好帖 (1) :: 新手帖 (0) :: 隐藏帖 (0)
作者	正文
// Forum post by uidin (junior member, from Zhengzhou), posted 2008-05-07 on the
// "Getting Started" board under the topic "java抓取新闻" (fetching news with Java).
// Site notice: copyright of ITeye articles belongs to the author and is protected by law;
// do not reproduce without the author's written permission.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small page-scraping demo.
 *
 * <p>Author's description: this is simple but fun. First read the search-result page through a
 * stream, then write a regular expression to strip the unwanted content, and finally save the
 * result as an XML file or store it directly in a database for later use. As posted, the program
 * only prints the HTML source of a page; to extract content, adapt the expression returned by
 * {@link #regex()}.
 *
 * <p>The original post declared this class in package {@code rssTest}; add the matching
 * {@code package} declaration when placing the file in a project.
 *
 * <p>Date: 05-01. E-mail: uidin@163.com.
 *
 * @author Der
 */
public class MyRSS {

    /** Browser-like User-Agent value sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows XP; DigExt)";

    /** Character set used to decode the fetched pages. */
    private static final String PAGE_CHARSET = "gb2312";

    /**
     * Expression for one search-result block. Groups: 2 = link URL, 4 = title markup,
     * 6 = snippet markup. The groups are reluctant ({@code .*?}) so each one stops at the
     * first following literal instead of running to the end of the line.
     */
    private static final String GOOGLE_RESULT_REGEX =
            "<div class=g>(.*?)href=\"(.*?)\"(.*?)\">(.*?)</a>(.*?)<div class=std>(.*?)<br>";

    /** Compiled once; {@link Pattern} instances are immutable and safe to share. */
    private static final Pattern GOOGLE_RESULT_PATTERN = Pattern.compile(GOOGLE_RESULT_REGEX);

    /**
     * Fetches the HTML source of a page.
     *
     * <p>Lines are decoded as gb2312 and joined with {@code "\n"}. Failures are reported on
     * standard error and do not propagate.
     *
     * @param url address of the page to fetch
     * @return the page source read so far; an empty string when the URL is malformed or the
     *         connection could not be opened (never {@code null})
     */
    public static String getHtmlSource(String url) {
        // Allocated before the try block so a failure can never leave it null.
        StringBuilder page = new StringBuilder();
        BufferedReader in = null;
        try {
            URLConnection uc = new URL(url).openConnection();
            // Some sites only answer requests that appear to come from a browser rather than
            // from a program reading the page directly; sending a browser User-Agent header
            // disguises this client as one.
            uc.setRequestProperty("User-Agent", USER_AGENT);
            in = new BufferedReader(new InputStreamReader(uc.getInputStream(), PAGE_CHARSET));
            String line;
            while ((line = in.readLine()) != null) {
                page.append(line).append("\n");
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close on every path, not only after a successful read.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails; the content is already read.
                }
            }
        }
        return page.toString();
    }

    /**
     * Returns the regular expression used to pick result entries out of a search page.
     *
     * @return the expression; group 2 is the link, group 4 the title, group 6 the snippet
     */
    public static String regex() {
        return GOOGLE_RESULT_REGEX;
    }

    /**
     * Test helper: runs a hard-coded keyword search on google.cn and extracts the wanted parts.
     *
     * @return a flat list holding, for every matched result in order, its link, its title and
     *         its snippet (three consecutive entries per result); empty when nothing matched
     */
    public static List<String> GetNews() {
        List<String> newsList = new ArrayList<String>();
        String allHtmlSource = MyRSS
                .getHtmlSource("http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&client=aff-os-maxthon&hs=SUZ&q=%E8%A7%81%E9%BE%99%E5%8D%B8%E7%94%B2&meta=&aq=f");
        Matcher matcher = GOOGLE_RESULT_PATTERN.matcher(allHtmlSource);
        while (matcher.find()) {
            newsList.add(matcher.group(2));
            newsList.add(stripHighlightMarkup(matcher.group(4)));
            newsList.add(stripHighlightMarkup(matcher.group(6)));
        }
        return newsList;
    }

    /** Removes the keyword-highlight font tags and the bold ellipsis marker from a fragment. */
    private static String stripHighlightMarkup(String fragment) {
        // Literal replacement: with replaceAll the dots in "<b>...</b>" would be regex
        // wildcards and delete any three-character bold run.
        return fragment
                .replace("<font color=CC0033>", "")
                .replace("</font>", "")
                .replace("<b>...</b>", "");
    }

    /**
     * Prints the HTML source of a fixed news index page to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out
                .println(MyRSS
                        .getHtmlSource("http://main.house.sina.com.cn/news/zckb/index.html"));
    }
}
返回顶楼

论坛首页 → 入门技术版

跳转论坛: