- 浏览: 287360 次
- 性别:
- 来自: 湖南
-
文章分类
- 全部博客 (120)
- Struts 1.框架 (12)
- Spring框架 (9)
- hibernate框架 (6)
- web 综合 (15)
- Dwr (1)
- struts 2 (19)
- 设计模式 (0)
- lucene (6)
- oracle (3)
- linux (10)
- mysql (4)
- jquery (0)
- AJAX (1)
- javaScript (1)
- J2SE (4)
- IBATIS (3)
- JPA (1)
- Compass (3)
- 报表 (1)
- 任务调度 (1)
- tomcat (3)
- RMI (0)
- JMS (2)
- WebService (2)
- SOAP (0)
- XML (1)
- 多线程 (8)
- 缓存 (2)
- nginx (3)
- mongodb (1)
- ant打包 (0)
最新评论
-
iceman1952:
Hi 你觉得很好的那两三篇 百度文库的链接,能贴一下吗?
内网穿透&UDP打洞 -
ice86rain:
里面貌似没有用到Lucene
Struts2+Hibernate3.2+Spring2.5+Compass整合 -
sgq0085:
写得非常详细 好文章
JMS之ActiveMQ Linux下安装与应用实例 -
強顔歓笶:
JMS之ActiveMQ Linux下安装与应用实例 -
yixiandave:
forcer521 写道不指定所有子目录都在一起的话,这样用源 ...
linux下nginx稳定版1.6.2安装
要解析 HTML 页面,就要先对 HTML 中的标签做处理。
先准备几个工具类
接下来看看如何解析html页面
加入htmlparser.jar包
现在可以成功地把 HTML 解析为纯文本了
先准备几个工具类
package com.cs.parser.util;

import org.htmlparser.Node;

/**
 * Mutable holder that groups a text buffer, an integer counter and a node.
 *
 * <p>The class itself attaches no meaning to the three values; it only stores
 * them and hands them back through plain accessors.
 */
public class PageContent {

    /** Accumulated text; {@code null} until a buffer is assigned. */
    private StringBuffer textBuffer;

    /** Integer counter associated with the buffer; defaults to 0. */
    private int number;

    /** Node associated with this holder; {@code null} until assigned. */
    private Node node;

    /** @return the stored text buffer, possibly {@code null} */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /** @param textBuffer the buffer to store */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored counter value */
    public int getNumber() {
        return this.number;
    }

    /** @param number the counter value to store */
    public void setNumber(int number) {
        this.number = number;
    }

    /** @return the stored node, possibly {@code null} */
    public Node getNode() {
        return this.node;
    }

    /** @param node the node to store */
    public void setNode(Node node) {
        this.node = node;
    }
}
package com.cs.parser.util; public class TableValid { private int trnum; private int tdnum; private int linknum; private int textnum; private int scriptnum; public int getScriptnum() { return scriptnum; } public void setScriptnum(int scriptnum) { this.scriptnum = scriptnum; } public int getLinknum() { return linknum; } public void setLinknum(int linknum) { this.linknum = linknum; } public int getTdnum() { return tdnum; } public void setTdnum(int tdnum) { this.tdnum = tdnum; } public int getTextnum() { return textnum; } public void setTextnum(int textnum) { this.textnum = textnum; } public int getTrnum() { return trnum; } public void setTrnum(int trnum) { this.trnum = trnum; } }
package com.cs.parser.util; public class TableColumnValid { int tdNum; boolean valid; public int getTdNum() { return tdNum; } public void setTdNum(int tdNum) { this.tdNum = tdNum; } public boolean isValid() { return valid; } public void setValid(boolean valid) { this.valid = valid; } }
接下来看看如何解析html页面
加入htmlparser.jar包
package com.cs; public interface Parsable { public String getTitle() ; public String getContent() ; public String getSummary() ; }
package com.cs; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; import org.htmlparser.tags.Div; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.ParagraphTag; import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.SelectTag; import org.htmlparser.tags.Span; import org.htmlparser.tags.StyleTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.TableHeader; import org.htmlparser.tags.TableRow; import org.htmlparser.tags.TableTag; import org.htmlparser.tags.TitleTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.cs.parser.util.PageContent; import com.cs.parser.util.TableColumnValid; import com.cs.parser.util.TableValid; public class EasyHtmlParser implements Parsable { protected static final String lineSign = System.getProperty( "line.separator"); protected static final int lineSign_size = lineSign.length(); private File file ; private String content ; private String summary ; private String title ; public static void main(String[] args) { EasyHtmlParser eParser = new EasyHtmlParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm")) ; System.out.println("html content : "+eParser.getContent()) ; } public EasyHtmlParser(File file) { this.file = file ; } private String getString() { try { BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file))) ; String html = "" ; String str = null ; while ((str = br.readLine())!= null ) { html += str 
; } return html ; } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null ; } public synchronized String getContent() { if (content != null ) { return content ; } String html = this.getString() ; Parser parser = new Parser() ; try { parser.setInputHTML(html) ; for (NodeIterator e = parser.elements(); e.hasMoreNodes();){ Node node = (Node) e.nextNode(); PageContent context = new PageContent(); context.setNumber(0); context.setTextBuffer(new StringBuffer()); //抓取出内容 extractHtml(node, context, ""); StringBuffer testContext = context.getTextBuffer(); //System.out.println(testContext); content = testContext.toString() ; } if (content == null ) { content = "" ; } if (content.length() < 200) { summary = content ; }else { summary = content.substring(0,200) ; } NodeFilter filter = new NodeClassFilter(TitleTag.class) ; parser.reset() ; NodeList titleNodes = parser.extractAllNodesThatMatch(filter) ; if (titleNodes != null && titleNodes.elementAt(0) != null){ title = titleNodes.elementAt(0).toPlainTextString() ; }else{ title = "" ; } /* System.out.println(file.getAbsolutePath()+" "+"title:"+title); System.out.println(file.getAbsolutePath()+" "+"content:"+content); System.out.println(file.getAbsolutePath()+" "+"summary:"+summary); */ } catch (ParserException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return content; } public String getSummary() { if (summary != null) { return summary ; } if (content == null ) { getContent() ; } return summary; } public String getTitle() { if (title != null) { return title ; } if (content == null ) { getContent() ; } return ""; } protected List extractHtml(Node nodeP, PageContent pageContent, String siteUrl) throws Exception { NodeList nodeList = nodeP.getChildren(); boolean bl = false; if ((nodeList == null) || 
(nodeList.size() == 0)) { if (nodeP instanceof ParagraphTag) { ArrayList tableList = new ArrayList(); StringBuffer temp = new StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } return null; } if ((nodeP instanceof TableTag) || (nodeP instanceof Div)) { bl = true; } if (nodeP instanceof ParagraphTag) { ArrayList tableList = new ArrayList(); StringBuffer temp = new StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); extractParagraph(nodeP, siteUrl, tableList); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } ArrayList tableList = new ArrayList(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof LinkTag) { tableList.add(node); setLinkImg(node, siteUrl); } else if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } tableList.add(node); } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { if (node.getText().length() > 0) { StringBuffer temp = new StringBuffer(); String text = collapse(node.getText() .replaceAll(" ", "") .replaceAll(" ", "")); temp.append(text.trim()); tableList.add(temp); } } else { if (node instanceof TableTag || node instanceof Div) { TableValid tableValid = new TableValid(); isValidTable(node, tableValid); if (tableValid.getTrnum() > 2) { tableList.add(node); continue; } } List tempList = extractHtml(node, pageContent, siteUrl); if ((tempList != null) && (tempList.size() > 0)) { Iterator ti = tempList.iterator(); while (ti.hasNext()) { tableList.add(ti.next()); } } } } } catch (Exception e) { return null; } if 
((tableList != null) && (tableList.size() > 0)) { if (bl) { StringBuffer temp = new StringBuffer(); Iterator ti = tableList.iterator(); int wordSize = 0; StringBuffer node; int status = 0; StringBuffer lineStart = new StringBuffer( "<p style=\"TEXT-INDENT: 2em\">"); StringBuffer lineEnd = new StringBuffer("</p>" + lineSign); while (ti.hasNext()) { Object k = ti.next(); if (k instanceof LinkTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((LinkTag) k).toHtml()); temp.append(node); } else if (k instanceof ImageTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((ImageTag) k).toHtml()); temp.append(node); } else if (k instanceof TableTag) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((TableTag) k).toHtml()); temp.append(node); } else if (k instanceof Div) { if (status == 0) { temp.append(lineStart); status = 1; } node = new StringBuffer(((Div) k).toHtml()); temp.append(node); } else { node = (StringBuffer) k; if (status == 0) { if (node.indexOf("<p") < 0) { temp.append(lineStart); temp.append(node); wordSize = wordSize + node.length(); status = 1; } else { temp.append(node); status = 1; } } else if (status == 1) { if (node.indexOf("</p") < 0) { if (node.indexOf("<p") < 0) { temp.append(node); wordSize = wordSize + node.length(); } else { temp.append(lineEnd); temp.append(node); status = 1; } } else { temp.append(node); status = 0; } } } } if (status == 1) { temp.append(lineEnd); } if (wordSize > pageContent.getNumber()) { pageContent.setNumber(wordSize); pageContent.setTextBuffer(temp); } return null; } else { return tableList; } } return null; } /** * 提取段落中的内容 * @param nodeP * @param siteUrl * @param tableList * @return */ private List extractParagraph(Node nodeP, String siteUrl, List tableList) { NodeList nodeList = nodeP.getChildren(); if ((nodeList == null) || (nodeList.size() == 0)) { if (nodeP instanceof ParagraphTag) { StringBuffer temp = new 
StringBuffer(); temp.append("<p style=\"TEXT-INDENT: 2em\">"); tableList.add(temp); temp = new StringBuffer(); temp.append("</p>").append(lineSign); tableList.add(temp); return tableList; } return null; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof LinkTag) { tableList.add(node); setLinkImg(node, siteUrl); } else if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } tableList.add(node); } else if (node instanceof TextNode) { if (node.getText().trim().length() > 0) { String text = collapse(node.getText() .replaceAll(" ", "") .replaceAll(" ", "")); StringBuffer temp = new StringBuffer(); temp.append(text); tableList.add(temp); } } else if (node instanceof Span) { StringBuffer spanWord = new StringBuffer(); getSpanWord(node, spanWord); if ((spanWord != null) && (spanWord.length() > 0)) { String text = collapse(spanWord.toString() .replaceAll(" ", "") .replaceAll(" ", "")); StringBuffer temp = new StringBuffer(); temp.append(text); tableList.add(temp); } } else if (node instanceof TagNode) { String tag = node.toHtml(); if (tag.length() <= 10) { tag = tag.toLowerCase(); if ((tag.indexOf("strong") >= 0) || (tag.indexOf("b") >= 0)) { StringBuffer temp = new StringBuffer(); temp.append(tag); tableList.add(temp); } } else { if (node instanceof TableTag || node instanceof Div) { TableValid tableValid = new TableValid(); isValidTable(node, tableValid); if (tableValid.getTrnum() > 2) { tableList.add(node); continue; } } extractParagraph(node, siteUrl, tableList); } } } } catch (Exception e) { return null; } return tableList; } protected void getSpanWord(Node nodeP, StringBuffer spanWord) { NodeList nodeList = nodeP.getChildren(); try { for 
(NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { } else if (node instanceof TextNode) { spanWord.append(node.getText()); } else if (node instanceof Span) { getSpanWord(node, spanWord); } else if (node instanceof ParagraphTag) { getSpanWord(node, spanWord); } else if (node instanceof TagNode) { String tag = node.toHtml().toLowerCase(); if (tag.length() <= 10) { if ((tag.indexOf("strong") >= 0) || (tag.indexOf("b") >= 0)) { spanWord.append(tag); } } } } } catch (Exception e) { } return; } /** * 判断TABLE是否是表单 * @param nodeP * @return */ private void isValidTable(Node nodeP, TableValid tableValid) { NodeList nodeList = nodeP.getChildren(); /**如果该表单没有子节点则返回**/ if ((nodeList == null) || (nodeList.size() == 0)) { return; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); /**如果子节点本身也是表单则返回**/ if (node instanceof TableTag || node instanceof Div) { return; } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { return; } else if (node instanceof TableColumn) { return; } else if (node instanceof TableRow) { TableColumnValid tcValid = new TableColumnValid(); tcValid.setValid(true); findTD(node, tcValid); if (tcValid.isValid()) { if (tcValid.getTdNum() < 2) { if (tableValid.getTdnum() > 0) { return; } else { continue; } } else { if (tableValid.getTdnum() == 0) { tableValid.setTdnum(tcValid.getTdNum()); tableValid.setTrnum(tableValid.getTrnum() + 1); } else { if (tableValid.getTdnum() == tcValid.getTdNum()) { tableValid.setTrnum(tableValid.getTrnum() + 1); } else { return; } } } } } else { isValidTable(node, tableValid); } } } catch (Exception e) { return; } return; } /** * 判断是否有效TR * @param nodeP * @param TcValid * @return */ private void findTD(Node nodeP, TableColumnValid tcValid) { NodeList nodeList = nodeP.getChildren(); /**如果该表单没有子节点则返回**/ if 
((nodeList == null) || (nodeList.size() == 0)) { return; } try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); /**如果有嵌套表单**/ if (node instanceof TableTag || node instanceof Div || node instanceof TableRow || node instanceof TableHeader) { tcValid.setValid(false); return; } else if (node instanceof ScriptTag || node instanceof StyleTag || node instanceof SelectTag) { tcValid.setValid(false); return; } else if (node instanceof TableColumn) { tcValid.setTdNum(tcValid.getTdNum() + 1); } else { findTD(node, tcValid); } } } catch (Exception e) { tcValid.setValid(false); return; } return; } protected String collapse(String string) { int chars; int length; int state; char character; StringBuffer buffer = new StringBuffer(); chars = string.length(); if (0 != chars) { length = buffer.length(); state = ((0 == length) || (buffer.charAt(length - 1) == ' ') || ((lineSign_size <= length) && buffer.substring(length - lineSign_size, length).equals(lineSign))) ? 0 : 1; for (int i = 0; i < chars; i++) { character = string.charAt(i); switch (character) { case '\u0020': case '\u0009': case '\u000C': case '\u200B': case '\u00a0': case '\r': case '\n': if (0 != state) { state = 1; } break; default: if (1 == state) { buffer.append(' '); } state = 2; buffer.append(character); } } } return buffer.toString(); } /** * 设置图象连接 * @param nodeP * @param siteUrl */ private void setLinkImg(Node nodeP, String siteUrl) { NodeList nodeList = nodeP.getChildren(); try { for (NodeIterator e = nodeList.elements(); e.hasMoreNodes();) { Node node = (Node) e.nextNode(); if (node instanceof ImageTag) { ImageTag img = (ImageTag) node; if (img.getImageURL().toLowerCase().indexOf("http://") < 0) { img.setImageURL(siteUrl + img.getImageURL()); } else { img.setImageURL(img.getImageURL()); } } } } catch (Exception e) { return; } return; } }
现在可以成功地把 HTML 解析为纯文本了
发表评论
-
lucene入门到项目开发
2010-10-10 15:53 3366加入jar包 lucene-core-2.4.0.jar ... -
lucene根据文件类型自动解析的工厂类
2010-10-10 15:22 1453阅读本章之前 请先参考其他几篇解析各类文档的章节 http: ... -
lucene如何解析Doc文档
2010-10-10 15:11 1770加入poi-scratchpad-3.0.2-FINAL-20 ... -
lucene如何解析pdf文档
2010-10-10 15:04 2473XPDF使用文档 XPDF版本 3.0.2 日期 2008-1 ... -
lucene如何解析PPT文档
2010-10-10 14:52 2343加入jar包(poi-3.0.2-FINAL-20080204 ...
相关推荐
T型三电平+SVPWM的下垂控制与双闭环中点电位平衡控制.pdf
STM32真实企业级项目:锅炉控制器源码、原理图与PCB图.pdf
STM32F103 Modbus主站源码:正常使役,支持多从机功能码通信及从机寄存器写入.pdf
Simulink永磁同步直驱风机PMSG一次调频离散模型:含虚拟惯性与下垂控制,可扩展至光伏储能研究.pdf
VSG仿真、并网与离网运行仿真、预同期并网控制及虚拟同步机逆变器仿真.pdf
VIC水文模型全程视频教学指导.pdf
vrep_coppeliasim+matlab机器人轨迹控制仿真:利用matlab读取轨迹并控制机械臂在墙上绘图的详细学习示例.pdf
2000-2022年上市公司行业异质性数据(技术密集型、劳动密集型、资本密集型)(含原始数据和处理代码) 1、时间:2000-2022年 2、指标:股票代码、年份、股票简称、统计日期、行业名称、行业代码、成立日期、上市日期、所在省份、所在城市、上市状态、保留两位行业代码、保留一位行业代码、高科技为1,非高科技为0、重污染为1,非重污染为0、制造业为1,非制造业为0、劳动密集型为1,资本密集型为2,技术密集型为3 3、来源:csmar 4、根据2012年中国证监会行业划分是否高科技、是否重污染、是否制造业、是否劳动密集型、资本密集型、技术密集型。 5、内容:包括原始数据、处理代码和计算结果
TMS320F28335电机控制程序:BLDC、PMSM无感有感及异步VF程序源代码与开发资料大全.pdf
tc275、s12x、s32k144基于CANoe的UDS诊断数据库CDD文件及CAPL Boot上位机、下位机程序移植说明文档.pdf
STM32系列通信透传技术:以太网、串口、CAN透传及OBD协议解析.pdf
STM32开发:IIR带阻滤波器设计与实现.pdf
UG后处理:CNC西门子828D后处理与西门子后处理工厂实战自用.pdf
MYSQL深入学习总结.pdf
Stewart六自由度平台反解算法 C#.pdf
1、文件说明: Centos8操作系统vim-ale-3.3.0-1.el8.rpm以及相关依赖,全打包为一个tar.gz压缩包 2、安装指令: #Step1、解压 tar -zxvf vim-ale-3.3.0-1.el8.tar.gz #Step2、进入解压后的目录,执行安装 sudo rpm -ivh *.rpm
tc275、s12x和s32k144的Boot程序及UDS故障诊断与Bootloader移植的Python自制上位机源码.pdf
SSA-CNN-LSTM时间序列预测(Matlab)_ 麻雀算法优化卷积长短期记忆网络.pdf
UI篇:C#工控上位机Chart控件实现与展示.pdf
SRM12-8开关磁阻电机,功率2200w,额定转速3450rpm.pdf