纯干货,你懂的,各位看官直接看代码:
package com.yida.spider4j.crawler.utils.xml;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.namespace.QName;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import com.yida.spider4j.crawler.utils.common.GerneralUtils;

/**
 * Utility class for common XML operations on {@code org.w3c.dom} documents:
 * parsing, serializing and XPath-based extraction of nodes, text and attributes.
 *
 * <p>Obtain the shared instance through {@link #getInstance()}.
 *
 * <p>NOTE(review): the single {@link DocumentBuilder} and {@link XPath} held by
 * the shared instance are used without any synchronization here. JAXP does not
 * document those objects as thread-safe, so confirm how callers use this class
 * from multiple threads.
 *
 * @since 1.0
 * @author Lanxiaowei@citic-finance.com
 * @date 2015-6-16 15:39:10
 */
public class XMLUtils {
    /** Message shared by every XPath helper when compiling or evaluating fails. */
    private static final String XPATH_ERROR_MESSAGE =
            "Compile xpath expression occur excetion,please check out your xpath expression.";

    /** Parser features switched off so external entities and DTDs are not fetched. */
    private static final String[] EXTERNAL_ENTITY_FEATURES = {
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
        "http://apache.org/xml/features/nonvalidating/load-external-dtd"
    };

    private final DocumentBuilder builder;
    private final XPath xpath;

    private XMLUtils() {
        this.builder = createDocumentBuilder();
        this.xpath = XPathFactory.newInstance().newXPath();
    }

    /** Holder class so the shared instance is created on first use of {@link #getInstance()}. */
    private static class SingletonHolder {
        private static final XMLUtils INSTANCE = new XMLUtils();
    }

    /**
     * Returns the shared {@code XMLUtils} instance.
     *
     * @return the singleton instance
     */
    public static final XMLUtils getInstance() {
        return SingletonHolder.INSTANCE;
    }

    /**
     * Creates the non-validating, comment-ignoring parser used by all parse methods.
     *
     * @return a configured {@link DocumentBuilder}
     * @throws RuntimeException if the parser cannot be created
     */
    private static DocumentBuilder createDocumentBuilder() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setValidating(false);
        factory.setIgnoringComments(true);
        // The parsed content may come from crawled (untrusted) sources, so stop
        // the parser from resolving external entities / DTDs (XXE).
        for (int i = 0; i < EXTERNAL_ENTITY_FEATURES.length; i++) {
            disableFeatureIfSupported(factory, EXTERNAL_ENTITY_FEATURES[i]);
        }
        try {
            return factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(
                    "Create DocumentBuilder instance occur one exception.", e);
        }
    }

    /** Turns a parser feature off, tolerating parsers that do not know the feature. */
    private static void disableFeatureIfSupported(DocumentBuilderFactory factory, String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (ParserConfigurationException ignored) {
            // Best effort only: an unsupported hardening feature must not prevent
            // the parser from being created.
        }
    }

    /**
     * Serializes a W3C {@link Document} to an XML string.
     *
     * @param doc the document to serialize
     * @return the XML text produced by the default {@link Transformer}
     * @throws RuntimeException if the transformation fails
     */
    public String document2String(Document doc) {
        StringWriter writer = new StringWriter();
        try {
            Transformer transformer = TransformerFactory.newInstance().newTransformer();
            transformer.transform(new DOMSource(doc), new StreamResult(writer));
        } catch (TransformerException e) {
            throw new RuntimeException(
                    "Transformer org.w3c.dom.document object occur one exception.", e);
        }
        return writer.toString();
    }

    /**
     * Parses the XML document located at the given path.
     *
     * <p>The value is handed to {@link DocumentBuilder#parse(String)}, which
     * interprets it as a URI.
     *
     * @param path location of the XML document
     * @return the parsed document
     * @throws RuntimeException if the document cannot be read or parsed
     */
    public Document parseDocument(String path) {
        try {
            return builder.parse(path);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The xml path is invalid or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The xml path is invalid or parsing xml occur exception.", e);
        }
    }

    /**
     * Parses the given XML file.
     *
     * @param file the XML file
     * @return the parsed document
     * @throws RuntimeException if the file cannot be read or parsed
     */
    public Document parseDocument(File file) {
        try {
            return builder.parse(file);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The input xml file is null or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The input xml file is null or parsing xml occur exception.", e);
        }
    }

    /**
     * Parses XML read from the given input stream. The stream is not closed here.
     *
     * @param is stream supplying the XML content
     * @return the parsed document
     * @throws RuntimeException if the stream cannot be read or parsed
     */
    public Document parseDocument(InputStream is) {
        try {
            return builder.parse(is);
        } catch (SAXException e) {
            throw new RuntimeException(
                    "The input xml fileInputStream is null or parsing xml occur exception.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "The input xml fileInputStream is null or parsing xml occur exception.", e);
        }
    }

    /**
     * Converts a markup fragment held in a string into a {@link Document}.
     *
     * <p>The fragment goes through the XML parser, so it must be well-formed XML.
     *
     * @param fragment the markup text to parse
     * @return the parsed document
     * @throws RuntimeException if the fragment cannot be parsed
     */
    public Document fragment2Document(String fragment) {
        try {
            return builder.parse(new InputSource(new StringReader(fragment)));
        } catch (SAXException e) {
            throw new RuntimeException(
                    "parse fragment to document occur SAXException,please check your fragment.", e);
        } catch (IOException e) {
            throw new RuntimeException(
                    "parse fragment to document occur one IOException.", e);
        }
    }

    /**
     * Selects all nodes matched by an XPath expression.
     *
     * @param node       context node the expression is evaluated against
     * @param expression XPath expression
     * @return the matched nodes
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public NodeList selectNodes(Node node, String expression) {
        return (NodeList) evaluate(node, expression, XPathConstants.NODESET);
    }

    /**
     * Selects a single node matched by an XPath expression.
     *
     * @param node       context node the expression is evaluated against
     * @param expression XPath expression
     * @return the matched node as produced by an {@code XPathConstants.NODE} evaluation
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public Node selectSingleNode(Node node, String expression) {
        return (Node) evaluate(node, expression, XPathConstants.NODE);
    }

    /**
     * Evaluates an XPath expression as a string, i.e. yields the text value of
     * the first matched node only.
     *
     * @param node       context node the expression is evaluated against
     * @param expression XPath expression
     * @return the string value of the expression
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public String getNodeText(Node node, String expression) {
        return (String) evaluate(node, expression, XPathConstants.STRING);
    }

    /**
     * Collects the text content of every node matched by an XPath expression.
     *
     * @param node       context node the expression is evaluated against
     * @param expression XPath expression
     * @return the text content of each matched node in document order, or
     *         {@code null} when nothing matched
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public List<String> getMultiNodeText(Node node, String expression) {
        NodeList matches = selectNodes(node, expression);
        if (null == matches || matches.getLength() == 0) {
            return null;
        }
        List<String> texts = new ArrayList<String>(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            texts.add(matches.item(i).getTextContent());
        }
        return texts;
    }

    /**
     * Reads an attribute value from the first node matched by an XPath expression.
     *
     * @param node          context node the expression is evaluated against
     * @param expression    XPath expression
     * @param atrributeName name of the attribute to read
     * @return the attribute value, or {@code null} when no node matched or the
     *         matched node does not carry that attribute
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public String getNodeAttributeValue(Node node, String expression,
            String atrributeName) {
        Node matchNode = selectSingleNode(node, expression);
        if (null == matchNode) {
            return null;
        }
        return attributeValue(matchNode, atrributeName);
    }

    /**
     * Reads an attribute value from every node matched by an XPath expression.
     * Matched nodes that do not carry the attribute are skipped.
     *
     * @param node          context node the expression is evaluated against
     * @param expression    XPath expression, e.g. {@code //employee/name[@id]}
     * @param atrributeName name of the attribute to read
     * @return the attribute values in document order, or {@code null} when
     *         nothing matched
     * @throws RuntimeException if the expression cannot be compiled or evaluated
     */
    public List<String> getMultiNodeAttributeValue(Node node, String expression,
            String atrributeName) {
        NodeList matches = selectNodes(node, expression);
        if (null == matches || matches.getLength() == 0) {
            return null;
        }
        List<String> values = new ArrayList<String>(matches.getLength());
        for (int i = 0; i < matches.getLength(); i++) {
            Node attNode = attributeNode(matches.item(i), atrributeName);
            if (null != attNode) {
                values.add(attNode.getNodeValue());
            }
        }
        return values;
    }

    /** Compiles and evaluates an XPath expression, wrapping failures with their cause. */
    private Object evaluate(Node node, String expression, QName returnType) {
        try {
            XPathExpression compiled = this.xpath.compile(expression);
            return compiled.evaluate(node, returnType);
        } catch (XPathExpressionException e) {
            throw new RuntimeException(XPATH_ERROR_MESSAGE, e);
        }
    }

    /** Looks up a named attribute node; {@code null} when the node has no such attribute. */
    private static Node attributeNode(Node owner, String attributeName) {
        // getAttributes() is null for every node type except Element.
        NamedNodeMap attributes = owner.getAttributes();
        return null == attributes ? null : attributes.getNamedItem(attributeName);
    }

    /** Returns the value of a named attribute, or {@code null} when it is absent. */
    private static String attributeValue(Node owner, String attributeName) {
        Node attNode = attributeNode(owner, attributeName);
        return null == attNode ? null : attNode.getNodeValue();
    }

    /**
     * Small demo: parses an in-memory fragment and prints the text and the
     * {@code id} attribute of every {@code name} element that has an id.
     */
    public static void main(String[] args) throws ParserConfigurationException,
            SAXException, IOException {
        String fragment = "<data><employee><name id=\"1\">益达</name><name id=\"2\">yida</name>"
                + "<title>Manager</title></employee></data>";

        XMLUtils util = XMLUtils.getInstance();
        Document doc = util.fragment2Document(fragment);

        List<String> strList = util.getMultiNodeText(doc, "//employee/name[@id]");
        String s = GerneralUtils.joinCollection(strList);
        System.out.println(s);

        strList = util.getMultiNodeAttributeValue(doc, "//employee/name[@id]", "id");
        s = GerneralUtils.joinCollection(strList);
        System.out.println(s);
    }
}
注意:这里说的 Document 指的都是 org.w3c.dom.Document,而不是 JDOM、DOM4J 或 Jsoup 里的 Document。org.w3c.dom.Document 是 JDK 原生对象。
相关推荐
在Java中,我们可以使用`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`来解析XML文档,然后遍历DOM树,将其节点转化为Map。例如,每个XML元素的标签名作为键,元素的文本内容作为值。如果元素...
* 本类是专门解析XML文件的,主要用于为系统读取自己的配置文件时提供最方便的解析操作 * @author HX * */ public class XmlManager { /** * 得到某节点下某个属性的值 * @param element 要获取属性的...
本文将详细介绍如何创建一个Java通用的XML解析工具类,以便于在项目中复用和简化XML处理。 首先,我们来看DOM解析方式。DOM解析器会将整个XML文档加载到内存中,形成一棵树形结构,便于对XML进行随机访问。以下是一...
本文将详细介绍如何使用工具类进行XML与实体类的转换,并探讨相关依赖和实现方法。 首先,XML转换为Java实体类的基本原理是通过解析XML文档,创建对应的Java对象。Java中常用的库有JAXB(Java Architecture for XML...
在Java中,`javax.xml.parsers.DocumentBuilderFactory` 和 `org.w3c.dom.Document` 是使用DOM解析XML的主要类。 2. **SAX解析器** SAX(Simple API for XML)是一种基于事件驱动的解析方式,它不将整个XML文档...
3. **XML工具类**: 处理XML文档通常涉及到`javax.xml.parsers`和`org.w3c.dom`等包。`DocumentBuilderFactory`和`DocumentBuilder`用于解析XML文档,生成`Document`对象,然后通过`NodeList`, `Element`, `Attr`等...
DOMUtil是一个处理XML文档的工具类,它基于DOM(Document Object Model)模型,允许程序对XML文档进行读取、解析、修改和生成。DOM模型将XML文档转化为一个树形结构,每个节点代表XML文档的一部分,方便程序进行操作...
这篇博客“XML工具类”可能详细讲解了如何在Java中创建和操作XML文档的实用方法。 在Java中,有多种方式可以解析XML,包括DOM(Document Object Model)、SAX(Simple API for XML)和StAX(Streaming API for XML...
Java中的`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`类可以用来创建和操作DOM对象。 - SAX解析:SAX(Simple API for XML)是一种事件驱动的解析器,适用于处理大型XML文件。`org.xml.sax...
Java中的`javax.xml.parsers.DocumentBuilderFactory` 和 `org.w3c.dom.Document` 类可用于创建和操作DOM对象。 2. **SAX解析器(Simple API for XML)**:相对于DOM,SAX解析器是事件驱动的,它在解析XML时触发一...
例如,如果这个工具是用Java编写的,那么可能使用了`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`等类来创建和操作XML文档。如果是Python,可能涉及了`xml.etree.ElementTree`模块。了解这些...
SchemaFactory schemaFactory = SchemaFactory.newInstance(XMLConstants.W3C_XML_SCHEMA_NS_URI); Schema schema = schemaFactory.newSchema(new File(xsdFile)); Validator validator = schema.newValidator();...
接着,通过`parse()`方法解析XML字符串或输入源,返回一个`org.w3c.dom.Document`对象。要将XML文档转换回字符串,可以利用`TransformerFactory`和`Transformer`,设置输出编码,如`"GB2312"`,并使用`transform()`...
本文将深入探讨“XML解析读取通用类”的相关知识点,以帮助开发者更好地理解和处理XML文档。 首先,我们需要理解XML的基本结构。XML文档由元素(Element)、属性(Attribute)、文本内容(Text Content)等组成。...
`XmlDocument`遵循W3C的DOM(文档对象模型)标准,使开发者能够以结构化的方式访问XML文档的各个部分。 DOM是一种将XML文档转换为内存中对象树的接口,每个XML元素、属性、文本节点等都有对应的DOM对象。`Xml...
Java中常用的DOM解析库包括`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`。`dom4j`是一个流行的第三方DOM解析库,提供更方便的API。 2. **SAX解析器**:Simple API for XML (SAX) 是事件...
`XmlDocument`遵循W3C DOM(Document Object Model)标准,提供了一种结构化的表示方式来访问和修改XML文档的节点。DOM模型将XML文档视为一棵树,其中每个元素、属性、文本等都是树的一个节点。通过`XmlDocument`,...
`XmlDocument`类是.NET Framework提供的一种强大工具,它遵循W3C制定的XML DOM(文档对象模型)标准,允许程序员在内存中创建、修改和处理XML文档。然而,由于`XmlDocument`将整个文档加载到内存中,所以当处理大型...
- 使用DOM解析器:Java的`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`接口用于构建DOM树。`DocumentBuilder.parse()`方法读取XML,`Transformer.transform()`方法进行写入。 - 使用SAX...
例如,使用`javax.xml.parsers.DocumentBuilderFactory`和`org.w3c.dom.Document`接口进行解析。 2. SAX:SAX是一种事件驱动的解析器,只在需要时读取XML,按顺序触发事件,适合处理大型XML。Java中的`org.xml.sax....