`
fdqzq
  • 浏览: 8568 次
  • 性别: Icon_minigender_1
  • 来自: 厦门
最近访客 更多访客>>
文章分类
社区版块
存档分类
最新评论

cyberneko和dom4j解析html

阅读更多
闲着无聊,用 cyberneko 和 dom4j 写了个解析 HTML 页面中图片的小程序,主要代码如下:
	/**
	 * Demo entry point: parses the iteye.com home page, collects the
	 * {@code src} attribute of every {@code IMG} element and hands the
	 * resulting addresses to {@code insertPhoto} together with a connection
	 * obtained from {@code getConn()}.
	 */
	public void testPaseHtml() {
		final String pageUrl = "http://www.iteye.com/";
		// Selects the src attribute of every IMG element; the "xmlns" prefix
		// is bound to the XHTML namespace inside getAttr.
		final String imgSrcXPath = "//xmlns:IMG/@src";

		Document page = this.getDoc(pageUrl);
		List srcAttributes = this.getAttr(page, imgSrcXPath);
		String[] photoUrl = this.getUrl(srcAttributes);

		// The connection is requested only after the page has been processed.
		this.insertPhoto(this.getConn(), photoUrl);
	}
	/**
	 * Evaluates an XPath expression against a dom4j document.
	 * <p>
	 * Before evaluation the prefix {@code xmlns} is bound to the XHTML
	 * namespace ({@code http://www.w3.org/1999/xhtml}), so expressions can use
	 * that prefix to address XHTML elements.
	 *
	 * @param document the document to query
	 * @param gz the XPath expression to evaluate
	 * @return the nodes selected by the expression
	 */
	public List getAttr(Document document, String gz) {
		Map prefixToUri = new HashMap();
		prefixToUri.put("xmlns", "http://www.w3.org/1999/xhtml");

		XPath expression = new DefaultXPath(gz);
		expression.setNamespaceContext(new SimpleNamespaceContext(prefixToUri));
		return expression.selectNodes(document);
	}
	/**
	 * Parses the document located at {@code url} with {@code DOMParser} and
	 * converts the resulting W3C DOM tree into a dom4j document.
	 *
	 * @param url the address (system identifier) of the page to parse
	 * @return the parsed page as a dom4j document
	 * @throws IllegalStateException if the page cannot be read or parsed; the
	 *         underlying {@code IOException} or {@code SAXException} is
	 *         attached as the cause
	 */
	public Document getDoc(String url) {
		DOMParser parser = new DOMParser();
		try {
			parser.parse(url);
		} catch (SAXException e) {
			// Fail here with the real cause instead of continuing with a
			// parser that holds no usable document.
			throw new IllegalStateException("Failed to parse HTML from " + url, e);
		} catch (IOException e) {
			throw new IllegalStateException("Failed to read HTML from " + url, e);
		}

		// Fully qualified: the simple name Document refers to the dom4j type here.
		org.w3c.dom.Document w3cDocument = parser.getDocument();

		DOMReader domReader = new DOMReader();
		return domReader.read(w3cDocument);
	}
/**
 * Extracts the values of a list of dom4j attributes, typically the
 * {@code src} attributes of image elements.
 *
 * @param nodes list whose elements are all {@code Attribute} instances;
 *              may be {@code null}
 * @return the attribute values in list order, or {@code null} when
 *         {@code nodes} is {@code null}
 * @throws ClassCastException if an element is not an {@code Attribute}
 */
public String[] getUrl(List nodes) {
	if (nodes == null) {
		return null;
	}

	Iterator cursor = nodes.iterator();
	String[] values = new String[nodes.size()];
	int next = 0;
	while (cursor.hasNext()) {
		Attribute current = (Attribute) cursor.next();
		values[next] = current.getValue();
		next++;
	}
	return values;
}
	/**
	 * Opens a stream for reading the resource at {@code path}.
	 * <p>
	 * Failures are printed to standard error and signalled by a {@code null}
	 * return value instead of an exception. The caller is responsible for
	 * closing the returned stream.
	 *
	 * @param path the URL of the resource (for example an image), as a string
	 * @return an open stream on the resource, or {@code null} if {@code path}
	 *         is not a valid URL or the resource cannot be opened
	 */
	public InputStream getInput(String path) {
		try {
			URL url = new URL(path);
			// The connection is only read from, so output mode (which is meant
			// for sending a request body) is deliberately not enabled.
			URLConnection conn = url.openConnection();
			return conn.getInputStream();
		} catch (IOException e) {
			// Also covers MalformedURLException, which is an IOException.
			e.printStackTrace();
			return null;
		}
	}
	/**
	 * Downloads the images named by {@code photoUrl} and inserts them into the
	 * {@code photo} table as one batch inside a single transaction.
	 * <p>
	 * Each entry is resolved against {@code http://www.iteye.com/}, so both
	 * site-relative paths and absolute URLs are accepted. Empty entries,
	 * entries that do not form a valid URL, and images whose stream cannot be
	 * opened are skipped. Every image is read completely into memory before it
	 * is bound, so the stored bytes are the whole image and no network stream
	 * has to stay open until the batch is executed.
	 * <p>
	 * If a download or database error occurs, it is printed to standard error
	 * and the transaction is rolled back. The statement and the connection are
	 * always closed before this method returns.
	 *
	 * @param conn an open database connection; this method closes it
	 * @param photoUrl image locations; may be {@code null} or empty
	 */
	public void insertPhoto(Connection conn, String[] photoUrl) {
		String sql = "insert into photo set photo = ?";
		PreparedStatement ps = null;
		boolean committed = false;
		try {
			conn.setAutoCommit(false);
			ps = conn.prepareStatement(sql);
			if (photoUrl != null) {
				for (int i = 0; i < photoUrl.length; i++) {
					byte[] image = this.readPhotoBytes(photoUrl[i]);
					if (image == null) {
						continue;
					}
					// Bind an in-memory stream with the exact length: it stays
					// readable until executeBatch() runs.
					ps.setBinaryStream(1, new java.io.ByteArrayInputStream(image), image.length);
					ps.addBatch();
				}
			}
			ps.executeBatch();
			conn.commit();
			committed = true;
		} catch (SQLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (ps != null) {
				try {
					ps.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
			if (conn != null) {
				try {
					if (!committed) {
						// Do not leave a half-finished transaction to the driver's
						// close-time behavior.
						conn.rollback();
					}
				} catch (SQLException e) {
					e.printStackTrace();
				}
				try {
					conn.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Reads the complete content of one image.
	 *
	 * @param src image location, relative to the site root or absolute
	 * @return the image bytes, or {@code null} if {@code src} is empty, is not
	 *         a valid URL, or its stream cannot be opened
	 * @throws IOException if reading the opened stream fails
	 */
	private byte[] readPhotoBytes(String src) throws IOException {
		if (src == null || "".equals(src)) {
			return null;
		}
		String absolute;
		try {
			// Resolving (rather than concatenating) keeps absolute src values intact.
			absolute = new URL(new URL("http://www.iteye.com/"), src).toString();
		} catch (MalformedURLException e) {
			e.printStackTrace();
			return null;
		}
		InputStream in = this.getInput(absolute);
		if (in == null) {
			return null;
		}
		try {
			java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
			byte[] chunk = new byte[8192];
			int read;
			// available() is only an estimate, so read until end of stream.
			while ((read = in.read(chunk)) != -1) {
				buffer.write(chunk, 0, read);
			}
			return buffer.toByteArray();
		} finally {
			in.close();
		}
	}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics