正则表达式解析HTML,code -

lewking

浏览: 9797 次
性别:
来自: 西安

最近访客 更多访客>>

pengjj2

ktylin

luzhenkun

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

正则表达式解析HTML,code

博客分类：

code

/**
 * Mutable holder for one parsed HTML element: its tag name, a counter,
 * its text content and its attributes.
 *
 * <p>All fields start out unset ({@code null} / {@code 0}) and are filled
 * in through the setters; no validation or copying is performed.
 *
 * @param <T> type of the attribute values
 */
public class HTMLBean<T> {

	/** Element (tag) name. */
	private String eleName;

	/**
	 * Counter associated with the element. {@code HtmlUtil.getHTMLBean}
	 * stores the number of attribute matches it found here.
	 */
	private int eleCount;

	/** Element value, i.e. the text captured between the tags. */
	private String result;

	/** Element attributes, keyed by attribute name. */
	private Map<String, T> attribute;

	/** @return the element (tag) name, or {@code null} if never set */
	public String getEleName() {
		return this.eleName;
	}

	/** @param eleName the element (tag) name to store */
	public void setEleName(String eleName) {
		this.eleName = eleName;
	}

	/** @return the stored counter, {@code 0} if never set */
	public int getEleCount() {
		return this.eleCount;
	}

	/** @param eleCount the counter value to store */
	public void setEleCount(int eleCount) {
		this.eleCount = eleCount;
	}

	/** @return the element value, or {@code null} if never set */
	public String getResult() {
		return this.result;
	}

	/** @param result the element value to store */
	public void setResult(String result) {
		this.result = result;
	}

	/** @return the very map instance handed to the setter, or {@code null} if never set */
	public Map<String, T> getAttribute() {
		return this.attribute;
	}

	/** @param attribute the attribute map to store (kept by reference, not copied) */
	public void setAttribute(Map<String, T> attribute) {
		this.attribute = attribute;
	}
}

/**
 * Callback that accepts or rejects a parsed {@link HTMLBean}.
 *
 * @param <T> type of the attribute values carried by the bean
 */
public interface ElementFilter<T> {

	/**
	 * Decides whether the given bean is accepted.
	 *
	 * @param bean the parsed element to inspect
	 * @return the verdict; {@code HtmlUtil.getHTMLBeanList} keeps the bean
	 *         only when this returns {@code true}
	 */
	boolean filter(HTMLBean<T> bean);
	
}

/**
 * Regex-based helpers that extract a single HTML element (tag name, inner
 * content and attributes) from a piece of text.
 *
 * <p>This is a pattern match, not an HTML parser. Only elements of the form
 * {@code <name attr=...>content</name>} are recognised: the tag name must be
 * followed by whitespace, so an attribute-less {@code <p>text</p>} is not
 * matched, and nested elements of the same name are not balanced.
 */
public class HtmlUtil {

	/**
	 * Element pattern: group 1 is the tag name, group 2 the raw attribute
	 * text (absent when the tag has none), group 3 the content up to the
	 * first matching closing tag.
	 */
	private static final Pattern ELEMENT_PATTERN =
			Pattern.compile("<(\\w+)\\s+(\\w+\\s*=[^>]+)?>(.*?)</\\1>", Pattern.DOTALL);

	/**
	 * Attribute pattern: group 1 is the name; the value is in group 2
	 * (double-quoted), group 3 (single-quoted) or group 4 (unquoted).
	 * Quoted values may contain whitespace. An unquoted value stops at
	 * whitespace or a quote character and may be empty.
	 */
	private static final Pattern ATTRIBUTE_PATTERN =
			Pattern.compile("(\\w+)\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]*))");

	/**
	 * Extracts the first element found in {@code content}.
	 *
	 * <p>The returned bean carries the tag name, the inner content as its
	 * result, a map of attribute name to value (quotes removed; a repeated
	 * name keeps its last value) and, as its count, the number of attribute
	 * matches.
	 *
	 * @param content text to search; may be {@code null}
	 * @return the parsed element, or {@code null} when {@code content} is
	 *         {@code null} or contains no matching element
	 */
	public static HTMLBean<String> getHTMLBean(String content) {
		if (content == null) {
			return null;
		}
		Matcher element = ELEMENT_PATTERN.matcher(content);
		if (!element.find()) {
			return null;
		}

		HTMLBean<String> bean = new HTMLBean<String>();
		bean.setEleName(element.group(1));
		bean.setResult(element.group(3));

		Map<String, String> attrMap = new HashMap<String, String>();
		int len = 0;
		// Group 2 is optional in the element pattern, so it can be null.
		String rawAttributes = element.group(2);
		if (rawAttributes != null) {
			Matcher attr = ATTRIBUTE_PATTERN.matcher(rawAttributes);
			while (attr.find()) {
				attrMap.put(attr.group(1), attributeValue(attr));
				len++;
			}
		}
		bean.setAttribute(attrMap);
		bean.setEleCount(len);
		return bean;
	}

	/** Returns the value from whichever alternative of the attribute pattern matched. */
	private static String attributeValue(Matcher attr) {
		for (int group = 2; group <= 4; group++) {
			String value = attr.group(group);
			if (value != null) {
				return value;
			}
		}
		return "";
	}

	/**
	 * Reads the file at {@code path} as UTF-8, parses each line on its own
	 * with {@link #getHTMLBean(String)} and collects the beans accepted by
	 * {@code filter}. Because lines are parsed independently, an element
	 * spanning several lines is not found.
	 *
	 * @param path   path of the file to read
	 * @param filter decides which parsed elements are kept
	 * @return the accepted elements in file order; never {@code null}
	 * @throws IOException if the file cannot be opened or read
	 */
	public static List<HTMLBean<String>> getHTMLBeanList(String path,
			ElementFilter<String> filter) throws IOException {
		List<HTMLBean<String>> link = new LinkedList<HTMLBean<String>>();
		InputStream fs = new FileInputStream(path);
		try {
			BufferedReader br = new BufferedReader(new InputStreamReader(fs, "UTF-8"));
			String r;
			while ((r = br.readLine()) != null) {
				HTMLBean<String> bean = getHTMLBean(r);
				if (bean != null && filter.filter(bean)) {
					link.add(bean);
				}
			}
		} finally {
			// The readers only wrap fs, so closing it releases the file handle
			// even when reading or filtering throws.
			fs.close();
		}
		return link;
	}

	/**
	 * Manual smoke test: parses a file at a hard-coded local path, keeps the
	 * {@code a} elements and prints their content and attributes.
	 */
	@Test
	public void getHTMLBeanContentList_test() throws IOException {
		String p = "D:\\Users\\lewking\\Desktop\\test.html";
		List<HTMLBean<String>> link =
				getHTMLBeanList(p, new ElementFilter<String>() {
					@Override
					public boolean filter(HTMLBean<String> bean) {
						// Keep only A (anchor) tags.
						return "a".equalsIgnoreCase(bean.getEleName());
					}
				});

		System.out.println("解析完成..............");
		for (HTMLBean<String> bean : link) {
			System.out.println("< " + bean.getEleName() + " >");
			System.out.println("%%%%%%%%: " + bean.getResult());
			for (Map.Entry<String, String> e : bean.getAttribute().entrySet()) {
				System.out.println(e.getKey() + " : " + e.getValue());
			}
		}
	}

}

分享到：