htmlParser解析html文件

cskysnew

浏览: 13255 次
性别:
来自: 青岛

最近访客更多访客>>

ldf1991

p369000

chenqi20002008

博主相关

博客

微博

相册

留言

关于我

文章分类

全部博客 (3)

社区版块

存档分类

HTML Office J#

java 代码

/**
 * Helpers for locating floor-plan HTML pages (files named {@code JMO_<floor>.htm} or
 * {@code .html}) and extracting the {@code area} hot spots they contain with the
 * HTMLParser library.
 *
 * <p>Typical use:
 * <pre>
 *   ArrayList names = HtmlFileIo.getRemoteDirInfo("file://host/share/office");
 *   ArrayList areas = HtmlFileIo.readByHtml("file://host/share/office/JMO_34.htm");
 *   String modified = HtmlFileIo.getRemoteLastModified("file://host/share/office/JMO_34.htm");
 * </pre>
 *
 * <p>Collections are left as raw types so that existing callers keep compiling unchanged.
 */
public class HtmlFileIo {

    /** Name prefix shared by the pages this class looks for. */
    private static final String FILE_PREFIX = "JMO_";

    /** Separator put back between the lines read from a remote file. */
    private static final String LINE_SEPARATOR = "\r\n";

    /** Directory listed by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_DIR = "file://tenwa-98bf4155e/zhanghftemp/office";

    /**
     * Demo entry point: lists the matching pages of a directory and prints, for each one,
     * its name followed by the floor number encoded in that name.
     *
     * @param args optional; {@code args[0]} overrides the default directory URL
     */
    public static void main(String[] args) {
        String strDir = (args != null && args.length > 0) ? args[0] : DEFAULT_DIR;
        try {
            // Names of all matching files in the directory.
            ArrayList al = getRemoteDirInfo(strDir);
            for (int i = 0; i < al.size(); i++) {
                String strName = al.get(i).toString();
                System.out.println(strName);
                // Floor number; skipped when the name carries nothing after the prefix.
                String floor = floorOf(strName);
                if (floor != null) {
                    System.out.println(floor);
                }
            }
        } catch (Exception pe) {
            pe.printStackTrace();
        }
    }

    /**
     * Returns the files directly inside a local directory that match the naming rule
     * ({@code JMO_} prefix and an {@code htm}/{@code html} extension).
     *
     * @param content a {@code file:} URI string naming the directory
     * @return list of {@link File} objects; empty when the URI is not a directory or the
     *         directory cannot be listed
     * @throws Exception if {@code content} is not a valid URI accepted by {@link File}
     */
    public static ArrayList getFileList(String content) throws Exception {
        ArrayList al = new ArrayList();
        File file = new File(new URI(content));
        System.out.println(file.exists());
        // listFiles() yields null both for non-directories and on I/O errors.
        File[] filelist = file.listFiles();
        if (filelist == null) {
            return al;
        }
        for (int i = 0; i < filelist.length; i++) {
            if (isFloorPage(filelist[i].getName())) {
                al.add(filelist[i]);
            }
        }
        return al;
    }

    /**
     * Tells whether a file name has an {@code htm} or {@code html} extension.
     * The extension is the text after the <em>last</em> dot and is compared case-sensitively.
     *
     * @param strFile file name to test; may be {@code null}
     * @return {@code true} only for names ending in {@code .htm} or {@code .html}
     */
    public static boolean getFileTypeName(String strFile) {
        if (strFile == null) {
            return false;
        }
        int dot = strFile.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all.
            return false;
        }
        String extension = strFile.substring(dot + 1);
        return "htm".equals(extension) || "html".equals(extension);
    }

    /**
     * Reads the unit number, room size and area position entries of a room-status page:
     * every {@code area} tag found is returned as a map of its attributes.
     *
     * @param content URL of the HTML page
     * @return list of {@link HashMap}s with the keys {@code "href"}, {@code "shape"} and
     *         {@code "coords"}, one per {@code area} tag, in the order the parser returns them
     * @throws Exception if the parser rejects the input
     */
    public static ArrayList readByHtml(String content) throws Exception {
        Parser parser = new Parser();
        parser.setEncoding("8859_1");
        parser.setInputHTML(getWmlContent(content));
        // Register AreaTag so the parser builds AreaTag nodes for area elements.
        PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
        factory.registerTag(new AreaTag());
        parser.setNodeFactory(factory);

        ArrayList alRoom = new ArrayList();
        NodeList nlArea = parser.extractAllNodesThatMatch(lnkFilter);
        for (int i = 0; i < nlArea.size(); i++) {
            Node node = nlArea.elementAt(i);
            if (node instanceof AreaTag) {
                AreaTag at = (AreaTag) node;
                HashMap hm = new HashMap();
                hm.put("href", at.getHref());
                hm.put("shape", at.getShape());
                hm.put("coords", at.getCoords());
                alRoom.add(hm);
            }
        }
        return alRoom;
    }

    /**
     * Returns the content of the file at the given URL.
     *
     * @param content URL of the file
     * @return the text returned by {@link #getRemoteInfo(String)}
     * @throws Exception declared for compatibility with existing callers
     */
    static String getWmlContent(String content) throws Exception {
        return getRemoteInfo(content);
    }

    /**
     * Returns the last-modified time of the resource behind a URL.
     *
     * @param content URL of the resource
     * @return the time formatted as {@code yyyy-MM-dd HH:mm:ss}; the value is also printed
     *         to standard output. {@code URLConnection.getLastModified()} reports 0 when the
     *         time is unknown, in which case the epoch is formatted.
     * @throws Exception if the URL is malformed or the connection cannot be opened
     */
    public static String getRemoteLastModified(String content) throws Exception {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long lastModified = new URL(content).openConnection().getLastModified();
        String strMod = sdf.format(new java.util.Date(lastModified));
        System.out.println(strMod);
        return strMod;
    }

    /**
     * Reads the content of a remote HTML file.
     *
     * <p>Lines are joined with {@code "\r\n"} so that tokens on neighbouring lines (for
     * example a tag name and an attribute wrapped onto the next line) are not fused.
     * Bytes are decoded with the platform default charset. Reading is best effort: a
     * malformed URL or an I/O error is printed and whatever was read so far is returned.
     *
     * @param content URL of the file
     * @return the text read; empty when nothing could be read
     * @throws Exception declared for compatibility with existing callers
     */
    public static String getRemoteInfo(String content) throws Exception {
        StringBuffer info = new StringBuffer();
        BufferedReader in = null;
        try {
            in = openReader(content);
            boolean first = true;
            for (String inputLine = in.readLine(); inputLine != null; inputLine = in.readLine()) {
                if (!first) {
                    info.append(LINE_SEPARATOR);
                }
                info.append(inputLine);
                first = false;
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return info.toString();
    }

    /**
     * Returns the names of the matching files in a remote directory. The directory URL is
     * read line by line and every line is treated as one file name.
     *
     * <p>Reading is best effort: a malformed URL or an I/O error is printed and the names
     * collected so far are returned.
     *
     * @param content URL of the directory
     * @return list of file-name {@link String}s carrying the {@code JMO_} prefix and an
     *         {@code htm}/{@code html} extension
     * @throws Exception declared for compatibility with existing callers
     */
    public static ArrayList getRemoteDirInfo(String content) throws Exception {
        ArrayList alFile = new ArrayList();
        BufferedReader in = null;
        try {
            in = openReader(content);
            for (String inputLine = in.readLine(); inputLine != null; inputLine = in.readLine()) {
                if (isFloorPage(inputLine)) {
                    alFile.add(inputLine);
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        return alFile;
    }

    /** Tells whether a name follows the naming rule: {@code JMO_} prefix plus htm/html extension. */
    private static boolean isFloorPage(String name) {
        // startsWith is safe for names shorter than the prefix, unlike substring(0, 4).
        return name != null && name.startsWith(FILE_PREFIX) && getFileTypeName(name);
    }

    /** Returns the second "_"-separated part of the name without its extension, or null if absent. */
    private static String floorOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        String baseName = (dot >= 0) ? fileName.substring(0, dot) : fileName;
        String[] parts = baseName.split("_");
        return (parts.length > 1) ? parts[1] : null;
    }

    /** Opens a buffered, platform-charset reader on the stream behind the given URL. */
    private static BufferedReader openReader(String location) throws IOException {
        return new BufferedReader(new InputStreamReader(new URL(location).openStream()));
    }

    /** Closes the reader if it was opened, printing (not propagating) a failure to close. */
    private static void closeQuietly(BufferedReader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Filter that accepts only {@link AreaTag} nodes. */
    static NodeFilter lnkFilter = new NodeFilter() {
        public boolean accept(Node node) {
            return node instanceof AreaTag;
        }
    };

    /**
     * Tag definition for {@code area} elements, used to look up area information.
     */
    static class AreaTag extends CompositeTag {

        /** Tag names handled by this class. */
        private static final String[] mIds = new String[] {"area"};

        /** End-tag names reported by {@link #getEndTagEnders()}. */
        private static final String[] mEndTagEnders = new String[] {"map"};

        /**
         * Create a new area tag.
         */
        public AreaTag() {
        }

        /**
         * Return the set of names handled by this tag.
         * @return The names to be matched that create tags of this type.
         */
        public String[] getIds() {
            return mIds;
        }

        /**
         * Return the same names as {@link #getIds()}.
         * NOTE(review): presumably this makes a following area tag close the current
         * one - confirm against the HTMLParser CompositeTag documentation.
         * @return The tag names reported as enders.
         */
        public String[] getEnders() {
            return mIds;
        }

        /**
         * Return the end-tag names reported for this tag ({@code map}).
         * @return The end-tag names.
         */
        public String[] getEndTagEnders() {
            return mEndTagEnders;
        }

        /** @return the value of the {@code href} attribute as reported by {@code getAttribute} */
        public String getHref() {
            return getAttribute("href");
        }

        /** @return the value of the {@code coords} attribute as reported by {@code getAttribute} */
        public String getCoords() {
            return getAttribute("coords");
        }

        /** @return the value of the {@code shape} attribute as reported by {@code getAttribute} */
        public String getShape() {
            return getAttribute("shape");
        }

        /** @return the tag name, {@code "area"} */
        public String toString() {
            return mIds[0];
        }
    }
}

分享到：

osworkflow工作流的workitem的一种实现方式

2007-08-29 13:36
浏览 6526
评论(4)
查看更多

4 楼 jjp2009 2008-08-16

这两句到底读取的是什么啊

3 楼 jjp2009 2008-08-16

String strFile = "file://tenwa-98bf4155e/zhanghftemp/office/JMO_34.htm";
String strDir = "file://tenwa-98bf4155e/zhanghftemp/office";
楼主这两句话是什么意思啊

2 楼 water84222 2008-04-17

请问一下，怎样将修改过的html保存到文件中
code如下
// Fragment quoted from a reader's question: it rewrites image URLs in a page and
// tries to write the result back to the same file. Names such as file, parser,
// imgnode, linkTmpHash, testAbsolutePath, testRelativePath, testImagetag and
// getRealPath are declared outside this fragment.

// Parse the page and collect the nodes matched by the ImageTag class filter.
parser = new Parser(getContentByLocalFile(file));
NodeFilter nt = new NodeClassFilter(ImageTag.class) ;
NodeList tmpImageList = (NodeList) parser.parse(nt);

/*linkTmpHash = new Hashtable();
for (int i = 0; i < length; i++) {
Element tmpElement = (Element) tmpNodeList.item(i);
String href = tmpElement.getAttribute("href");
if (href != null && !href.equals("")) {
linkTmpHash.put(href, "");
}
}
data.setHrefs((String[]) linkTmpHash.keySet().toArray(new String[linkTmpHash.size()]));*/
// NOTE(review): a FileOutputStream opened without the append flag overwrites the
// existing file - presumably getContentByLocalFile has already read it fully; confirm.
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter (new FileOutputStream (file)));
linkTmpHash = new Hashtable();
for (int i = 0; i < tmpImageList.size(); i++) {
imgnode = (ImageTag)tmpImageList.elementAt(i);
String src = imgnode.getImageURL();
// Check the image reference, depending on which kind of path checking is enabled.
if (URLPathNameUtil.isAbsolutePath(src)) {
if (testAbsolutePath) {
testImagetag(file,src);
}
} else {
if (testRelativePath) {
testImagetag(file, src);
}
}
if(getRealPath()!=null){
imgnode.setImageURL(getRealPath());
// NOTE(review): this writes the HTML of the whole image-node list on every
// iteration that reaches here, so the file receives only the image nodes
// (possibly several times), not the full page. Saving the complete modified
// document presumably needs the full node tree serialized once after the
// loop - TODO confirm.
writer.write(tmpImageList.toHtml());
}
/*if (src != null && !src.equals("")) {
linkTmpHash.put(src, "");
}*/
}
writer.flush();
writer.close ();

谢谢了

1 楼 yongtree 2007-10-17

顶！
好东东哦。

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论