字符集编码的自动识别jchardet

xiangxingchina

浏览: 526959 次
性别:
来自: 北京

最近访客更多访客>>

WoKo_Jb

lanmubai

forfelicity

linktoyl22

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

算法 D语言编程 Ant CGI

什么是jchardet?

jchardet是mozilla自动字符集探测算法代码的java移植,其源代码可以从sourceforge下载。这个算法的最初作者是 frank Tang,C++源代码在http://www.infomall.cn/cgi-bin/mallgate/20040514/http: //lxr.mozilla.org/mozilla/source/intl/chardet/，可以从http://www.infomall.cn /cgi-bin/mallgate/20040514/http://www.mozilla.org/projects/intl /chardet.html得到更多关于这个算法的信息。

编译及应用

　　将下载后的chardet.zip解压缩后，到~/mozilla/intl/chardet/java/目录下，运行ant即可在dist/lib目录下生成chardet.jar，将这个jar包加入CLASSPATH.然后
运行：java org.mozilla.intl.chardet.HtmlCharsetDetector http://hedong.3322.org
结果：CHARSET = GB18030
运行：java org.mozilla.intl.chardet.HtmlCharsetDetector http://www.wesnapcity.com/
结果：CHARSET = ASCII
运行：java org.mozilla.intl.chardet.HtmlCharsetDetector http://www.wesnapcity.com/blog/
结果：CHARSET = UTF-8

编程使用

　　下面就jchardet.jar中的HtmlCharsetDetector.java,对调用jchardet过程予以说明：

Java代码

//实现nsICharsetDetectionObserver接口，这个接口只有一个Notify()方法.当jchardet引擎自己认为已经识别出字符串的字符集后(不论识别的对错)，都会调用这个Notify方法。
nsICharsetDetectionObserver cdo=new nsICharsetDetectionObserver() {
public void Notify(String charset) {
HtmlCharsetDetector.found = true ;
System.out.println("CHARSET = " + charset);
}
};
/**
* 初始化nsDetector()
*lang为一个整数，用以提示语言线索，可以提供的语言线索有以下几个：
*
Japanese
Chinese
Simplified Chinese
Traditional Chinese
Korean
Dont know (默认)
*/
nsDetector det = new nsDetector(lang) ;
// 设置一个Oberver
det.Init(cdo);
BufferedInputStream imp = new BufferedInputStream(url.openStream());
byte [] buf = new byte [ 1024 ] ;
boolean done = false ; //是否已经确定某种字符集
boolean isAscii = true ; //假定当前的串是ASCII编码
while ( (len=imp.read(buf, 0 ,buf.length)) != - 1 ) {
// 检查是不是全是ascii字符，当有一个字符不是ASC编码时，则所有的数据即不是ASCII编码了。
if (isAscii) isAscii = det.isAscii(buf,len);
// 如果不是ascii字符，则调用DoIt方法.
if (!isAscii && !done) done = det.DoIt(buf,len, false ); //如果不是ASCII，又还没确定编码集，则继续检测。
}
det.DataEnd();//最后要调用此方法，此时，Notify被调用。
if (isAscii) {
System.out.println("CHARSET = ASCII" );
found = true ;
}
if (!found) { //如果没找到，则找到最可能的那些字符集
String prob[] = det.getProbableCharsets() ;
for ( int i= 0 ; i System.out.println( "Probable Charset = " + prob[i]);
}
}

//实现nsICharsetDetectionObserver接口，这个接口只有一个Notify()方法.当jchardet引擎自己认为已经识别出字符串的字符集后(不论识别的对错)，都会调用这个Notify方法。
nsICharsetDetectionObserver cdo=new nsICharsetDetectionObserver() {
  public void Notify(String charset) {
   HtmlCharsetDetector.found = true ;
   System.out.println("CHARSET = " + charset);
  }
};
/**
* 初始化nsDetector()
*lang为一个整数，用以提示语言线索，可以提供的语言线索有以下几个： 
* 
Japanese 
Chinese 
Simplified Chinese 
Traditional Chinese 
Korean 
Dont know (默认) 

*/
nsDetector det = new nsDetector(lang) ;
// 设置一个Oberver
det.Init(cdo);
BufferedInputStream imp = new BufferedInputStream(url.openStream());
byte[] buf = new byte[1024] ;
boolean done = false ;  //是否已经确定某种字符集
boolean isAscii = true ;//假定当前的串是ASCII编码
while( (len=imp.read(buf,0,buf.length)) != -1) {
  // 检查是不是全是ascii字符，当有一个字符不是ASC编码时，则所有的数据即不是ASCII编码了。
  if (isAscii) isAscii = det.isAscii(buf,len);
  // 如果不是ascii字符，则调用DoIt方法.
  if (!isAscii && !done) done = det.DoIt(buf,len, false);//如果不是ASCII，又还没确定编码集，则继续检测。
}
det.DataEnd();//最后要调用此方法，此时，Notify被调用。
if (isAscii) {
System.out.println("CHARSET = ASCII");
found = true ;
}
if (!found) {//如果没找到，则找到最可能的那些字符集
String prob[] = det.getProbableCharsets() ;
for(int i=0; i   System.out.println("Probable Charset = " + prob[i]);
}
}

使用方法如下

Java代码

//使用 jchardet 获得文件编码 -javacode
//当含中文的文件用ANSI编码保存时，检测还是出错。
package org.mozilla.intl.chardet;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
* 借助JCharDet获取文件字符集
* @author icer
* PS:
* JCharDet 是mozilla自动字符集探测算法代码的java移植，其官方主页为：
* http://jchardet.sourceforge.net/
* @date 2008/11/13
*/
public class FileCharsetDetector {
private boolean found = false ;
/**
* 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性
*/
private String encoding = null ;
public static void main(String[] argv) throws Exception {
if (argv.length != 1 && argv.length != 2 ) {
System.out
.println("Usage: FileCharsetDetector <path> [<languageHint>]" );
System.out.println("" );
System.out.println("Where <path> is d:/demo.txt" );
System.out.println("For optional <languageHint>. Use following..." );
System.out.println(" 1 => Japanese" );
System.out.println(" 2 => Chinese" );
System.out.println(" 3 => Simplified Chinese" );
System.out.println(" 4 => Traditional Chinese" );
System.out.println(" 5 => Korean" );
System.out.println(" 6 => Dont know (default)" );
return ;
} else {
String encoding = null ;
if (argv.length == 2 ) {
encoding = new FileCharsetDetector().guestFileEncoding(argv[ 0 ],
Integer.valueOf(argv[1 ]));
} else {
encoding = new FileCharsetDetector().guestFileEncoding(argv[ 0 ]);
}
System.out.println("文件编码:" + encoding);
}
}
/**
* 传入一个文件(File)对象，检查文件编码
*
* @param file
* File对象实例
* @return 文件编码，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file) throws FileNotFoundException,
IOException {
return geestFileEncoding(file, new nsDetector());
}
/**
* 获取文件的编码
*
* @param file
* File对象实例
* @param languageHint
* 语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file, int languageHint)
throws FileNotFoundException, IOException {
return geestFileEncoding(file, new nsDetector(languageHint));
}
/**
* 获取文件的编码
*
* @param path
* 文件路径
* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path) throws FileNotFoundException,
IOException {
return guestFileEncoding( new File(path));
}
/**
* 获取文件的编码
*
* @param path
* 文件路径
* @param languageHint
* 语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
* 4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path, int languageHint)
throws FileNotFoundException, IOException {
return guestFileEncoding( new File(path), languageHint);
}
/**
* 获取文件的编码
*
* @param file
* @param det
* @return
* @throws FileNotFoundException
* @throws IOException
*/
private String geestFileEncoding(File file, nsDetector det)
throws FileNotFoundException, IOException {
// Set an observer...
// The Notify() will be called when a matching charset is found.
det.Init(new nsICharsetDetectionObserver() {
public void Notify(String charset) {
found = true ;
encoding = charset;
}
});
BufferedInputStream imp = new BufferedInputStream( new FileInputStream(
file));
byte [] buf = new byte [ 1024 ];
int len;
boolean done = false ;
boolean isAscii = true ;
while ((len = imp.read(buf, 0 , buf.length)) != - 1 ) {
// Check if the stream is only ascii.
if (isAscii)
isAscii = det.isAscii(buf, len);
// DoIt if non-ascii and not done yet.
if (!isAscii && !done)
done = det.DoIt(buf, len, false );
}
det.DataEnd();
if (isAscii) {
encoding = "ASCII" ;
found = true ;
}
if (!found) {
String prob[] = det.getProbableCharsets();
if (prob.length > 0 ) {
// 在没有发现情况下，则取第一个可能的编码
encoding = prob[0 ];
} else {
return null ;
}
}
return encoding;
}
}

//使用 jchardet 获得文件编码 -javacode
//当含中文的文件用ANSI编码保存时，检测还是出错。

package org.mozilla.intl.chardet;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

/**
* 借助JCharDet获取文件字符集
* @author icer
* PS:
* JCharDet 是mozilla自动字符集探测算法代码的java移植，其官方主页为：
*      http://jchardet.sourceforge.net/
* @date 2008/11/13 
*/
public class FileCharsetDetector {

private boolean found = false;

/**
* 如果完全匹配某个字符集检测算法, 则该属性保存该字符集的名称. 否则(如二进制文件)其值就为默认值 null, 这时应当查询属性 
*/
private String encoding = null;

public static void main(String[] argv) throws Exception {
   if (argv.length != 1 && argv.length != 2) {

    System.out
      .println("Usage: FileCharsetDetector <path> [<languageHint>]");

    System.out.println("");
    System.out.println("Where <path> is d:/demo.txt");
    System.out.println("For optional <languageHint>. Use following...");
    System.out.println("   1 => Japanese");
    System.out.println("   2 => Chinese");
    System.out.println("   3 => Simplified Chinese");
    System.out.println("   4 => Traditional Chinese");
    System.out.println("   5 => Korean");
    System.out.println("   6 => Dont know (default)");

    return;
   } else {
    String encoding = null;
    if (argv.length == 2) {
     encoding = new FileCharsetDetector().guestFileEncoding(argv[0],
       Integer.valueOf(argv[1]));
    } else {
     encoding = new FileCharsetDetector().guestFileEncoding(argv[0]);
    }
    System.out.println("文件编码:" + encoding);
   }
}

/**
* 传入一个文件(File)对象，检查文件编码
* 
* @param file
*            File对象实例
* @return 文件编码，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file) throws FileNotFoundException,
    IOException {
   return geestFileEncoding(file, new nsDetector());
}

/**
* 获取文件的编码
* 
* @param file
*            File对象实例
* @param languageHint
*            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
*            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(File file, int languageHint)
    throws FileNotFoundException, IOException {
   return geestFileEncoding(file, new nsDetector(languageHint));
}

/**
* 获取文件的编码
* 
* @param path
*            文件路径
* @return 文件编码，eg：UTF-8,GBK,GB2312形式，若无，则返回null
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path) throws FileNotFoundException,
    IOException {
   return guestFileEncoding(new File(path));
}

/**
* 获取文件的编码
* 
* @param path
*            文件路径
* @param languageHint
*            语言提示区域代码 eg：1 : Japanese; 2 : Chinese; 3 : Simplified Chinese;
*            4 : Traditional Chinese; 5 : Korean; 6 : Dont know (default)
* @return
* @throws FileNotFoundException
* @throws IOException
*/
public String guestFileEncoding(String path, int languageHint)
    throws FileNotFoundException, IOException {
   return guestFileEncoding(new File(path), languageHint);
}

/**
* 获取文件的编码
* 
* @param file
* @param det
* @return
* @throws FileNotFoundException
* @throws IOException
*/
private String geestFileEncoding(File file, nsDetector det)
    throws FileNotFoundException, IOException {
   // Set an observer...
   // The Notify() will be called when a matching charset is found.
   det.Init(new nsICharsetDetectionObserver() {
    public void Notify(String charset) {
     found = true;
     encoding = charset;
    }
   });

   BufferedInputStream imp = new BufferedInputStream(new FileInputStream(
     file));

   byte[] buf = new byte[1024];
   int len;
   boolean done = false;
   boolean isAscii = true;

   while ((len = imp.read(buf, 0, buf.length)) != -1) {
    // Check if the stream is only ascii.
    if (isAscii)
     isAscii = det.isAscii(buf, len);

    // DoIt if non-ascii and not done yet.
    if (!isAscii && !done)
     done = det.DoIt(buf, len, false);
   }
   det.DataEnd();

   if (isAscii) {
    encoding = "ASCII";
    found = true;
   }

   if (!found) {
    String prob[] = det.getProbableCharsets();
    if (prob.length > 0) {
     // 在没有发现情况下，则取第一个可能的编码
     encoding = prob[0];
    } else {
     return null;
    }
   }
   return encoding;
}
}

分享到：

如何去除桌面图标的边框 | 心得随记（项目管理）

2010-10-27 09:56
浏览 3549
评论(1)
分类:编程语言
查看更多

1 楼蓝月儿 2011-03-23

JCharDet 在主页上怎么下不下来着急

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论