浏览 1718 次
锁定老帖子 主题:上传的TXT文件编码怎么获得 有方法
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
|
|
---|---|
作者 | 正文 |
发表时间:2011-08-12
* 上传文件编码判断 * */ public static String get_charset( File file ) { String charset = "GBK"; byte[] first3Bytes = new byte[3]; try { boolean checked=false;; BufferedInputStream bis = new BufferedInputStream( new FileInputStream( file ) ); bis.mark( 0 ); int read = bis.read( first3Bytes, 0, 3 ); if ( read == -1 ) return charset; if ( first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE ) { charset = "UTF-16LE"; checked = true; } else if ( first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF ) { charset = "UTF-16BE"; checked = true; } else if ( first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF ) { charset = "UTF-8"; checked = true; } bis.reset(); if ( !checked ) { // int len = 0; int loc = 0; while ( (read = bis.read()) != -1 ) { loc++; if ( read >= 0xF0 ) break; if ( 0x80 <= read && read <= 0xBF ) // 单独出现BF以下的,也算是GBK break; if ( 0xC0 <= read && read <= 0xDF ) { read = bis.read(); if ( 0x80 <= read && read <= 0xBF ) // 双字节 (0xC0 - 0xDF) (0x80 // - 0xBF),也可能在GB编码内 continue; else break; } else if ( 0xE0 <= read && read <= 0xEF ) {// 也有可能出错,但是几率较小 read = bis.read(); if ( 0x80 <= read && read <= 0xBF ) { read = bis.read(); if ( 0x80 <= read && read <= 0xBF ) { charset = "UTF-8"; break; } else break; } else break; } } //System.out.println( loc + " " + Integer.toHexString( read ) ); } bis.close(); } catch ( Exception e ) { e.printStackTrace(); } return charset; } 声明:ITeye文章版权属于作者,受法律保护。没有作者书面许可不得转载。
推荐链接
|
|
返回顶楼 | |