论坛首页 Java企业应用论坛

如何获得上传的TXT文件的编码?有方法

浏览 1718 次
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
作者 正文
   发表时间:2011-08-12  
/**
* 上传文件编码判断
* */
public static String get_charset( File file ) {  
        String charset = "GBK";  
        byte[] first3Bytes = new byte[3];  
        try {  
            boolean checked=false;;  
            BufferedInputStream bis = new BufferedInputStream( new FileInputStream( file ) );  
            bis.mark( 0 );  
            int read = bis.read( first3Bytes, 0, 3 );  
            if ( read == -1 ) return charset;  
            if ( first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE ) {  
                charset = "UTF-16LE";  
                checked = true;  
            }  
            else if ( first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF ) {  
                charset = "UTF-16BE";  
                checked = true;  
            }  
            else if ( first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF ) {  
                charset = "UTF-8";  
                checked = true;  
            }  
            bis.reset();  
            if ( !checked ) {  
            //    int len = 0;  
                int loc = 0;  
 
                while ( (read = bis.read()) != -1 ) {  
                    loc++;  
                    if ( read >= 0xF0 ) break;  
                    if ( 0x80 <= read && read <= 0xBF ) // 单独出现BF以下的,也算是GBK  
                    break;  
                    if ( 0xC0 <= read && read <= 0xDF ) {  
                        read = bis.read();  
                        if ( 0x80 <= read && read <= 0xBF ) // 双字节 (0xC0 - 0xDF) (0x80  
                                                                        // - 0xBF),也可能在GB编码内  
                        continue;  
                        else break;  
                    }  
                    else if ( 0xE0 <= read && read <= 0xEF ) {// 也有可能出错,但是几率较小  
                        read = bis.read();  
                        if ( 0x80 <= read && read <= 0xBF ) {  
                            read = bis.read();  
                            if ( 0x80 <= read && read <= 0xBF ) {  
                                charset = "UTF-8";  
                                break;  
                            }  
                            else break;  
                        }  
                        else break;  
                    }  
                }  
                //System.out.println( loc + " " + Integer.toHexString( read ) );  
            }  
 
            bis.close();  
        } catch ( Exception e ) {  
            e.printStackTrace();  
        }  
 
        return charset;  
    }  
论坛首页 Java企业应用版

跳转论坛:
Global site tag (gtag.js) - Google Analytics