utf

philip01

浏览: 47371 次
来自: ...

最近访客更多访客>>

jp8happy

stevenchen

cjb19873

xieyuncs123

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

C C++C#

package com;

public class UTF8Tester {

private static String HEX = "0123456789ABCDEF";

private static String toBin(int n) {
StringBuffer stringBuffer = new StringBuffer();
n = n & 0x000000FF;

  for (int i = 7; i >= 0; i--) {
   if (1 == ((n >> i) & 1)) {
    stringBuffer.append('1');
   } else {
    stringBuffer.append('0');
   }
  }

return stringBuffer.toString();
}

private static String toHex(int n) {
  StringBuffer stringBuffer = new StringBuffer();
  n = n & 0x000000FF;
  stringBuffer.append(HEX.charAt(n >> 4));
  stringBuffer.append(HEX.charAt(n & 0x0F));
  return stringBuffer.toString();
}

private static void printUTF8(char ch) throws Exception {
String unicode = toHex(ch >> 8) + toHex(ch & 0xFF);
String unicodeBin = toBin(ch >> 8) + ' ' + toBin(ch & 0xFF);

String s = "" + ch;
byte[] b = s.getBytes("UTF-8");

String hex = "";

  for (int i = 0; i < b.length; i++) {
   hex += toHex((int) b[i]);
   hex += " ";
  }

  String bin = "";
  for (int i = 0; i < b.length; i++) {
   bin += toBin((int) b[i]);
   bin += " ";
  }
  String sf = String.format("U+%s %s : %-8s : %s", unicode, unicodeBin,
    hex.trim(), bin.trim());
  System.out.println(sf);

}

public static void main(String[] args) throws Exception {

  System.out.println("");
  printUTF8('\u002A');
  printUTF8('\u012A');
  printUTF8('\u012B');
  printUTF8('\u052C');
  printUTF8('\u013C');
  printUTF8('\uAA2A');
  printUTF8('\uFDFD');

}

}

package com;

import java.io.ByteArrayOutputStream;
import java.io.UnsupportedEncodingException;

public class Utf8Utils {

     private final static byte B_10000000 = (byte)0x80;
     private final static byte B_11000000 = (byte)0xc0;
     private final static byte B_11100000 = (byte)0xE0;
     private final static byte B_11110000 = (byte)0xf0;
     private final static byte B_00011100 = (byte)0x1C;
     private final static byte B_00000011 = (byte)0x03;
     private final static byte B_00111111 = (byte)0x3F;
     private final static byte B_00001111 = (byte)0xf;
     private final static byte B_00111100 = (byte)0x3c;

     /** Convert from UTF8 bytes to UNICODE character */
     public static char[] toUCS2(byte[] utf8Bytes) {
         CharList charList = new CharList();
         byte b2 = 0, b3 = 0;
         int ub1 = 0, ub2 = 0;

         for (int i = 0; i < utf8Bytes.length; i++) {
             byte b = utf8Bytes[i];
             if (isNotHead(b)) {
                 // start with 10xxxxxx, skip it.
                 continue;
             } else if (b > 0) {
                 // 1 byte, ASCII   b
                 charList.add((char) b);
             } else if ((b & B_11110000) == B_11110000) {
                 // UCS-4, here we skip it
                 continue;
             } else if ((b & B_11100000) == B_11100000) {
                 // 3 bytes
                 b2 = utf8Bytes[i+1];
                 if (!isNotHead(b2)) continue;
                 i++;
                 b3 = utf8Bytes[i+1];
                 if (!isNotHead(b3)) continue;
                 i++;
                 ub1 = ((b & B_00001111) << 4) + ((b2 & B_00111100) >> 2);
                 ub2 = ((b2 & B_00000011) << 6) + ((b3 & B_00111111));
                 charList.add(makeChar(ub1, ub2));
             } else {
                 // 2 bytes
                 b2 = utf8Bytes[i+1];
                 if (!isNotHead(b2)) continue;
                 i++;
                 ub1 = (b & B_00011100) >> 2;
                 ub2 = ((b & B_00000011) << 6) + (b2 & B_00111111);
                 charList.add(makeChar(ub1, ub2));
             }
         }

         return charList.toArray();
     }

     private static boolean isNotHead(byte b) {
         return (b & B_11000000) == B_10000000;
     }

     private static char makeChar(int b1, int b2) {
         return (char) ((b1 << 8) + b2);
     }

     public static byte[] fromUCS2(char[] ucs2Array) {
         ByteArrayOutputStream baos = new ByteArrayOutputStream();
         for (int i = 0; i < ucs2Array.length; i++) {
             char ch = ucs2Array[i];
             if (ch <= 0x007F) {
                 baos.write(ch);
             } else if (ch <= 0x07FF) {
                 int ub1 = ch >> 8;
                 int ub2 = ch & 0xFF;
                 int b1 = B_11000000 + (ub1 << 2) + (ub2 >> 6);
                 int b2 = B_10000000 + (ub2 & B_00111111);
                 baos.write(b1);
                 baos.write(b2);
             } else {
                 int ub1 = ch >> 8;
                 int ub2 = ch & 0xFF;
                 int b1 = B_11100000 + (ub1 >> 4);
                 int b2 = B_10000000 + ((ub1 & B_00001111) << 2) + (ub2 >> 6);
                 int b3 = B_10000000 + (ub2 & B_00111111);
                 baos.write(b1);
                 baos.write(b2);
                 baos.write(b3);
             }
         }
         return baos.toByteArray();
     }

     private static class CharList {
         private char[] data = null;
         private int used = 0;
         public void add(char c) {
             if (data == null) {
                 data = new char[16];
             } else if (used >= data.length) {
                 char[] temp = new char[data.length * 2];
                 System.arraycopy(data, 0, temp, 0, used);
                 data = temp;
             }
             data[used++] = c;
         }
         public char[] toArray() {
             char[] chars = new char[used];
             System.arraycopy(data, 0, chars, 0, used);
             return chars;
         }
     }

     private static void assert1(String s) throws UnsupportedEncodingException {
         byte[] b = s.getBytes("utf-8");
         char[] c = toUCS2(b);
         if (!s.equals(new String(c))) {
             throw new RuntimeException("Can not pass assert1 for: " + s);
         }
     }

     private static void assert2(String s) throws UnsupportedEncodingException {
         byte[] b = s.getBytes("utf-8");
         byte[] b2 = fromUCS2(s.toCharArray());
         if (b.length == b2.length) {
             int i;
             for (i = 0; i < b.length; i++) {
                 if (b[i] != b2[i]) {
                     break;
                 }
             }
             if (i == b.length) {
                 return;
             }
         }
         throw new RuntimeException("Can not pass assert2 for: " + s);
     }

     public static void main(String[] args) throws Exception {
         assert1("test");
         assert1("中文测试");
         assert1("A中V文c测d试E");
         assert1("\u052CA\u052CBc测");

         assert2("test");
         assert2("中文测试");
         assert2("A中V文c测d试E");
         assert2("\u052CA\u052CBc测\u007F\u07FF");

         System.out.println("pass");
     }

}