UTF8 和 GBK混合的文本识别转换..... -

f002489

浏览: 274964 次
性别:
来自: 成都

最近访客更多访客>>

guodong666

yhnd685

quanhailee

liaoyang.777

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

UTF8 和 GBK混合的文本识别转换.....

博客分类：

http://bbs.chinaunix.net/thread-971041-1-1.html

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <iconv.h>
#include <stdint.h>
#include <errno.h>

/*
 * Convert `input` from encoding `from` to encoding `to` using iconv.
 *
 * from, to : iconv encoding names, passed to iconv_open(to, from).
 * input    : bytes to convert.
 * inlen    : number of input bytes; 0 means "use strlen(input)".
 * output   : receives a malloc()ed, NUL-terminated buffer that the caller
 *            must free(); set to NULL on failure.
 * outlen   : receives the number of bytes stored in *output (terminator
 *            not counted); set to -1 on failure.
 *
 * Returns the number of input bytes consumed, or -1 when inlen is negative,
 * the converter cannot be opened, or memory cannot be allocated.  The iconv()
 * result itself is not reported: a conversion that stops early shows up as a
 * return value smaller than the input length.
 */
static int charconv(char *from, char *to,
        const char *input, int inlen, char **output, int *outlen)
{
        char *inbuf;
        char *outbuf;
        size_t inleft;
        size_t outleft;
        size_t cap;
        iconv_t cd;

        *outlen = -1;
        *output = NULL;

        if (inlen < 0)
                return -1;

        cd = iconv_open(to, from);
        if (cd == (iconv_t) (-1))
                return -1;

        if (inlen == 0)
                inlen = strlen(input);

        /* Budget four output bytes per input byte, plus one byte that is
         * reserved for the terminator so that writing it can never land
         * past the end of the allocation (also covers empty input). */
        cap = 4 * (size_t) inlen;
        outbuf = malloc(cap + 1);
        if (outbuf == NULL) {
                iconv_close(cd);
                return -1;
        }

        inbuf = (char *) input;
        inleft = (size_t) inlen;
        outleft = cap;

        *output = outbuf;

        /* Older iconv implementations declare the input as const char **. */
#if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)
        (void) iconv(cd, &inbuf, &inleft, &outbuf, &outleft);
#else
        (void) iconv(cd, (const char **) &inbuf, &inleft, &outbuf, &outleft);
#endif

        iconv_close(cd);

        *outlen = (int) (cap - outleft);
        (*output)[*outlen] = 0;
        return (int) ((size_t) inlen - inleft);
}

/*
 * Test whether the bytes at `s` begin with a well-formed multi-byte UTF-8
 * sequence.
 *
 * s  : buffer to inspect (may hold arbitrary bytes).
 * ns : number of readable bytes at s.
 *
 * Returns the length of the sequence (2, 3 or 4) when the lead byte and all
 * of its continuation bytes are valid and fit inside ns; returns 0 otherwise.
 * Plain ASCII bytes also yield 0, since they need no multi-byte handling.
 *
 * Overlong forms, UTF-16 surrogates and code points above U+10FFFF are
 * rejected by narrowing the allowed range of the second byte.  Being strict
 * matters here because the result is used to tell UTF-8 apart from GBK,
 * whose lead bytes overlap the UTF-8 lead-byte range.
 */
int isutf8(char *s, size_t ns)
{
        const uint8_t *p = (const uint8_t *)s;
        uint8_t lead, lo = 0x80, hi = 0xbf;
        size_t nbytes, j;

        if(s == NULL || ns == 0) return 0;

        lead = p[0];
        if(lead >= 0xc2 && lead <= 0xdf)
        {
                nbytes = 2;
        }
        else if(lead >= 0xe0 && lead <= 0xef)
        {
                nbytes = 3;
                if(lead == 0xe0) lo = 0xa0;        /* no overlong 3-byte forms */
                else if(lead == 0xed) hi = 0x9f;   /* no surrogates */
        }
        else if(lead >= 0xf0 && lead <= 0xf4)
        {
                nbytes = 4;
                if(lead == 0xf0) lo = 0x90;        /* no overlong 4-byte forms */
                else if(lead == 0xf4) hi = 0x8f;   /* nothing above U+10FFFF */
        }
        else
        {
                /* ASCII, a stray continuation byte, or an invalid lead. */
                return 0;
        }

        /* A sequence cut off by the end of the buffer is not valid. */
        if(ns < nbytes) return 0;

        if(p[1] < lo || p[1] > hi) return 0;
        for(j = 2; j < nbytes; j++)
        {
                if((p[j] & 0xc0) != 0x80) return 0;
        }
        return (int)nbytes;
}

/*
 * Test whether the bytes at `s` begin with a two-byte GBK character.
 *
 * s  : buffer to inspect.
 * ns : number of readable bytes at s.
 *
 * Returns 1 when at least two bytes are available, the first is a GBK lead
 * byte (0x81..0xFE) and the second is a GBK trail byte (0x40..0x7E or
 * 0x80..0xFE); returns 0 otherwise.  Only the byte ranges are checked, not
 * whether the pair is actually assigned to a character.
 */
int isgbk(char *s, size_t ns)
{
        uint8_t lead, trail;

        /* A pair that ends exactly at the end of the buffer still counts. */
        if(s == NULL || ns < 2) return 0;

        lead = (uint8_t)s[0];
        trail = (uint8_t)s[1];

        if(lead < 0x81 || lead > 0xfe) return 0;
        if(trail < 0x40 || trail > 0xfe || trail == 0x7f) return 0;
        return 1;
}

/*
 * Copy `src` into the buffer at *dst, converting byte pairs that look like
 * GBK from `codefrom` to `codeto` and passing everything else through.
 *
 * src      : input bytes, a mix of UTF-8 and GBK text.
 * nsrc     : number of bytes at src.
 * dst      : *dst is the start of the caller-owned output buffer.
 * ndst     : *ndst is the capacity of that buffer in bytes.
 * codefrom : iconv name of the encoding of the GBK parts (e.g. "gbk").
 * codeto   : iconv name of the target encoding (e.g. "utf8").
 *
 * At each position: a sequence accepted by isutf8() is copied verbatim; a
 * pair accepted by isgbk() is run through iconv; any other byte is copied
 * as-is.  A pair that iconv rejects is not dropped: its first byte is copied
 * as-is and scanning resumes at the next byte.
 *
 * Conversion stops as soon as the output buffer cannot hold the next piece.
 * Nothing is written if the arguments are unusable or the converter cannot
 * be opened.  The output is not NUL-terminated and neither *dst nor *ndst is
 * updated, so callers that want a C string must zero the buffer beforehand
 * and keep one spare byte beyond *ndst.
 */
void convert(char *src, size_t nsrc, char **dst, int *ndst,
        const char *codefrom, const char *codeto)
{
        char *s = src, *d = NULL, *p = NULL;
        iconv_t handler;
        size_t ns = nsrc, nd = 0, x = 0;
        int nbytes = 0;
        char buf[2];

        if(src == NULL || dst == NULL || *dst == NULL
                || ndst == NULL || *ndst <= 0)
        {
                return;
        }
        d = *dst;
        nd = (size_t)*ndst;

        handler = iconv_open(codeto, codefrom);
        if(handler == (iconv_t)(-1)) return;

        while(ns > 0)
        {
                if((nbytes = isutf8(s, ns)) > 0)
                {
                        /* Already in the target form: copy unchanged. */
                        if((size_t)nbytes > nd) break;
                        memcpy(d, s, (size_t)nbytes);
                        s += nbytes;
                        d += nbytes;
                        ns -= (size_t)nbytes;
                        nd -= (size_t)nbytes;
                        continue;
                }
                if(isgbk(s, ns))
                {
                        /* iconv advances its input pointer, so feed it a
                         * scratch copy and move s ourselves on success. */
                        buf[0] = s[0];
                        buf[1] = s[1];
                        p = buf;
                        x = 2;
                        if(iconv(handler, &p, &x, &d, &nd) != (size_t)(-1))
                        {
                                s += 2;
                                ns -= 2;
                                continue;
                        }
                        /* Output buffer exhausted: nothing more fits. */
                        if(errno == E2BIG) break;
                        /* Unconvertible pair: put the converter back in
                         * its initial state and fall through to the raw
                         * single-byte copy below. */
                        iconv(handler, NULL, NULL, NULL, NULL);
                }
                if(nd == 0) break;
                *d++ = *s++;
                ns--;
                nd--;
        }
        iconv_close(handler);
}

#ifdef _DEBUG_UTF8_FILE
/*
 * Debug driver (built with _DEBUG_UTF8_FILE): for each file named on the
 * command line, read it fully into memory, run convert() on it with
 * "gbk" -> "utf8", and print the result to stdout followed by a newline.
 *
 * Files that cannot be opened, are empty, or are too large to describe with
 * the int capacity that convert() takes are reported on stderr (or skipped
 * when empty) and processing continues with the next argument.
 */
int main(int argc, char **argv)
{
        char *s = NULL, *inbuffer = NULL, *outbuffer = NULL;
        struct stat st;
        int i = 0, n = 0, nout = 0;
        size_t cap = 0, remaining = 0, nread = 0;
        FILE *fp = NULL;

        if(argc < 2)
        {
                fprintf(stderr, "Usage:%s file ...\n", argv[0]);
                _exit(-1);
        }

        /* argv[argc] is NULL, so the last valid index is argc - 1. */
        for(i = 1; i < argc; i++)
        {
                if((fp = fopen(argv[i], "r")) == NULL)
                {
                        fprintf(stderr, "open %s failed, %s\n",
                                argv[i], strerror(errno));
                        continue;
                }
                if(stat(argv[i], &st) == 0 && st.st_size > 0)
                {
                        /* Four output bytes per input byte; the capacity
                         * must survive the round trip through int because
                         * convert() takes it as an int. */
                        cap = (size_t)st.st_size * 4;
                        nout = (int)cap;
                        if(nout <= 0 || (size_t)nout != cap
                                || cap / 4 != (size_t)st.st_size)
                        {
                                fprintf(stderr, "%s is too large\n", argv[i]);
                                fclose(fp);
                                continue;
                        }

                        /* The extra zeroed byte keeps both buffers
                         * NUL-terminated for the %s print below. */
                        s = inbuffer = calloc(1, (size_t)st.st_size + 1);
                        outbuffer = calloc(1, cap + 1);
                        if(inbuffer == NULL || outbuffer == NULL)
                        {
                                fprintf(stderr, "out of memory for %s\n", argv[i]);
                                free(inbuffer);
                                free(outbuffer);
                                fclose(fp);
                                continue;
                        }

                        /* Never ask for more than the space still free, so
                         * a file that grew after stat() cannot overflow. */
                        remaining = (size_t)st.st_size;
                        while(remaining > 0
                                && (nread = fread(s, 1, remaining, fp)) > 0)
                        {
                                s += nread;
                                remaining -= nread;
                        }

                        if((n = (int)(s - inbuffer)) > 0)
                        {
                                convert(inbuffer, (size_t)n,
                                                &outbuffer, &nout,
                                                "gbk", "utf8");
                                fprintf(stdout, "%s\n", outbuffer);
                        }
                        else
                        {
                                fprintf(stderr, "read %s %d bytes failed, %s\n",
                                        argv[i], n, strerror(errno));
                        }
                        free(inbuffer);
                        free(outbuffer);
                }
                fclose(fp);
        }
        return 0;
}
#endif

#ifdef _DEBUG_UTF8_STRING
/*
 * Debug driver (built with _DEBUG_UTF8_STRING): classify every command-line
 * argument by looking at its leading bytes and print one line per argument
 * saying whether it starts with a UTF-8 sequence, a GBK pair, or neither.
 */
int main(int argc, char **argv)
{
        int idx = 1, width = 0;
        char *arg = NULL;

        if(argc < 2)
        {
                fprintf(stderr, "Usage:%s string ...\n", argv[0]);
                _exit(-1);
        }

        while(idx < argc)
        {
                arg = argv[idx++];

                /* UTF-8 is tried first; GBK is only considered when the
                 * UTF-8 probe reports no sequence. */
                width = isutf8(arg, strlen(arg));
                if(width > 0)
                {
                        fprintf(stdout, "\"%s\" is %d bytes UTF8 charset\n", arg, width);
                        continue;
                }

                if(isgbk(arg, strlen(arg)))
                        fprintf(stdout, "\"%s\" is GBK charset\n", arg);
                else
                        fprintf(stdout, "\"%s\" is unknown charset\n", arg);
        }
}
#endif

分享到：

Linux头文件 C/C++头文件 | Java如何获取文件编码格式

2013-06-20 16:20
浏览 1489
评论(1)
分类:编程语言
查看更多

1 楼 f002489 2013-06-21

ICU、ICONV：http://bbs.csdn.net/topics/360181231

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

UTF8 和 GBK混合的文本识别转换.....

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

UTF8 和 GBK混合的文本识别转换.....

评论

发表评论

相关推荐

程序员该怎么用Atomic操作？

没有atomic.h后如何在linux实现原子操作

atomic_inc 原子操作

sizeof与strlen的区别

数组和指针的区别

fscanf 跳过空格，读取一行

最近访客更多访客>>