iconv clucene

sealbird

浏览: 601432 次
性别:
来自: 广州

最近访客更多访客>>

ladies_killer

wbsh583

u012363178

dilimic120

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

clucene

Linux UP

/////////////////////////////////////////////////////////////////////////////////////
/// The following declarations are excerpted from the source code of the iconv
/// command-line program on Linux. They are reproduced here so that invalid
/// characters encountered during transcoding can be handled.
////////////////////////////////////////////////////////////////////////////////////
/* Empty placeholder type; only used as the type of conv_struct's `hooks` member. */
struct iconv_hooks {};
/* Empty placeholder type; only used as the type of conv_struct's `fallbacks` member. */
struct iconv_fallbacks {};
/* Wide-character type used by the mbtowc/wctomb callback signatures below. */
typedef unsigned int ucs4_t;
/* Pointer to the converter state structure (struct conv_struct, defined below). */
typedef struct conv_struct * conv_t;
/*
 * Pair of function pointers stored as the first member of struct conv_struct.
 * NOTE(review): presumably these are the conversion-loop entry points of the
 * iconv implementation this was copied from -- confirm against that source.
 */
struct loop_funcs {
  /* Takes input/output buffer cursors and their remaining byte counts. */
  size_t (*loop_convert) (iconv_t icd,
                          const char* * inbuf, size_t *inbytesleft,
                          char* * outbuf, size_t *outbytesleft);
  /* Takes only an output buffer cursor and its remaining byte count. */
  size_t (*loop_reset) (iconv_t icd,
                        char* * outbuf, size_t *outbytesleft);
};
/*
 * Input-side callbacks (multibyte -> wide character); stored in
 * conv_struct.ifuncs. Each member is documented by the comment that follows it.
 */
struct mbtowc_funcs {
  int (*xxx_mbtowc) (conv_t conv, ucs4_t *pwc, unsigned char const *s, int n);
  /*
   * int xxx_mbtowc (conv_t conv, ucs4_t *pwc, unsigned char const *s, int n)
   * converts the byte sequence starting at s to a wide character. Up to n bytes
   * are available at s. n is >= 1.
   * Result is number of bytes consumed (if a wide character was read),
   * or -1 if invalid, or -2 if n too small, or -2-(number of bytes consumed)
   * if only a shift sequence was read.
   */
  int (*xxx_flushwc) (conv_t conv, ucs4_t *pwc);
  /*
   * int xxx_flushwc (conv_t conv, ucs4_t *pwc)
   * returns to the initial state and stores the pending wide character, if any.
   * Result is 1 (if a wide character was read) or 0 if none was pending.
   */
};
/*
 * Output-side callbacks (wide character -> multibyte); stored in
 * conv_struct.ofuncs. Each member is documented by the comment that follows it.
 */
struct wctomb_funcs {
  int (*xxx_wctomb) (conv_t conv, unsigned char *r, ucs4_t wc, int n);
  /*
   * int xxx_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
   * converts the wide character wc to the character set xxx, and stores the
   * result beginning at r. Up to n bytes may be written at r. n is >= 1.
   * Result is number of bytes written, or -1 if invalid, or -2 if n too small.
   */
  int (*xxx_reset) (conv_t conv, unsigned char *r, int n);
  /*
   * int xxx_reset (conv_t conv, unsigned char *r, int n)
   * stores a shift sequence returning to the initial state beginning at r.
   * Up to n bytes may be written at r. n is >= 0.
   * Result is number of bytes written, or -2 if n too small.
   */
};
/* Type of the istate/ostate members of struct conv_struct. */
typedef unsigned int state_t;
/*
 * Converter state. Code in this file casts an iconv_t to conv_t and writes
 * the `discard_ilseq` member directly, so the order and types of every member
 * that precedes it must not be changed.
 * NOTE(review): this only works if the iconv library linked at run time uses
 * exactly this layout for the object behind iconv_t -- confirm for the target
 * platform.
 */
struct conv_struct {
  struct loop_funcs lfuncs;
  /* Input (conversion multibyte -> unicode) */
  int iindex;
  struct mbtowc_funcs ifuncs;
  state_t istate;
  /* Output (conversion unicode -> multibyte) */
  int oindex;
  struct wctomb_funcs ofuncs;
  int oflags;
  state_t ostate;
  /* Operation flags */
  int transliterate;
  int discard_ilseq;  /* set to 1 by the conversion code after an EILSEQ error */
  /* Trailing members exist only when LIBICONV_PLUG is not defined; both use
     the empty placeholder struct types declared earlier in this file. */
  #ifndef LIBICONV_PLUG
  struct iconv_fallbacks fallbacks;
  struct iconv_hooks hooks;
  #endif
};

////////////////////////////////////////////////////////////
/// 		End of excerpted code
////////////////////////////////////////////////////////////

int __charcode_convert__(LPCSTR from, LPCSTR to, LPSTR save, int savelen, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    iconv_t cd;
    char *inbuf = src;
    char *outbuf = save;
    size_t outbufsize = savelen;
    int status = 0;
    size_t  savesize = 0;
    size_t inbufsize = srclen;
    const char* inptr = inbuf;
    size_t	insize = inbufsize;
    char* outptr = outbuf;
    size_t outsize = outbufsize;

	if (!ignore_invalid_sequence)
    	cd = iconv_open(to, from);
    else
	{
		char tochartset[64]={0};
		sprintf(tochartset, "%s//IGNORE", to);
    	cd = iconv_open(tochartset, from);
    }

    if (cd == (iconv_t)(-1))
    {
    	printf("iconv_open oper error!\n");
		status = -1;
        goto done;
    }

    iconv(cd, NULL, NULL, NULL, NULL);
    if (inbufsize == 0) 
	{
        status = -1;
        goto done;
    }

	int invaild_do;
	invaild_do = 0;

    while (insize > 0) 
	{
		size_t res = iconv(cd, (char**)&inptr, &insize, &outptr, &outsize);
        if (res == (size_t)(-1)) 
		{
            if (errno == EILSEQ) 
			{
				if (invaild_do == 0)
				{
	                ((conv_t)cd)->discard_ilseq = 1;
	                invaild_do = 1;
	                continue;
	            }

                status = -3;
                goto done;
            }
            else if (errno == EINVAL) 
            {
                if (inbufsize == 0) 
                {
                    status = -4;
                    goto done;
                }
                else 
                {
                    break;
                }
            }
            else if (errno == E2BIG) 
            {
                status = -5;
                goto done;
            }
            else 
            {
                status = -6;
                goto done;
            }
        }

		invaild_do = 0;

        if (outptr != outbuf) 
		{
            int saved_errno = errno;
            int outsize = outptr - outbuf;
            strncpy(save+savesize, outbuf, outsize);
            errno = saved_errno;
        }

        lj_sleep(0, 1);
    }

    status = strlen(save);
	status = status > 0 ? 0 : -1;

done:
    iconv_close(cd);
    return status;
}

/*
 * Converts srclen bytes at `src` from charset `from` to charset `to` and
 * returns the result as a newly allocated, NUL-terminated string.
 *
 * The output buffer is sized at 4 bytes per input byte plus a terminator.
 * If the conversion fails, or the arguments are unusable (NULL `src`,
 * non-positive srclen, or a length whose 4x size does not fit in an int), an
 * empty string is returned instead.
 *
 * Returns NULL only when memory allocation fails. The caller owns the
 * returned buffer and is responsible for releasing it.
 */
char *charcode_convert(LPCSTR from, LPCSTR to, LPSTR src, int srclen, bool ignore_invalid_sequence)
{
    /* Largest srclen for which 4*srclen still fits in an int (the type of the
       callee's length parameter); (~0u >> 1) is the largest int value. */
    const int max_srclen = (int)((~0u >> 1) / 4);
    char *outbuf;

    if (src != NULL && srclen > 0 && srclen <= max_srclen)
    {
        int capacity = 4 * srclen;

        /* calloc zero-fills, so the buffer starts out as an empty string and
           always keeps one spare terminator byte beyond `capacity`. */
        outbuf = (char*)calloc((size_t)capacity + 1, sizeof(char));
        if (outbuf == NULL)
            return NULL;

        if (__charcode_convert__(from, to, outbuf, capacity, src, srclen, ignore_invalid_sequence) == 0)
            return outbuf;

        LJFREE(outbuf);
    }

    /* Failure path: hand back an empty string. */
    return (char*)calloc(1, sizeof(char));
}

/*
 * Converts the NUL-terminated UTF-8 string `utf8buf` to the charset named by
 * `to_chna_charset` via charcode_convert() and returns its result.
 * A NULL `utf8buf` is passed on as zero-length input instead of being handed
 * to strlen().
 */
char* utf8_to_chna(char *utf8buf, bool ignore_invalid_sequence, LPCSTR to_chna_charset)	
{
	size_t utf8len = (utf8buf != NULL) ? strlen(utf8buf) : 0;

	return charcode_convert("UTF-8", to_chna_charset, utf8buf, utf8len, ignore_invalid_sequence);
}

/*
 * Converts the NUL-terminated string `chnabuf`, encoded in the charset named
 * by `frm_chna_charset`, to UTF-8 via charcode_convert() and returns its
 * result.
 * A NULL `chnabuf` is passed on as zero-length input instead of being handed
 * to strlen().
 */
char* chna_to_utf8(char *chnabuf, bool ignore_invalid_sequence, LPCSTR frm_chna_charset)
{
	size_t chnalen = (chnabuf != NULL) ? strlen(chnabuf) : 0;

	return charcode_convert(frm_chna_charset, "UTF-8", chnabuf, chnalen, ignore_invalid_sequence);
}

分享到：

小记：ICONV库，开源的编码转换工具 | 培养人脉的106个技巧

2011-01-14 16:17
浏览 1192
评论(0)
分类:编程语言
查看更多

发表评论

您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

iconv clucene

评论

发表评论

相关推荐

最近访客 更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论

iconv clucene

评论

发表评论

相关推荐

clucene 读源码记录

小记：ICONV库，开源的编码转换工具

WideCharToMultiByte和MultiByteToWideChar函数的用法

CLucene源码剖析(三) 实现跨平台的线程安全

wchar 与 char 的互换

Clucene C++编码转换

CLucene 中文分词

Clucene实现中文分词搜索（转载）

使用clucene对汉字文本进行索引

clucene编译

最近访客更多访客>>