<!-- Set Window Title
Deleted on 2008-03-20 by Peter Lee for separating header.php into: header.php, title, header2.php
-->
Note: part of this article is referenced from RFC2279 - UTF-8, a transformation format of ISO 10646.
I'm practising my C programming skills. Some friends had already written code for this, but it was not very readable, so I learned the rules for converting between Unicode and UTF-8 and wrote the two functions shown below.
Please focus on the functions fnUnicode2UTF8() and fnUTF82Unicode() in the source code. Don't be confused by the output-formatting code in the main() function :-)
<at>
http://www.peterlee.com.cn
http://blog.peterlee.com.cn
#include <stdio.h>
#include <string.h>
/* Longest UTF-8 sequence handled here; buffers hold MAX bytes plus a length slot. */
#define MAX 6

/*
 * Encode one code point as UTF-8 using the original (RFC 2279) scheme,
 * which allows sequences of one to six bytes.
 *
 * unicode  code point to encode; 0 .. 0x7FFFFFFF is representable.
 * UTF8     caller-supplied buffer of at least MAX + 1 chars.  The encoded
 *          bytes are stored in UTF8[0 .. len-1] and the byte count len is
 *          stored in UTF8[MAX].  No terminating NUL is written and bytes
 *          past the sequence are left untouched.
 *
 * A value above 0x7FFFFFFF cannot be encoded.  In that case no bytes are
 * written and UTF8[MAX] is set to 0, so the caller never acts on a stale
 * or uninitialised length.
 */
void fnUnicode2UTF8(unsigned long unicode, char UTF8[])
{
    /* Upper bound of each sequence length and the matching lead-byte marker. */
    static const struct {
        unsigned long limit;
        unsigned char lead;
    } ranges[MAX] = {
        { 0x0000007FUL, 0x00 },
        { 0x000007FFUL, 0xC0 },
        { 0x0000FFFFUL, 0xE0 },
        { 0x001FFFFFUL, 0xF0 },
        { 0x03FFFFFFUL, 0xF8 },
        { 0x7FFFFFFFUL, 0xFC }
    };
    int len;
    int i;

    for (len = 1; len <= MAX; ++len) {
        if (unicode <= ranges[len - 1].limit) {
            break;
        }
    }
    if (len > MAX) {
        UTF8[MAX] = 0;          /* not representable: report zero bytes */
        return;
    }

    UTF8[MAX] = (char)len;

    /* Fill continuation bytes from the tail, six payload bits at a time. */
    for (i = len - 1; i > 0; --i) {
        UTF8[i] = (char)(0x80 | (unicode & 0x3F));
        unicode >>= 6;
    }

    /* Whatever bits remain fit in the lead byte below its marker. */
    UTF8[0] = (char)(ranges[len - 1].lead | unicode);
}
/*
 * Decode the UTF-8 sequence (RFC 2279 form, one to six bytes) that starts
 * at UTF8[0] and return its code point.
 *
 * UTF8  buffer of at least MAX + 1 chars.  The sequence is read from
 *       UTF8[0 .. len-1]; the detected byte count len is written back to
 *       UTF8[MAX], so the buffer is modified by this call.
 *
 * Returns the decoded code point.  If the lead byte is not a valid UTF-8
 * lead byte, or a following byte is not a continuation byte (10xxxxxx),
 * the function returns 0 and stores 0 in UTF8[MAX].  A genuine U+0000 is
 * distinguishable because it reports a length of 1.
 *
 * Overlong encodings are not rejected.
 */
unsigned long fnUTF82Unicode(char UTF8[])
{
    const unsigned char lead = (unsigned char)UTF8[0];
    unsigned long unicode;
    int len;
    int i;

    /* The count of leading 1 bits in the lead byte gives the sequence length. */
    if (lead < 0x80) {
        len = 1;
        unicode = lead;
    } else if ((lead & 0xE0) == 0xC0) {
        len = 2;
        unicode = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        len = 3;
        unicode = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        len = 4;
        unicode = lead & 0x07;
    } else if ((lead & 0xFC) == 0xF8) {
        len = 5;
        unicode = lead & 0x03;
    } else if ((lead & 0xFE) == 0xFC) {
        len = 6;
        unicode = lead & 0x01;
    } else {
        /* 0x80-0xBF (stray continuation) or 0xFE/0xFF: not a lead byte. */
        UTF8[MAX] = 0;
        return 0;
    }

    for (i = 1; i < len; ++i) {
        const unsigned char next = (unsigned char)UTF8[i];

        /* Stop at the first malformed byte instead of decoding garbage. */
        if ((next & 0xC0) != 0x80) {
            UTF8[MAX] = 0;
            return 0;
        }
        unicode = (unicode << 6) | (unsigned long)(next & 0x3F);
    }

    UTF8[MAX] = (char)len;
    return unicode;
}
/*
 * Four-bit binary text for each hexadecimal digit, indexed by (digit - '0').
 * In ASCII, 'A' - '0' is 17, so rows 10..16 are empty placeholders for the
 * seven characters that sit between '9' and 'A'.
 */
char Hex2Bin[23][5] = {"0000", "0001", "0010", "0011",
                       "0100", "0101", "0110", "0111",
                       "1000", "1001",
                       "", "", "", "", "", "", "",
                       "1010", "1011",
                       "1100", "1101", "1110", "1111"};

/*
 * Expand a string of hexadecimal digits into a string of '0'/'1' characters.
 *
 * hex  NUL-terminated input; upper- and lower-case digits are accepted.
 *      Characters that are not hexadecimal digits are skipped.
 * bin  output buffer; must have room for 4 chars per hex digit plus the
 *      terminating NUL.  It is always NUL-terminated on return.
 */
void fnHex2Bin(char hex[], char bin[])
{
    char *out = bin;
    size_t i;

    for (i = 0; hex[i] != '\0'; ++i) {
        int c = (unsigned char)hex[i];

        if (c >= 'a' && c <= 'f') {
            c -= 'a' - 'A';     /* fold to upper case so the table applies */
        }
        /* Only index the table with characters it actually covers. */
        if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F')) {
            memcpy(out, Hex2Bin[c - '0'], 4);
            out += 4;
        }
    }
    *out = '\0';
}
/*
 * Demo driver: encodes U+4F60 to UTF-8, decodes it back, and prints each
 * value in hexadecimal and binary.
 */
int main(int argc, char* argv[])
{
    /*
     * 16 hex digits cover both a 64-bit unsigned long printed with %lX and
     * the longest UTF-8 sequence (MAX bytes -> 2 * MAX = 12 digits).
     */
    enum { HEX_DIGITS = 16, BIN_DIGITS = 4 * HEX_DIGITS };
    char UTF8[MAX + 1], bin[BIN_DIGITS + 1], hex[HEX_DIGITS + 1];
    unsigned long unicode = 0x4F60;
    int i, len;

    (void)argc;
    (void)argv;

    printf("Unicode 2 UTF8:\n");
    /* %lX matches unsigned long; plain %X expects unsigned int. */
    snprintf(hex, sizeof hex, "%lX", unicode);
    fnHex2Bin(hex, bin);
    printf("Unicode Hex: %s\n", hex);
    printf("Unicode Bin: %s\n", bin);

    fnUnicode2UTF8(unicode, UTF8);
    len = UTF8[MAX];            /* byte count reported by the encoder */
    for (i = 0; i < len; ++i) {
        snprintf(hex + 2 * i, sizeof hex - (size_t)(2 * i), "%02X",
                 (unsigned)(unsigned char)UTF8[i]);
    }
    hex[2 * len] = '\0';        /* also covers len == 0 */
    fnHex2Bin(hex, bin);
    /* The '0' flag is undefined with %s, so only the width is passed. */
    printf(" UTF Hex: %*s\n", 2 * len, hex);
    printf(" UTF Bin: %s\n", bin);
    printf("\n");

    printf("UTF8 2 Unicode:\n");
    printf(" UTF Hex: %*s\n", 2 * len, hex);
    printf(" UTF Bin: %s\n", bin);
    unicode = fnUTF82Unicode(UTF8);
    snprintf(hex, sizeof hex, "%lX", unicode);
    fnHex2Bin(hex, bin);
    printf("Unicode Hex: %s\n", hex);
    printf("Unicode Bin: %s\n", bin);
    return 0;
}
分享到:
相关推荐
离线安装包,亲测可用
本文将详细介绍这三种编码方式,并提供C语言实现它们之间转换的函数。 GBK编码是中国大陆广泛使用的多字节编码,它是GB2312的扩展,兼容ASCII码,主要针对中文字符集,能够表示大约2万个汉字。GBK编码每个字符通常...
在Utf-8与GB2312之间转换时,由于两者编码方式的不同,可能会出现一些字符无法直接映射的情况,这时工具通常会采用近似替换或者抛出错误的方式来处理。 总之,批量文件编码转换工具是处理多语言或多编码环境下的...
您可以使用 AWS Schema Conversion Tool (AWS SCT) 将现有的数据库架构从一个数据库引擎转换为另一个数据库引擎。您可以转换关系 OLTP 架构或数据仓库架构。转换后的架构适用于 Amazon Relational Database Service ...
### 采样理论与模拟-数字转换 #### 一、引言 《采样理论与模拟-数字转换》是一本深入探讨信号处理领域核心概念的专业书籍。本书由Patrick Jungwirth博士撰写,全面覆盖了从基本原理到高级应用的相关内容。在数字...
AbstractƩ-Δ analog-to-digital converters are widely used in motor drives where high signal integrity and galvanic isolation are required. While the Σ-Δ technology itself is well understood, the ...
Dataconversion库可能专注于数据转换,即帮助用户在不同格式之间转换数据,这在数据分析和数据处理项目中极为重要。 描述中提到的"解压后可用"意味着该.whl文件在下载后无需进一步压缩,可以直接在Python环境中通过...
An Eyuyan library dealing with Unicode string conversion.Unicode-Eyuyan一个进行 Unicode 相关字符串转换的易语言库。一个处理 Unicode 字符串转换的 Eyuyan 库。背景 Background易语言是一个面向初学者的编程...
微小的 4.3描述Tiny-utf8是一个库,用于将Unicode非常轻松地集成到任意C ++ 11项目中。 该库仅由类utf8_string组成,该类utf8_string替代std::string 。 它的实现成功地介于小内存占用和快速访问之间。 因此, std::...
**PyPI 官网下载 | rios.conversion-0.2.1-py2.py3-none-any.whl** PyPI(Python Package Index)是Python开发者发布和分享自己编写的Python库的主要平台。这个资源,"rios.conversion-0.2.1-py2.py3-none-any.whl...
书籍的ISBN号码是0-7803-1093-4,可以通过IEEE出版社以折扣价购买(购买数量较大时)。 5. 书籍的内容涉及到模拟至数字转换器(Analog-to-digital converters)的设计与构造、数字至模拟转换器(Digital-to-analog ...
本文档“Understanding and Minimising ADC Conversion Errors”由STMicroelectronics发布,旨在解释模拟数字转换器(ADC)的各种错误及其相关技术,帮助应用开发者最小化这些错误。文档首先介绍了ADC的重要性,即它...
提供的文件 "Geode_Conversion-2.12.3-cp38-cp38-win_amd64.whl" 是Python的wheel文件,这是一种预编译的Python包格式。Wheel文件的优点在于它能够直接安装,无需先进行编译,大大简化了安装过程,特别是在Windows...
标题"utf_string_conversion_utils.rar_conversion"暗示了我们关注的焦点是UTF(Unicode Transformation Format)字符串转换工具,这通常涉及到不同字符编码之间的转换,特别是针对Linux源代码的情况。UTF是一种广泛...
在IT行业中,转换测试是软件开发过程中的一个重要环节,它涉及到数据类型的转换,尤其是数值类型与字符类型之间的转换。此"Conversion-Test-Value.rar_conversion"项目似乎专注于将十进制(decimal)数字转化为字符...
资源分类:Python库 所属语言:Python 使用前提:需要解压 资源全名:Geode_Conversion-2.12.2-cp38-cp38-win_amd64.whl 资源来源:官方 安装方法:https://lanzao.blog.csdn.net/article/details/101784059
We propose and experimentally demonstrate mutual optical format conversion between signals characterized as 10-Gb/s nonreturn-to-zero on-off-keying (NRZ-OOK) and NRZ binary phase-shift keying (BPSK) ...
官方离线安装包,亲测可用
本篇文章将详细探讨“GBK转UTF-8”的相关知识,帮助你理解和掌握这两种编码之间的转换。 首先,我们来了解GBK编码。GBK是中国大陆广泛使用的汉字编码标准,全称为“汉字内码扩展规范”(GB 2312-80的扩展)。它兼容...