So here is my new UTF8Decoder and converter
/*
UTF8Decoder.c
This program converts a utf-8 encoded string to utf-16 hexadecimal code sequence
UTF-8 is a variable-width encoding of Unicode.
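UTF-16 is also variable-width: one 16-bit code unit for characters in the Basic Multilingual Plane (U+0000..U+FFFF), and a pair of code units (a surrogate pair) for everything above it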
A UTF-8 decoder must not accept UTF-8 sequences that are longer than necessary to
encode a character. For example, the character U+000A (line feed) must be accepted from
a UTF-8 stream only in the form 0x0A, but not in any of the following five possible overlong forms:
0xC0 0x8A
0xE0 0x80 0x8A
0xF0 0x80 0x80 0x8A
0xF8 0x80 0x80 0x80 0x8A
0xFC 0x80 0x80 0x80 0x80 0x8A
Ref: UTF-8 and Unicode FAQ for Unix/Linux http://www.cl.cam.ac.uk/~mgk25/unicode.html
Author: Santhosh Thottingal <santhosh.thottingal at gmail.com>
License: This program is licensed under GPLv3 or later version(at your choice)
*/
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
/*
 * Decode one UTF-8 sequence and return it as a single UTF-16 code unit.
 *
 * text: NUL-terminated UTF-8 byte string.
 * ptr:  in/out byte index into text.  On entry it is the index of the first
 *       byte to decode; on return it is the index of the first byte that was
 *       not consumed.  At least one byte is always consumed, so a caller
 *       looping while *ptr < length is guaranteed to terminate.
 *
 * Returns the decoded code point when it fits in one UTF-16 code unit.
 * Returns U+FFFD (the replacement character) when the input is not a
 * well-formed sequence or cannot be represented in a single code unit:
 *   - a stray continuation byte or an invalid lead byte (0xF8..0xFF);
 *   - a sequence cut short by a non-continuation byte (including the NUL);
 *   - an overlong form such as 0xC0 0x8A or 0xE0 0x80 0x8A;
 *   - an encoded surrogate (U+D800..U+DFFF);
 *   - any four-byte sequence, since code points above U+FFFF would need a
 *     surrogate pair and this function returns only one code unit.
 * A return value of 0xFFFD is therefore ambiguous with a literal U+FFFD in
 * the input.
 */
unsigned short
utf8_to_utf16 (unsigned char *text, int *ptr)
{
  const unsigned short replacement = 0xFFFD;
  unsigned char lead = text[*ptr];
  unsigned long cp;       /* code point being assembled */
  unsigned long minimum;  /* smallest value this length may legally encode */
  int trailing;           /* continuation bytes still expected */

  /* Consume the lead byte unconditionally so that every call makes progress. */
  (*ptr)++;

  if (lead < 0x80)                      /* 0xxxxxxx: plain ASCII */
    return lead;

  if ((lead & 0xE0) == 0xC0)            /* 110xxxxx: two-byte sequence */
    {
      cp = lead & 0x1F;
      trailing = 1;
      minimum = 0x80;
    }
  else if ((lead & 0xF0) == 0xE0)       /* 1110xxxx: three-byte sequence */
    {
      cp = lead & 0x0F;
      trailing = 2;
      minimum = 0x800;
    }
  else if ((lead & 0xF8) == 0xF0)       /* 11110xxx: four-byte sequence */
    {
      cp = lead & 0x07;
      trailing = 3;
      minimum = 0x10000;
    }
  else                                  /* stray 10xxxxxx or 0xF8..0xFF */
    return replacement;

  for (; trailing > 0; trailing--)
    {
      unsigned char next = text[*ptr];

      /* A non-continuation byte (possibly the terminating NUL) ends the
         sequence early.  Leave it unconsumed: it starts the next character. */
      if ((next & 0xC0) != 0x80)
        return replacement;

      cp = (cp << 6) | (unsigned long) (next & 0x3F);
      (*ptr)++;
    }

  if (cp < minimum                      /* overlong encoding */
      || cp > 0xFFFF                    /* does not fit one UTF-16 unit */
      || (cp >= 0xD800 && cp <= 0xDFFF))        /* surrogate code point */
    return replacement;

  return (unsigned short) cp;
}
/*
 * Test driver: decode a fixed UTF-8 sample with utf8_to_utf16 and print each
 * returned UTF-16 code unit as four hexadecimal digits.
 *
 * The sample is the author's name in Malayalam script.  It is written as
 * explicit UTF-8 bytes, one adjacent literal per character, so that the input
 * does not depend on how this source file is saved and so that the trailing
 * zero-width joiner (U+200D), which is invisible in an editor, cannot be
 * lost.  The array is unsigned char to match the decoder's parameter type.
 */
int
main (void)
{
  unsigned char instr[] =
    "\xE0\xB4\xB8" "\xE0\xB4\xA8" "\xE0\xB5\x8D" "\xE0\xB4\xA4"
    "\xE0\xB5\x8B" "\xE0\xB4\xB7" "\xE0\xB5\x8D" " "
    "\xE0\xB4\xA4" "\xE0\xB5\x8B" "\xE0\xB4\x9F" "\xE0\xB5\x8D"
    "\xE0\xB4\x9F" "\xE0\xB4\xBF" "\xE0\xB4\x99" "\xE0\xB5\x8D"
    "\xE0\xB4\x99" "\xE0\xB4\xB2" "\xE0\xB5\x8D" "\xE2\x80\x8D";
  int length = (int) (sizeof instr - 1);        /* exclude the NUL */
  int i = 0;

  /* utf8_to_utf16 advances i past each sequence it decodes. */
  while (i < length)
    {
      printf ("0x%.4x ", (unsigned int) utf8_to_utf16 (instr, &i));
    }
  printf ("\n");
  /* output is:
     0x0d38 0x0d28 0x0d4d 0x0d24 0x0d4b 0x0d37 0x0d4d 0x0020 0x0d24 0x0d4b 0x0d1f 0x0d4d 0x0d1f 0x0d3f 0x0d19 0x0d4d 0x0d19 0x0d32 0x0d4d 0x200d
   */
  return 0;
}
Libraries for this may already exist, but writing a simple one ourselves is fun and a good learning experience.
For example, in python, to get the UTF-16 code sequence for a unicode string, we can use this:
str=u"സന്തോഷ്"
print repr(str)
This gives the following output
u'\u0d38\u0d28\u0d4d\u0d24\u0d4b\u0d37\u0d4d'
