#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <stdarg.h>
#include <limits.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>
#include <defs.h>
#include <utf8ing.h>
/* Code point substituted for input values greater than maximum_ucs4. */
static const ucs4_t replacement_char = 0xfffd;
/* Largest input value that is encoded as is (upper bound of the
   six-byte form). */
static const ucs4_t maximum_ucs4 = 0x7fffffff;

/* Surrogate pair arithmetic:
   c = ((high - surrogate_high_start) << half_shift)
       + (low - surrogate_low_start) + half_base */
static const int half_shift = 10;
static const ucs4_t half_base = 0x0010000;

/* Inclusive bounds of the high and low surrogate ranges. */
static const ucs4_t surrogate_high_start = 0xd800;
static const ucs4_t surrogate_high_end = 0xdbff;
static const ucs4_t surrogate_low_start = 0xdc00;
static const ucs4_t surrogate_low_end = 0xdfff;

/* Marker bits OR-ed into the first byte of a sequence, indexed by the
   total number of bytes in that sequence (1..6).  Index 0 is only a
   placeholder.  The table is read-only, hence const. */
static const utf8_t
first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
/***************************************************************
  copy_ucs4_to_utf8()

  Converts the UCS4 string SRC to UTF-8 and stores the result,
  followed by a null terminator, in DEST.

  A high surrogate (0xd800..0xdbff) immediately followed by a
  low surrogate (0xdc00..0xdfff) is combined into a single
  code point before encoding.  A surrogate without a partner
  is encoded as is (three bytes).  Values up to 0x7fffffff are
  encoded in one to six bytes; any value that compares greater
  is replaced by U+FFFD (three bytes: EF BF BD).

  Return value:
    The number of bytes actually written to DEST, not counting
    the null terminator.

  NOTE:
    Writes beyond the memory allocated for DEST are not
    detected.  Each SRC element produces at most six bytes,
    and one more byte is needed for the terminator.
    SRC is assumed to be null-terminated.
 ***************************************************************/
int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
{
   /* (c | byte_mark) & byte_mask yields a continuation byte
      10xxxxxx holding the low six bits of c. */
   const ucs4_t byte_mask = 0xbf;
   const ucs4_t byte_mark = 0x80;
   int count = 0;

   while( *src )
   {
      utf8_t *ptr;
      int bytes_to_write;
      ucs4_t c = *src++;

      /* Combine a high/low surrogate pair into one code point. */
      if( c >= surrogate_high_start &&
          c <= surrogate_high_end && *src )
      {
         ucs4_t c2 = *src;

         if( c2 >= surrogate_low_start &&
             c2 <= surrogate_low_end )
         {
            c = ((c - surrogate_high_start) << half_shift) +
                (c2 - surrogate_low_start) + half_base;
            ++src;   /* the low surrogate has been consumed */
         }
      }

      /* Decide how many bytes the encoded form occupies. */
      if( c < 0x80 )               bytes_to_write = 1;
      else if( c < 0x800 )         bytes_to_write = 2;
      else if( c < 0x10000 )       bytes_to_write = 3;
      else if( c < 0x200000 )      bytes_to_write = 4;
      else if( c < 0x4000000 )     bytes_to_write = 5;
      else if( c <= maximum_ucs4 ) bytes_to_write = 6;
      else
      {
         /* U+FFFD lies in 0x800..0xffff, so it needs three bytes;
            with two, its upper bits would leak into the lead byte
            and produce an invalid sequence. */
         c = replacement_char;
         bytes_to_write = 3;
      }

      /* Fill the sequence back to front, directly in DEST: the
         continuation bytes first, the lead byte last. */
      ptr = dest + bytes_to_write;
      switch( bytes_to_write )
      {
         case 6:
            *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
            /* fall through */
         case 5:
            *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
            /* fall through */
         case 4:
            *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
            /* fall through */
         case 3:
            *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
            /* fall through */
         case 2:
            *--ptr = (utf8_t)((c | byte_mark) & byte_mask); c >>= 6;
            /* fall through */
         case 1:
            *--ptr = (utf8_t)(c | first_byte_mark[bytes_to_write]);
            break;
         default:
            break;   /* not reached: bytes_to_write is always 1..6 */
      }

      dest  += bytes_to_write;
      count += bytes_to_write;
   } /* End while( *src ) */

   *dest = (utf8_t)0; /* null terminator, not included in count */
   return( count );
} /* End of copy_ucs4_to_utf8() */