5 kx
5 kx #ifdef HAVE_CONFIG_H
5 kx #include <config.h>
5 kx #endif
5 kx
5 kx #include <stdlib.h>
5 kx #include <stdio.h>
5 kx #include <unistd.h>
5 kx #include <string.h>
5 kx #include <stdarg.h>
5 kx #include <limits.h>
5 kx #include <locale.h>
5 kx #include <wchar.h>
5 kx #include <wctype.h>
5 kx
5 kx #include <defs.h>
5 kx #include <utf8ing.h>
5 kx
5 kx
5 kx static const ucs4_t replacement_char = 0xfffd;
5 kx static const ucs4_t maximum_ucs4 = 0x7fffffff;
5 kx
5 kx static const int half_shift = 10;
5 kx static const ucs4_t half_base = 0x0010000;
5 kx
5 kx static const ucs4_t surrogate_high_start = 0xd800;
5 kx static const ucs4_t surrogate_high_end = 0xdbff;
5 kx static const ucs4_t surrogate_low_start = 0xdc00;
5 kx static const ucs4_t surrogate_low_end = 0xdfff;
5 kx
5 kx static utf8_t
5 kx first_byte_mark[7] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
5 kx
5 kx
5 kx /***************************************************************
5 kx static copy_ucs4_to_utf8()
5 kx
5 kx Переводит строку символов UCS4( src ) в UTF8( dest ).
5 kx
5 kx Возвращаемое значение:
5 kx Количество байт, реально записанное в DEST.
5 kx
5 kx NOTE:
5 kx Выход за пределы памяти, выделенной под указатель DEST
5 kx не контролируются.
5 kx Подразумевается, что строка SRC имеет null-терминатор.
5 kx ***************************************************************/
5 kx int copy_ucs4_to_utf8( utf8_t *dest, const ucs4_t *src )
5 kx {
5 kx utf8_t target[7];
5 kx utf8_t *ptr;
5 kx int count = 0;
5 kx
5 kx while( *src )
5 kx {
5 kx ucs4_t c;
5 kx int bytes_to_write = 0;
5 kx const ucs4_t byte_mask = 0xbf;
5 kx const ucs4_t byte_mark = 0x80;
5 kx
5 kx c = *src++;
5 kx
5 kx if( c >= surrogate_high_start &&
5 kx c <= surrogate_high_end && *src )
5 kx {
5 kx ucs4_t c2 = *src;
5 kx
5 kx if( c2 >= surrogate_low_start &&
5 kx c2 <= surrogate_low_end )
5 kx {
5 kx c = ((c - surrogate_high_start) << half_shift) +
5 kx (c2 - surrogate_low_start) + half_base;
5 kx ++src;
5 kx }
5 kx }
5 kx
5 kx if( c < 0x80 ) bytes_to_write = 1;
5 kx else if( c < 0x800 ) bytes_to_write = 2;
5 kx else if( c < 0x10000 ) bytes_to_write = 3;
5 kx else if( c < 0x200000 ) bytes_to_write = 4;
5 kx else if( c < 0x4000000 ) bytes_to_write = 5;
5 kx else if( c <= maximum_ucs4 ) bytes_to_write = 6;
5 kx else
5 kx {
5 kx bytes_to_write = 2; c = replacement_char;
5 kx }
5 kx
5 kx ptr = &target[0] + bytes_to_write;
5 kx
5 kx switch( bytes_to_write )
5 kx {
5 kx case 6:
5 kx *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
5 kx case 5:
5 kx *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
5 kx case 4:
5 kx *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
5 kx case 3:
5 kx *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
5 kx case 2:
5 kx *--ptr = (c | byte_mark) & byte_mask; c >>= 6;
5 kx case 1:
5 kx *--ptr = c | first_byte_mark[bytes_to_write];
5 kx }
5 kx
5 kx ptr = &target[0];
5 kx
5 kx while( bytes_to_write > 0 )
5 kx {
5 kx *dest++ = *ptr++; /* write byte */
5 kx --bytes_to_write;
5 kx ++count;
5 kx }
5 kx
5 kx } /* End while( *src ) */
5 kx
5 kx *dest = (utf8_t)0; /* null terminator */
5 kx
5 kx return( count );
5 kx
5 kx } /* End of static copy_ucs4_to_utf8() */