233 lines
7.4 KiB
C
233 lines
7.4 KiB
C
|
/**
|
||
|
* @file unicode_funcs.h
|
||
|
* @author Ambroz Bizjak <ambrop7@gmail.com>
|
||
|
*
|
||
|
* @section LICENSE
|
||
|
*
|
||
|
* Redistribution and use in source and binary forms, with or without
|
||
|
* modification, are permitted provided that the following conditions are met:
|
||
|
* 1. Redistributions of source code must retain the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer.
|
||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||
|
* notice, this list of conditions and the following disclaimer in the
|
||
|
* documentation and/or other materials provided with the distribution.
|
||
|
* 3. Neither the name of the author nor the
|
||
|
* names of its contributors may be used to endorse or promote products
|
||
|
* derived from this software without specific prior written permission.
|
||
|
*
|
||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||
|
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
||
|
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||
|
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
*/
|
||
|
|
||
|
#ifndef BADVPN_UNICODE_FUNCS_H
|
||
|
#define BADVPN_UNICODE_FUNCS_H
|
||
|
|
||
|
#include <misc/expstring.h>
|
||
|
#include <misc/bsize.h>
|
||
|
#include <misc/Utf8Encoder.h>
|
||
|
#include <misc/Utf8Decoder.h>
|
||
|
#include <misc/Utf16Encoder.h>
|
||
|
#include <misc/Utf16Decoder.h>
|
||
|
|
||
|
/**
|
||
|
* Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
|
||
|
*
|
||
|
* @param data UTF-16 data, in big endian
|
||
|
* @param data_len size of data in bytes
|
||
|
* @param out_is_error if not NULL and the function returns a string,
|
||
|
* *out_is_error will be set to 0 or 1, indicating
|
||
|
* whether there have been errors decoding the input.
|
||
|
* A null decoded character is treated as an error.
|
||
|
* @return An UTF-8 null-terminated string which can be freed with free(),
|
||
|
* or NULL if out of memory.
|
||
|
*/
|
||
|
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
|
||
|
|
||
|
/**
|
||
|
* Decodes UTF-8 data into UTF-16 data as bytes.
|
||
|
*
|
||
|
* @param data UTF-8 data
|
||
|
* @param data_len size of data in bytes
|
||
|
* @param out output buffer
|
||
|
* @param out_avail number of bytes available in output buffer
|
||
|
* @param out_len if not NULL, *out_len will contain the number of bytes
|
||
|
* required to store the resulting data (or overflow)
|
||
|
* @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
|
||
|
* indicating whether there have been errors decoding
|
||
|
* the input
|
||
|
*/
|
||
|
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
|
||
|
|
||
|
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
|
||
|
{
|
||
|
// will build the resulting UTF-8 string by appending to ExpString
|
||
|
ExpString str;
|
||
|
if (!ExpString_Init(&str)) {
|
||
|
goto fail0;
|
||
|
}
|
||
|
|
||
|
// init UTF-16 decoder
|
||
|
Utf16Decoder decoder;
|
||
|
Utf16Decoder_Init(&decoder);
|
||
|
|
||
|
// set initial input and input matching positions
|
||
|
size_t i_in = 0;
|
||
|
size_t i_ch = 0;
|
||
|
|
||
|
int error = 0;
|
||
|
|
||
|
while (i_in < data_len) {
|
||
|
// read two input bytes from the input position
|
||
|
uint8_t x = data[i_in++];
|
||
|
if (i_in == data_len) {
|
||
|
break;
|
||
|
}
|
||
|
uint8_t y = data[i_in++];
|
||
|
|
||
|
// combine them into a 16-bit value
|
||
|
uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
|
||
|
|
||
|
// give the 16-bit value to the UTF-16 decoder and maybe
|
||
|
// receive a Unicode character back
|
||
|
uint32_t ch;
|
||
|
if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (!error) {
|
||
|
// encode the Unicode character back into UTF-16
|
||
|
uint16_t chenc[2];
|
||
|
int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
|
||
|
ASSERT(chenc_n > 0)
|
||
|
|
||
|
// match the result with input
|
||
|
for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
|
||
|
uint8_t cx = (chenc[chenc_i] >> 8);
|
||
|
uint8_t cy = (chenc[chenc_i] & 0xFF);
|
||
|
|
||
|
if (i_ch >= data_len || data[i_ch] != cx) {
|
||
|
error = 1;
|
||
|
break;
|
||
|
}
|
||
|
i_ch++;
|
||
|
|
||
|
if (i_ch >= data_len || data[i_ch] != cy) {
|
||
|
error = 1;
|
||
|
break;
|
||
|
}
|
||
|
i_ch++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// we don't like null Unicode characters because we're building a
|
||
|
// null-terminated UTF-8 string
|
||
|
if (ch == 0) {
|
||
|
error = 1;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
// encode the Unicode character into UTF-8
|
||
|
uint8_t enc[5];
|
||
|
int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
|
||
|
ASSERT(enc_n > 0)
|
||
|
|
||
|
// append the resulting UTF-8 bytes to the result string
|
||
|
enc[enc_n] = 0;
|
||
|
if (!ExpString_Append(&str, enc)) {
|
||
|
goto fail1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// check if we matched the whole input string when encoding back
|
||
|
if (i_ch < data_len) {
|
||
|
error = 1;
|
||
|
}
|
||
|
|
||
|
if (out_is_error) {
|
||
|
*out_is_error = error;
|
||
|
}
|
||
|
return ExpString_Get(&str);
|
||
|
|
||
|
fail1:
|
||
|
ExpString_Free(&str);
|
||
|
fail0:
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
|
||
|
{
|
||
|
Utf8Decoder decoder;
|
||
|
Utf8Decoder_Init(&decoder);
|
||
|
|
||
|
size_t i_in = 0;
|
||
|
size_t i_ch = 0;
|
||
|
|
||
|
bsize_t len = bsize_fromsize(0);
|
||
|
|
||
|
int error = 0;
|
||
|
|
||
|
while (i_in < data_len) {
|
||
|
uint8_t x = data[i_in++];
|
||
|
|
||
|
uint32_t ch;
|
||
|
if (!Utf8Decoder_Input(&decoder, x, &ch)) {
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (!error) {
|
||
|
uint8_t chenc[4];
|
||
|
int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
|
||
|
ASSERT(chenc_n > 0)
|
||
|
|
||
|
for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
|
||
|
if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
|
||
|
error = 1;
|
||
|
break;
|
||
|
}
|
||
|
i_ch++;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
uint16_t enc[2];
|
||
|
int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
|
||
|
ASSERT(enc_n > 0)
|
||
|
|
||
|
len = bsize_add(len, bsize_fromsize(2 * enc_n));
|
||
|
|
||
|
for (int enc_i = 0; enc_i < enc_n; enc_i++) {
|
||
|
if (out_avail == 0) {
|
||
|
break;
|
||
|
}
|
||
|
*(out++) = (enc[enc_i] >> 8);
|
||
|
out_avail--;
|
||
|
|
||
|
if (out_avail == 0) {
|
||
|
break;
|
||
|
}
|
||
|
*(out++) = (enc[enc_i] & 0xFF);
|
||
|
out_avail--;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if (i_ch < data_len) {
|
||
|
error = 1;
|
||
|
}
|
||
|
|
||
|
if (out_len) {
|
||
|
*out_len = len;
|
||
|
}
|
||
|
if (out_is_error) {
|
||
|
*out_is_error = error;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#endif
|