140 lines
3.3 KiB
C
140 lines
3.3 KiB
C
#include "charset.h"
|
|
#include <string.h>
|
|
|
|
/*
 * Report whether the buffer begins with the UTF-16 little-endian
 * byte-order mark (bytes 0xFF 0xFE).
 *
 * in:  bytes to inspect
 * len: number of readable bytes at in
 *
 * Returns 1 when the mark is present, 0 otherwise (including when fewer
 * than two bytes are available).
 */
static int is_utf16_le(const unsigned char *in, size_t len)
{
    if (len < 2)
        return 0;
    if (in[0] != 0xFF)
        return 0;
    return in[1] == 0xFE;
}
|
|
|
|
/*
 * Report whether the buffer begins with the UTF-16 big-endian
 * byte-order mark (bytes 0xFE 0xFF).
 *
 * in:  bytes to inspect
 * len: number of readable bytes at in
 *
 * Returns 1 when the mark is present, 0 otherwise; buffers shorter than
 * two bytes never match.
 */
static int is_utf16_be(const unsigned char *in, size_t len)
{
    int found = 0;

    if (len >= 2) {
        const int first_ok = (in[0] == 0xFE);
        const int second_ok = (in[1] == 0xFF);

        found = first_ok && second_ok;
    }
    return found;
}
|
|
|
|
/*
 * Heuristically decide whether a buffer holds UTF-8 encoded text.
 *
 * The whole buffer must be well-formed UTF-8 (RFC 3629) AND contain at
 * least one multi-byte sequence.  Pure 7-bit ASCII therefore returns 0.
 *
 * Rejected as malformed:
 *   - stray continuation bytes and truncated sequences;
 *   - overlong encodings (lead bytes 0xC0/0xC1, 0xE0 followed by < 0xA0,
 *     0xF0 followed by < 0x90);
 *   - UTF-16 surrogates U+D800..U+DFFF (0xED followed by >= 0xA0);
 *   - values above U+10FFFF (0xF4 followed by >= 0x90, lead bytes >= 0xF5).
 *
 * in:  bytes to inspect
 * len: number of readable bytes at in
 *
 * Returns 1 for well-formed UTF-8 with non-ASCII content, 0 otherwise.
 */
static int is_utf8(const unsigned char *in, size_t len)
{
    size_t i = 0;
    int has_multibyte = 0;

    while (i < len) {
        const unsigned char lead = in[i];
        size_t extra;              /* continuation bytes that must follow */
        unsigned char lo = 0x80;   /* allowed range of the FIRST continuation */
        unsigned char hi = 0xBF;
        size_t k;

        if (lead < 0x80) {
            i++;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            extra = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            extra = 2;
            if (lead == 0xE0)
                lo = 0xA0;         /* below this would be overlong */
            else if (lead == 0xED)
                hi = 0x9F;         /* above this would be a surrogate */
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            extra = 3;
            if (lead == 0xF0)
                lo = 0x90;         /* below this would be overlong */
            else if (lead == 0xF4)
                hi = 0x8F;         /* above this exceeds U+10FFFF */
        } else {
            /* 0x80..0xC1 and 0xF5..0xFF can never start a sequence. */
            return 0;
        }

        /* Need `extra` more bytes after the lead; avoids i + extra overflow. */
        if (len - i <= extra)
            return 0;

        if (in[i + 1] < lo || in[i + 1] > hi)
            return 0;
        for (k = 2; k <= extra; k++) {
            if ((in[i + k] & 0xC0) != 0x80)
                return 0;
        }

        has_multibyte = 1;
        i += extra + 1;
    }

    return has_multibyte;
}
|
|
|
|
/*
 * Decode one UTF-8 sequence starting at in[0].
 *
 * in:        first byte of the sequence
 * remaining: number of readable bytes at in
 * cp:        receives the decoded code point on success; left untouched
 *            on failure
 *
 * Returns the number of bytes consumed (1..4), or 0 when no valid
 * sequence starts here: empty input, truncated sequence, a byte that is
 * not a valid lead, a continuation byte not of the form 10xxxxxx, an
 * overlong encoding, a UTF-16 surrogate (U+D800..U+DFFF) or a value
 * above U+10FFFF.
 */
static int utf8_to_codepoint(const unsigned char *in, size_t remaining,
                             unsigned int *cp)
{
    unsigned int value;
    unsigned int min;   /* smallest value that legitimately needs `need` bytes */
    int need;
    int k;

    if (remaining == 0)
        return 0;

    if (in[0] < 0x80) {
        *cp = in[0];
        return 1;
    }

    if ((in[0] & 0xE0) == 0xC0) {
        need = 2;
        value = in[0] & 0x1Fu;
        min = 0x80;
    } else if ((in[0] & 0xF0) == 0xE0) {
        need = 3;
        value = in[0] & 0x0Fu;
        min = 0x800;
    } else if ((in[0] & 0xF8) == 0xF0) {
        need = 4;
        value = in[0] & 0x07u;
        min = 0x10000;
    } else {
        return 0;
    }

    if (remaining < (size_t)need)
        return 0;

    for (k = 1; k < need; k++) {
        if ((in[k] & 0xC0) != 0x80)
            return 0;
        value = (value << 6) | (in[k] & 0x3Fu);
    }

    /* Reject overlong forms, surrogates and values beyond Unicode. */
    if (value < min || value > 0x10FFFF ||
        (value >= 0xD800 && value <= 0xDFFF))
        return 0;

    *cp = value;
    return need;
}
|
|
|
|
/*
 * Convert UTF-16 little-endian text to a NUL-terminated ISO-8859-1 string.
 *
 * The first two input bytes are skipped unconditionally; the visible
 * caller only invokes this after detecting the 0xFF 0xFE byte-order mark
 * there.  Each code point up to U+00FF is copied as a single byte; every
 * other code point becomes one '?'.  A well-formed surrogate pair counts
 * as ONE code point and so produces a single '?'.  An unpaired surrogate
 * also produces one '?'.  A trailing odd byte is ignored.
 *
 * in/in_len:   input bytes, including the 2-byte mark
 * out/out_size: destination buffer; at most out_size - 1 characters are
 *               written, followed by a terminating NUL
 *
 * Returns the number of characters written, excluding the NUL.  Returns 0
 * without touching out when out_size is 0.
 */
static int utf16le_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i, o = 0;

    if (out_size == 0)
        return 0;   /* no room even for the terminator */

    for (i = 2; i + 1 < in_len && o + 1 < out_size; i += 2) {
        unsigned int unit = (unsigned int)in[i] |
                            ((unsigned int)in[i + 1] << 8);

        /* High surrogate followed by a low surrogate: swallow both. */
        if (unit >= 0xD800 && unit <= 0xDBFF && i + 3 < in_len) {
            unsigned int next = (unsigned int)in[i + 2] |
                                ((unsigned int)in[i + 3] << 8);

            if (next >= 0xDC00 && next <= 0xDFFF)
                i += 2;
        }

        out[o++] = unit <= 0xFF ? (char)unit : '?';
    }
    out[o] = '\0';
    return (int)o;
}
|
|
|
|
/*
 * Convert UTF-16 big-endian text to a NUL-terminated ISO-8859-1 string.
 *
 * The first two input bytes are skipped unconditionally; the visible
 * caller only invokes this after detecting the 0xFE 0xFF byte-order mark
 * there.  Each code point up to U+00FF is copied as a single byte; every
 * other code point becomes one '?'.  A well-formed surrogate pair counts
 * as ONE code point and so produces a single '?'.  An unpaired surrogate
 * also produces one '?'.  A trailing odd byte is ignored.
 *
 * in/in_len:   input bytes, including the 2-byte mark
 * out/out_size: destination buffer; at most out_size - 1 characters are
 *               written, followed by a terminating NUL
 *
 * Returns the number of characters written, excluding the NUL.  Returns 0
 * without touching out when out_size is 0.
 */
static int utf16be_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i, o = 0;

    if (out_size == 0)
        return 0;   /* no room even for the terminator */

    for (i = 2; i + 1 < in_len && o + 1 < out_size; i += 2) {
        unsigned int unit = ((unsigned int)in[i] << 8) |
                            (unsigned int)in[i + 1];

        /* High surrogate followed by a low surrogate: swallow both. */
        if (unit >= 0xD800 && unit <= 0xDBFF && i + 3 < in_len) {
            unsigned int next = ((unsigned int)in[i + 2] << 8) |
                                (unsigned int)in[i + 3];

            if (next >= 0xDC00 && next <= 0xDFFF)
                i += 2;
        }

        out[o++] = unit <= 0xFF ? (char)unit : '?';
    }
    out[o] = '\0';
    return (int)o;
}
|
|
|
|
/*
 * Convert UTF-8 text to a NUL-terminated ISO-8859-1 string.
 *
 * A leading UTF-8 byte-order mark (0xEF 0xBB 0xBF) is skipped, mirroring
 * the UTF-16 converters in this file, which skip their 2-byte mark; it
 * would otherwise decode to U+FEFF and show up as a spurious leading '?'.
 *
 * Each sequence is decoded with utf8_to_codepoint().  Code points up to
 * U+00FF are copied as a single byte; anything else becomes '?'.  When the
 * decoder reports failure (returns 0), one '?' is emitted and exactly one
 * input byte is skipped before decoding resumes.
 *
 * in/in_len:   input bytes
 * out/out_size: destination buffer; at most out_size - 1 characters are
 *               written, followed by a terminating NUL
 *
 * Returns the number of characters written, excluding the NUL.  Returns 0
 * without touching out when out_size is 0.
 */
static int utf8_to_iso8859_1(const unsigned char *in, size_t in_len,
                             char *out, size_t out_size)
{
    size_t i = 0, o = 0;

    if (out_size == 0)
        return 0;   /* no room even for the terminator */

    /* Drop the encoding signature; it is not part of the text. */
    if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF)
        i = 3;

    while (i < in_len && o + 1 < out_size) {
        unsigned int cp;
        int consumed = utf8_to_codepoint(in + i, in_len - i, &cp);

        if (consumed == 0) {
            /* Undecodable byte: substitute and resynchronise on the next. */
            out[o++] = '?';
            i++;
            continue;
        }
        out[o++] = cp <= 0xFF ? (char)cp : '?';
        i += (size_t)consumed;
    }
    out[o] = '\0';
    return (int)o;
}
|
|
|
|
/*
 * Convert a byte buffer of auto-detected encoding to a NUL-terminated
 * ISO-8859-1 string.
 *
 * Detection order: UTF-16LE mark, UTF-16BE mark, then the UTF-8 check.
 * Input matching none of these is copied through unchanged, truncated to
 * fit the destination.
 *
 * in/in_len:   input bytes
 * out/out_size: destination buffer; at most out_size - 1 bytes are
 *               written, followed by a terminating NUL
 *
 * Returns the number of bytes written, excluding the NUL.  Returns 0
 * without touching out when in or out is NULL or out_size is 0; returns 0
 * with out set to the empty string when in_len is 0.
 */
int to_iso8859_1(const unsigned char *in, size_t in_len,
                 char *out, size_t out_size)
{
    size_t room;
    size_t n;

    if (in == NULL || out == NULL || out_size == 0)
        return 0;

    if (in_len == 0) {
        out[0] = '\0';
        return 0;
    }

    if (is_utf16_le(in, in_len)) {
        return utf16le_to_iso8859_1(in, in_len, out, out_size);
    } else if (is_utf16_be(in, in_len)) {
        return utf16be_to_iso8859_1(in, in_len, out, out_size);
    } else if (is_utf8(in, in_len)) {
        return utf8_to_iso8859_1(in, in_len, out, out_size);
    }

    /* Nothing recognised: treat as ISO-8859-1 / ASCII and copy verbatim. */
    room = out_size - 1;
    n = in_len;
    if (n > room)
        n = room;
    memcpy(out, in, n);
    out[n] = '\0';
    return (int)n;
}
|