first commit
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
#include "charset.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Report whether the buffer starts with the UTF-16 little-endian byte
 * order mark (bytes FF FE).
 *
 * in:  input bytes; only dereferenced when len >= 2.
 * len: number of bytes available at `in`.
 *
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int is_utf16_le(const unsigned char *in, size_t len)
{
    return len >= 2 && in[0] == 0xFF && in[1] == 0xFE;
}
|
||||
|
||||
/*
 * Report whether the buffer starts with the UTF-16 big-endian byte
 * order mark (bytes FE FF).
 *
 * in:  input bytes; only dereferenced when len >= 2.
 * len: number of bytes available at `in`.
 *
 * Returns 1 when the mark is present, 0 otherwise.
 */
static int is_utf16_be(const unsigned char *in, size_t len)
{
    return len >= 2 && in[0] == 0xFE && in[1] == 0xFF;
}
|
||||
|
||||
/*
 * Heuristic UTF-8 detector.
 *
 * Walks the whole buffer and checks that every non-ASCII byte belongs to
 * a well-formed UTF-8 sequence: correct lead byte, the right number of
 * continuation bytes, no overlong encodings, no UTF-16 surrogates
 * (U+D800..U+DFFF) and nothing above U+10FFFF.
 *
 * in:  input bytes.
 * len: number of bytes available at `in`.
 *
 * Returns 1 only when the buffer is well-formed AND contains at least one
 * multi-byte sequence. Pure ASCII input (and empty input) returns 0, as
 * does any buffer containing a malformed or truncated sequence.
 */
static int is_utf8(const unsigned char *in, size_t len)
{
    size_t i = 0;
    int has_multibyte = 0;

    while (i < len) {
        unsigned char lead = in[i];
        /* Permitted range for the FIRST continuation byte; narrowed below
         * for the lead bytes that could otherwise encode overlong forms,
         * surrogates, or values beyond U+10FFFF. */
        unsigned char lo = 0x80;
        unsigned char hi = 0xBF;
        size_t need; /* number of continuation bytes that must follow */
        size_t k;

        if (lead < 0x80) {
            i++;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 0xC0 and 0xC1 are excluded: they can only start overlong
             * two-byte encodings of ASCII. */
            need = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            need = 2;
            if (lead == 0xE0) {
                lo = 0xA0; /* below this is an overlong 3-byte form */
            } else if (lead == 0xED) {
                hi = 0x9F; /* above this is a UTF-16 surrogate */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            need = 3;
            if (lead == 0xF0) {
                lo = 0x90; /* below this is an overlong 4-byte form */
            } else if (lead == 0xF4) {
                hi = 0x8F; /* above this exceeds U+10FFFF */
            }
        } else {
            /* Stray continuation byte or an invalid lead byte. */
            return 0;
        }

        /* i < len holds here, so len - i cannot underflow. */
        if (len - i <= need) {
            return 0; /* sequence is cut off by the end of the buffer */
        }
        if (in[i + 1] < lo || in[i + 1] > hi) {
            return 0;
        }
        for (k = 2; k <= need; k++) {
            if ((in[i + k] & 0xC0) != 0x80) {
                return 0;
            }
        }

        has_multibyte = 1;
        i += need + 1;
    }

    return has_multibyte;
}
|
||||
|
||||
/*
 * Decode a single UTF-8 sequence from the start of `in`.
 *
 * in:        input bytes.
 * remaining: number of bytes available at `in`.
 * cp:        receives the decoded code point; written only on success.
 *
 * Returns the number of bytes consumed (1..4), or 0 when no valid
 * sequence starts at `in`: empty input, invalid lead byte, truncated
 * sequence, a continuation byte that is not of the form 10xxxxxx, an
 * overlong encoding, a UTF-16 surrogate, or a value above U+10FFFF.
 */
static int utf8_to_codepoint(const unsigned char *in, size_t remaining,
                             unsigned int *cp)
{
    unsigned int value;
    unsigned int min; /* smallest code point that legitimately needs `len` bytes */
    int len;
    int k;

    if (remaining == 0) {
        return 0;
    }

    if (in[0] < 0x80) {
        *cp = in[0];
        return 1;
    }

    if ((in[0] & 0xE0) == 0xC0) {
        len = 2;
        min = 0x80;
        value = in[0] & 0x1F;
    } else if ((in[0] & 0xF0) == 0xE0) {
        len = 3;
        min = 0x800;
        value = in[0] & 0x0F;
    } else if ((in[0] & 0xF8) == 0xF0) {
        len = 4;
        min = 0x10000;
        value = in[0] & 0x07;
    } else {
        return 0;
    }

    if (remaining < (size_t)len) {
        return 0;
    }

    for (k = 1; k < len; k++) {
        if ((in[k] & 0xC0) != 0x80) {
            return 0;
        }
        value = (value << 6) | (in[k] & 0x3Fu);
    }

    /* Reject overlong forms, surrogates and out-of-range values so that a
     * disguised ASCII character (e.g. C0 AF for '/') is never produced. */
    if (value < min || value > 0x10FFFF ||
        (value >= 0xD800 && value <= 0xDFFF)) {
        return 0;
    }

    *cp = value;
    return len;
}
|
||||
|
||||
/*
 * Convert BOM-prefixed UTF-16 little-endian text to ISO-8859-1.
 *
 * The first two input bytes are assumed to be the byte order mark and are
 * skipped. Each character at or below U+00FF is copied as a single byte;
 * every other character becomes '?'. A surrogate pair counts as ONE
 * character and therefore yields a single '?'. A trailing odd byte is
 * ignored.
 *
 * in:       input bytes.
 * in_len:   number of bytes available at `in`.
 * out:      destination buffer; always NUL-terminated when out_size > 0.
 * out_size: capacity of `out` in bytes, including the terminator.
 *
 * Returns the number of bytes written, not counting the terminator.
 * Output is silently truncated when `out` is too small.
 */
static int utf16le_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i = 2; /* skip the BOM */
    size_t o = 0;

    if (out_size == 0) {
        return 0; /* no room even for the terminator */
    }

    while (i + 1 < in_len && o + 1 < out_size) {
        unsigned int unit = (unsigned int)in[i] | ((unsigned int)in[i + 1] << 8);

        i += 2;

        /* High surrogate followed by a low surrogate: consume the low
         * half too, so one supplementary character maps to one '?'. */
        if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < in_len) {
            unsigned int next = (unsigned int)in[i] | ((unsigned int)in[i + 1] << 8);

            if (next >= 0xDC00 && next <= 0xDFFF) {
                i += 2;
            }
        }

        out[o++] = unit <= 0xFF ? (char)unit : '?';
    }

    out[o] = '\0';
    return (int)o;
}
|
||||
|
||||
/*
 * Convert BOM-prefixed UTF-16 big-endian text to ISO-8859-1.
 *
 * The first two input bytes are assumed to be the byte order mark and are
 * skipped. Each character at or below U+00FF is copied as a single byte;
 * every other character becomes '?'. A surrogate pair counts as ONE
 * character and therefore yields a single '?'. A trailing odd byte is
 * ignored.
 *
 * in:       input bytes.
 * in_len:   number of bytes available at `in`.
 * out:      destination buffer; always NUL-terminated when out_size > 0.
 * out_size: capacity of `out` in bytes, including the terminator.
 *
 * Returns the number of bytes written, not counting the terminator.
 * Output is silently truncated when `out` is too small.
 */
static int utf16be_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i = 2; /* skip the BOM */
    size_t o = 0;

    if (out_size == 0) {
        return 0; /* no room even for the terminator */
    }

    while (i + 1 < in_len && o + 1 < out_size) {
        unsigned int unit = ((unsigned int)in[i] << 8) | (unsigned int)in[i + 1];

        i += 2;

        /* High surrogate followed by a low surrogate: consume the low
         * half too, so one supplementary character maps to one '?'. */
        if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < in_len) {
            unsigned int next = ((unsigned int)in[i] << 8) | (unsigned int)in[i + 1];

            if (next >= 0xDC00 && next <= 0xDFFF) {
                i += 2;
            }
        }

        out[o++] = unit <= 0xFF ? (char)unit : '?';
    }

    out[o] = '\0';
    return (int)o;
}
|
||||
|
||||
/*
 * Convert UTF-8 text to ISO-8859-1.
 *
 * A leading UTF-8 byte order mark (EF BB BF) is skipped, matching the
 * UTF-16 converters in this file, which also drop their BOM. Each decoded
 * code point at or below U+00FF is copied as a single byte; anything
 * higher becomes '?'. When utf8_to_codepoint() returns 0 for the current
 * position, one '?' is emitted and a single input byte is skipped so
 * decoding can resume at the next byte.
 *
 * in:       input bytes.
 * in_len:   number of bytes available at `in`.
 * out:      destination buffer; always NUL-terminated when out_size > 0.
 * out_size: capacity of `out` in bytes, including the terminator.
 *
 * Returns the number of bytes written, not counting the terminator.
 * Output is silently truncated when `out` is too small.
 */
static int utf8_to_iso8859_1(const unsigned char *in, size_t in_len,
                             char *out, size_t out_size)
{
    size_t i = 0;
    size_t o = 0;

    if (out_size == 0) {
        return 0; /* no room even for the terminator */
    }

    /* Without this the BOM decodes to U+FEFF and shows up as a stray '?'. */
    if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF) {
        i = 3;
    }

    while (i < in_len && o + 1 < out_size) {
        unsigned int cp;
        int consumed = utf8_to_codepoint(in + i, in_len - i, &cp);

        if (consumed == 0) {
            out[o++] = '?';
            i++;
            continue;
        }

        out[o++] = cp <= 0xFF ? (char)cp : '?';
        i += (size_t)consumed;
    }

    out[o] = '\0';
    return (int)o;
}
|
||||
|
||||
/*
 * Convert a text buffer of unknown encoding to a NUL-terminated
 * ISO-8859-1 string.
 *
 * The source encoding is guessed in this order:
 *   1. UTF-16 little-endian, if is_utf16_le() accepts the buffer;
 *   2. UTF-16 big-endian, if is_utf16_be() accepts the buffer;
 *   3. UTF-8, if is_utf8() accepts the buffer;
 *   4. otherwise the bytes are treated as already being ISO-8859-1 / ASCII
 *      and are copied through unchanged.
 *
 * in:       input bytes; may be NULL, which is treated like empty input.
 * in_len:   number of bytes available at `in`.
 * out:      destination buffer.
 * out_size: capacity of `out` in bytes, including the terminator.
 *
 * Returns the number of bytes written to `out`, not counting the
 * terminator. Returns 0 without touching `out` when `out` is NULL or
 * out_size is 0. Whenever `out` is usable it is left NUL-terminated, even
 * for NULL or empty input. Output is silently truncated to fit.
 */
int to_iso8859_1(const unsigned char *in, size_t in_len,
                 char *out, size_t out_size)
{
    size_t copy;

    if (!out || out_size == 0) {
        return 0;
    }

    /* Leave a valid empty string behind so a caller that ignores the
     * return value never reads an uninitialized buffer. */
    if (!in || in_len == 0) {
        out[0] = '\0';
        return 0;
    }

    if (is_utf16_le(in, in_len)) {
        return utf16le_to_iso8859_1(in, in_len, out, out_size);
    }

    if (is_utf16_be(in, in_len)) {
        return utf16be_to_iso8859_1(in, in_len, out, out_size);
    }

    if (is_utf8(in, in_len)) {
        return utf8_to_iso8859_1(in, in_len, out, out_size);
    }

    /* Already ISO-8859-1 / ASCII: pass through, truncating to fit. */
    copy = in_len < out_size - 1 ? in_len : out_size - 1;
    memcpy(out, in, copy);
    out[copy] = '\0';
    return (int)copy;
}
|
||||
Reference in New Issue
Block a user