first commit

This commit is contained in:
2026-04-29 11:44:36 +02:00
commit e2a5ddf9a0
5 changed files with 1202 additions and 0 deletions
+139
View File
@@ -0,0 +1,139 @@
#include "charset.h"
#include <string.h>
/*
 * Report whether the buffer begins with the UTF-16 little-endian byte
 * order mark (bytes FF FE).
 *
 * in  - bytes to inspect
 * len - number of bytes available at in
 *
 * Returns 1 when the mark is present, 0 otherwise (including buffers
 * shorter than two bytes).
 */
static int is_utf16_le(const unsigned char *in, size_t len)
{
    static const unsigned char bom[2] = { 0xFF, 0xFE };

    if (len < 2) {
        return 0;
    }
    return bom[0] == in[0] && bom[1] == in[1];
}
/*
 * Report whether the buffer begins with the UTF-16 big-endian byte
 * order mark (bytes FE FF).
 *
 * in  - bytes to inspect
 * len - number of bytes available at in
 *
 * Returns 1 when the mark is present, 0 otherwise (including buffers
 * shorter than two bytes).
 */
static int is_utf16_be(const unsigned char *in, size_t len)
{
    static const unsigned char bom[2] = { 0xFE, 0xFF };

    if (len < 2) {
        return 0;
    }
    return bom[0] == in[0] && bom[1] == in[1];
}
/*
 * Heuristically decide whether a buffer holds UTF-8 text.
 *
 * Every byte >= 0x80 must start a structurally well-formed 2-, 3- or
 * 4-byte sequence (a lead byte followed by the right number of
 * 10xxxxxx continuation bytes, all inside the buffer).  Only the byte
 * structure is examined; decoded values are not range-checked.
 *
 * in  - bytes to inspect
 * len - number of bytes available at in
 *
 * Returns 1 only when the buffer is well formed AND contains at least
 * one multi-byte sequence; pure 7-bit input therefore yields 0, as
 * does any malformed or truncated sequence.
 */
static int is_utf8(const unsigned char *in, size_t len)
{
    int saw_multibyte = 0;
    size_t pos = 0;

    while (pos < len) {
        unsigned char lead = in[pos];
        size_t tail;
        size_t k;

        if (lead < 0x80) {
            pos++;
            continue;
        }
        saw_multibyte = 1;

        /* Number of continuation bytes announced by the lead byte. */
        if ((lead & 0xE0) == 0xC0) {
            tail = 1;
        } else if ((lead & 0xF0) == 0xE0) {
            tail = 2;
        } else if ((lead & 0xF8) == 0xF0) {
            tail = 3;
        } else {
            return 0;
        }

        /* The whole sequence has to fit in the buffer. */
        if (pos + tail >= len) {
            return 0;
        }
        for (k = 1; k <= tail; k++) {
            if ((in[pos + k] & 0xC0) != 0x80) {
                return 0;
            }
        }
        pos += tail + 1;
    }
    return saw_multibyte;
}
/*
 * Decode one UTF-8 sequence from the start of a buffer.
 *
 * in        - first byte of the sequence
 * remaining - number of readable bytes at in
 * cp        - receives the decoded code point on success; left
 *             untouched on failure
 *
 * Returns the number of bytes consumed (1..4), or 0 when nothing could
 * be decoded: empty input, an invalid lead byte, a sequence that runs
 * past the end of the buffer, or a trailing byte that is not a
 * 10xxxxxx continuation byte.
 */
static int utf8_to_codepoint(const unsigned char *in, size_t remaining,
                             unsigned int *cp)
{
    unsigned int value;
    size_t need;
    size_t k;

    /* Never touch in[0] when the caller has no bytes left. */
    if (remaining == 0)
        return 0;

    if (in[0] < 0x80) {
        *cp = in[0];
        return 1;
    }

    /* The lead byte fixes the sequence length and carries the top bits. */
    if ((in[0] & 0xE0) == 0xC0) {
        need = 2;
        value = in[0] & 0x1Fu;
    } else if ((in[0] & 0xF0) == 0xE0) {
        need = 3;
        value = in[0] & 0x0Fu;
    } else if ((in[0] & 0xF8) == 0xF0) {
        need = 4;
        value = in[0] & 0x07u;
    } else {
        return 0;
    }

    if (remaining < need)
        return 0;

    for (k = 1; k < need; k++) {
        /* Reject malformed input instead of decoding garbage bits. */
        if ((in[k] & 0xC0) != 0x80)
            return 0;
        value = (value << 6) | (in[k] & 0x3Fu);
    }

    *cp = value;
    return (int)need;
}
/*
 * Convert BOM-prefixed UTF-16LE text to a NUL-terminated ISO-8859-1
 * string.
 *
 * in       - input bytes; the first two bytes (the BOM) are skipped
 * in_len   - number of input bytes; a trailing odd byte is ignored
 * out      - destination buffer
 * out_size - capacity of out in bytes, including the terminator
 *
 * Each character above U+00FF becomes a single '?'.  A well-formed
 * surrogate pair counts as one character and so yields one '?', the
 * same as the UTF-8 path does for a 4-byte sequence; an unpaired
 * surrogate also yields one '?'.  Output is truncated to fit.
 *
 * Returns the number of bytes stored, not counting the terminator.
 * When out_size is 0 nothing is written and 0 is returned.
 */
static int utf16le_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i = 2; /* step over the byte order mark */
    size_t o = 0;

    /* No room even for the terminator. */
    if (out_size == 0)
        return 0;

    while (i + 1 < in_len && o + 1 < out_size) {
        unsigned int cp = (unsigned int)in[i] | ((unsigned int)in[i + 1] << 8);

        i += 2;

        /* High surrogate: swallow the matching low surrogate, if any. */
        if (cp >= 0xD800 && cp <= 0xDBFF && i + 1 < in_len) {
            unsigned int low = (unsigned int)in[i] |
                               ((unsigned int)in[i + 1] << 8);

            if (low >= 0xDC00 && low <= 0xDFFF)
                i += 2;
        }

        out[o++] = cp <= 0xFF ? (char)cp : '?';
    }
    out[o] = '\0';
    return (int)o;
}
/*
 * Convert BOM-prefixed UTF-16BE text to a NUL-terminated ISO-8859-1
 * string.
 *
 * in       - input bytes; the first two bytes (the BOM) are skipped
 * in_len   - number of input bytes; a trailing odd byte is ignored
 * out      - destination buffer
 * out_size - capacity of out in bytes, including the terminator
 *
 * Each character above U+00FF becomes a single '?'.  A well-formed
 * surrogate pair counts as one character and so yields one '?', the
 * same as the UTF-8 path does for a 4-byte sequence; an unpaired
 * surrogate also yields one '?'.  Output is truncated to fit.
 *
 * Returns the number of bytes stored, not counting the terminator.
 * When out_size is 0 nothing is written and 0 is returned.
 */
static int utf16be_to_iso8859_1(const unsigned char *in, size_t in_len,
                                char *out, size_t out_size)
{
    size_t i = 2; /* step over the byte order mark */
    size_t o = 0;

    /* No room even for the terminator. */
    if (out_size == 0)
        return 0;

    while (i + 1 < in_len && o + 1 < out_size) {
        unsigned int cp = ((unsigned int)in[i] << 8) | (unsigned int)in[i + 1];

        i += 2;

        /* High surrogate: swallow the matching low surrogate, if any. */
        if (cp >= 0xD800 && cp <= 0xDBFF && i + 1 < in_len) {
            unsigned int low = ((unsigned int)in[i] << 8) |
                               (unsigned int)in[i + 1];

            if (low >= 0xDC00 && low <= 0xDFFF)
                i += 2;
        }

        out[o++] = cp <= 0xFF ? (char)cp : '?';
    }
    out[o] = '\0';
    return (int)o;
}
/*
 * Convert UTF-8 text to a NUL-terminated ISO-8859-1 string.
 *
 * in       - input bytes
 * in_len   - number of input bytes
 * out      - destination buffer
 * out_size - capacity of out in bytes, including the terminator
 *
 * A leading UTF-8 byte order mark (EF BB BF) is skipped, mirroring the
 * UTF-16 converters, which also drop their BOM.  Each code point above
 * U+00FF becomes '?'.  A byte that utf8_to_codepoint() cannot decode
 * becomes '?' and decoding resumes at the following byte.  Output is
 * truncated to fit.
 *
 * Returns the number of bytes stored, not counting the terminator.
 * When out_size is 0 nothing is written and 0 is returned.
 */
static int utf8_to_iso8859_1(const unsigned char *in, size_t in_len,
                             char *out, size_t out_size)
{
    size_t i = 0;
    size_t o = 0;

    /* No room even for the terminator. */
    if (out_size == 0)
        return 0;

    /* The BOM is a signature, not text; without this it would be
     * emitted as a stray replacement character. */
    if (in_len >= 3 && in[0] == 0xEF && in[1] == 0xBB && in[2] == 0xBF)
        i = 3;

    while (i < in_len && o + 1 < out_size) {
        unsigned int cp;
        int consumed = utf8_to_codepoint(in + i, in_len - i, &cp);

        if (consumed == 0) {
            /* Undecodable byte: substitute and resynchronise. */
            out[o++] = '?';
            i++;
            continue;
        }
        out[o++] = cp <= 0xFF ? (char)cp : '?';
        i += (size_t)consumed;
    }
    out[o] = '\0';
    return (int)o;
}
/*
 * Convert text in an auto-detected encoding to a NUL-terminated
 * ISO-8859-1 string.
 *
 * Detection order: UTF-16LE byte order mark, UTF-16BE byte order mark,
 * then the is_utf8() heuristic.  Anything else is treated as already
 * being ISO-8859-1 / ASCII and is copied through unchanged.
 *
 * in       - input bytes
 * in_len   - number of input bytes
 * out      - destination buffer
 * out_size - capacity of out in bytes, including the terminator
 *
 * Returns the number of bytes stored, not counting the terminator.
 * Returns 0 without touching out when in or out is NULL or out_size
 * is 0; returns 0 with out set to the empty string when in_len is 0.
 * Output is truncated to fit.
 */
int to_iso8859_1(const unsigned char *in, size_t in_len,
                 char *out, size_t out_size)
{
    size_t room;
    size_t count;

    if (in == NULL || out == NULL || out_size == 0) {
        return 0;
    }

    if (in_len == 0) {
        *out = '\0';
        return 0;
    }

    if (is_utf16_le(in, in_len)) {
        return utf16le_to_iso8859_1(in, in_len, out, out_size);
    }
    if (is_utf16_be(in, in_len)) {
        return utf16be_to_iso8859_1(in, in_len, out, out_size);
    }
    if (is_utf8(in, in_len)) {
        return utf8_to_iso8859_1(in, in_len, out, out_size);
    }

    /* Nothing recognised: assume single-byte text and copy as much as
     * fits, keeping one byte for the terminator. */
    room = out_size - 1;
    count = in_len;
    if (count > room) {
        count = room;
    }
    memcpy(out, in, count);
    out[count] = '\0';
    return (int)count;
}