Fix translation detection for Cyrillic and ISO-8859-1
- Skip ISO-8859-1 Latin chars (0xC0-0xFF) to avoid Swedish false positives
- Require 3+ non-Latin chars to trigger translation
- Handle both UTF-8 and single-byte Cyrillic encodings
- Update README with ai_port option
This commit is contained in:
@@ -103,7 +103,7 @@ ai_target_lang=english
|
||||
ai_skip_langs=swedish,english
|
||||
```
|
||||
|
||||
When configured, foreign-script messages are shown immediately with an italic translation appearing below after 1-5 seconds. Leave values empty to disable.
|
||||
The host value can include a port (`host:port`), or the port can be given on a separate `ai_port=` line. When configured, foreign-script messages are shown immediately with an italic translation appearing below after 1-5 seconds. Leave values empty to disable.
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@@ -173,24 +173,27 @@ static void ai_config_load(void)
|
||||
static int needs_translation(const char *text)
|
||||
{
|
||||
if (!ai_cfg.enabled) return 0;
|
||||
/* Check for non-ASCII characters that suggest non-Latin script */
|
||||
int non_latin_count = 0;
|
||||
for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
|
||||
if (*p >= 0xC0) {
|
||||
/* Multi-byte UTF-8: check if it's beyond Latin Extended (U+0250+) */
|
||||
unsigned int cp = 0;
|
||||
if ((*p & 0xE0) == 0xC0 && p[1]) {
|
||||
cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F);
|
||||
} else if ((*p & 0xF0) == 0xE0 && p[1] && p[2]) {
|
||||
cp = ((*p & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
|
||||
} else if ((*p & 0xF8) == 0xF0 && p[1] && p[2] && p[3]) {
|
||||
cp = ((*p & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
|
||||
((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
|
||||
}
|
||||
/* Skip Latin Extended, keep Cyrillic, CJK, Arabic, etc. */
|
||||
if (cp > 0x024F) return 1;
|
||||
if (*p >= 0x80) {
|
||||
if ((*p & 0xE0) == 0xC0 && (p[1] & 0xC0) == 0x80) {
|
||||
unsigned int cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F);
|
||||
if (cp > 0x024F) non_latin_count++;
|
||||
p++;
|
||||
} else if ((*p & 0xF0) == 0xE0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
|
||||
non_latin_count++;
|
||||
p += 2;
|
||||
} else if ((*p & 0xF8) == 0xF0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) {
|
||||
non_latin_count++;
|
||||
p += 3;
|
||||
} else {
|
||||
/* Not valid UTF-8 — skip ISO-8859-1 Latin chars (0xC0-0xFF) */
|
||||
if (*p < 0xC0) non_latin_count++;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
/* Require at least 3 non-Latin chars to avoid triggering on stray åäö */
|
||||
return non_latin_count >= 3;
|
||||
}
|
||||
|
||||
static void translate_async(const char *text, int level)
|
||||
|
||||
Reference in New Issue
Block a user