Fix translation detection for Cyrillic and ISO-8859-1

- Skip ISO-8859-1 Latin bytes (0xC0-0xFF) that are not part of a valid UTF-8 sequence, to avoid Swedish false positives
- Require 3+ non-Latin chars to trigger translation
- Handle both UTF-8 and single-byte Cyrillic encodings
- Update README with ai_port option
This commit is contained in:
2026-04-30 12:15:17 +02:00
parent 4a5ca98c13
commit 2e7006a8d5
6 changed files with 3162 additions and 15 deletions
+1 -1
View File
@@ -103,7 +103,7 @@ ai_target_lang=english
ai_skip_langs=swedish,english ai_skip_langs=swedish,english
``` ```
When configured, foreign-script messages are shown immediately with an italic translation appearing below after 1-5 seconds. Leave values empty to disable. Host can include port (`host:port`) or use a separate `ai_port=` line. When configured, foreign-script messages are shown immediately with an italic translation appearing below after 1-5 seconds. Leave values empty to disable.
## License ## License
BIN
View File
Binary file not shown.
Executable
BIN
View File
Binary file not shown.
+3144
View File
File diff suppressed because it is too large Load Diff
+18 -15
View File
@@ -173,24 +173,27 @@ static void ai_config_load(void)
static int needs_translation(const char *text) static int needs_translation(const char *text)
{ {
if (!ai_cfg.enabled) return 0; if (!ai_cfg.enabled) return 0;
/* Check for non-ASCII characters that suggest non-Latin script */ int non_latin_count = 0;
for (const unsigned char *p = (const unsigned char *)text; *p; p++) { for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
if (*p >= 0xC0) { if (*p >= 0x80) {
/* Multi-byte UTF-8: check if it's beyond Latin Extended (U+0250+) */ if ((*p & 0xE0) == 0xC0 && (p[1] & 0xC0) == 0x80) {
unsigned int cp = 0; unsigned int cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F);
if ((*p & 0xE0) == 0xC0 && p[1]) { if (cp > 0x024F) non_latin_count++;
cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F); p++;
} else if ((*p & 0xF0) == 0xE0 && p[1] && p[2]) { } else if ((*p & 0xF0) == 0xE0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
cp = ((*p & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F); non_latin_count++;
} else if ((*p & 0xF8) == 0xF0 && p[1] && p[2] && p[3]) { p += 2;
cp = ((*p & 0x07) << 18) | ((p[1] & 0x3F) << 12) | } else if ((*p & 0xF8) == 0xF0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) {
((p[2] & 0x3F) << 6) | (p[3] & 0x3F); non_latin_count++;
} p += 3;
/* Skip Latin Extended, keep Cyrillic, CJK, Arabic, etc. */ } else {
if (cp > 0x024F) return 1; /* Not valid UTF-8 — skip ISO-8859-1 Latin chars (0xC0-0xFF) */
if (*p < 0xC0) non_latin_count++;
} }
} }
return 0; }
/* Require at least 3 non-Latin chars to avoid triggering on stray åäö */
return non_latin_count >= 3;
} }
static void translate_async(const char *text, int level) static void translate_async(const char *text, int level)
BIN
View File
Binary file not shown.