Fix translation detection for Cyrillic and ISO-8859-1
- Skip ISO-8859-1 Latin chars (0xC0-0xFF) to avoid Swedish false positives
- Require 3+ non-Latin chars to trigger translation
- Handle both UTF-8 and single-byte Cyrillic encodings
- Update README with ai_port option
This commit is contained in:
@@ -103,7 +103,7 @@ ai_target_lang=english
|
|||||||
ai_skip_langs=swedish,english
|
ai_skip_langs=swedish,english
|
||||||
```
|
```
|
||||||
|
|
||||||
When configured, foreign-script messages are shown immediately with an italic translation appearing below after 1-5 seconds. Leave values empty to disable.
|
The host may include a port (`host:port`), or the port can be set on a separate `ai_port=` line. When configured, foreign-script messages are shown immediately, with an italic translation appearing below after 1-5 seconds. Leave values empty to disable.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
|
|||||||
@@ -173,24 +173,27 @@ static void ai_config_load(void)
|
|||||||
static int needs_translation(const char *text)
|
static int needs_translation(const char *text)
|
||||||
{
|
{
|
||||||
if (!ai_cfg.enabled) return 0;
|
if (!ai_cfg.enabled) return 0;
|
||||||
/* Check for non-ASCII characters that suggest non-Latin script */
|
int non_latin_count = 0;
|
||||||
for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
|
for (const unsigned char *p = (const unsigned char *)text; *p; p++) {
|
||||||
if (*p >= 0xC0) {
|
if (*p >= 0x80) {
|
||||||
/* Multi-byte UTF-8: check if it's beyond Latin Extended (U+0250+) */
|
if ((*p & 0xE0) == 0xC0 && (p[1] & 0xC0) == 0x80) {
|
||||||
unsigned int cp = 0;
|
unsigned int cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F);
|
||||||
if ((*p & 0xE0) == 0xC0 && p[1]) {
|
if (cp > 0x024F) non_latin_count++;
|
||||||
cp = ((*p & 0x1F) << 6) | (p[1] & 0x3F);
|
p++;
|
||||||
} else if ((*p & 0xF0) == 0xE0 && p[1] && p[2]) {
|
} else if ((*p & 0xF0) == 0xE0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80) {
|
||||||
cp = ((*p & 0x0F) << 12) | ((p[1] & 0x3F) << 6) | (p[2] & 0x3F);
|
non_latin_count++;
|
||||||
} else if ((*p & 0xF8) == 0xF0 && p[1] && p[2] && p[3]) {
|
p += 2;
|
||||||
cp = ((*p & 0x07) << 18) | ((p[1] & 0x3F) << 12) |
|
} else if ((*p & 0xF8) == 0xF0 && (p[1] & 0xC0) == 0x80 && (p[2] & 0xC0) == 0x80 && (p[3] & 0xC0) == 0x80) {
|
||||||
((p[2] & 0x3F) << 6) | (p[3] & 0x3F);
|
non_latin_count++;
|
||||||
|
p += 3;
|
||||||
|
} else {
|
||||||
|
/* Not valid UTF-8 — skip ISO-8859-1 Latin chars (0xC0-0xFF) */
|
||||||
|
if (*p < 0xC0) non_latin_count++;
|
||||||
}
|
}
|
||||||
/* Skip Latin Extended, keep Cyrillic, CJK, Arabic, etc. */
|
|
||||||
if (cp > 0x024F) return 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
/* Require at least 3 non-Latin chars to avoid triggering on stray åäö */
|
||||||
|
return non_latin_count >= 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void translate_async(const char *text, int level)
|
static void translate_async(const char *text, int level)
|
||||||
|
|||||||
Reference in New Issue
Block a user