Add client-side language pre-filter to reduce false translations
- Common Swedish/English words are checked before the API call
- Word-boundary matching (non-alphabetic characters count as boundaries)
- Lower threshold for short messages (1 hit instead of 2)
- Only languages listed in ai_skip_langs are filtered
This commit is contained in:
@@ -188,11 +188,49 @@ static void ai_config_load(void)
|
|||||||
static int needs_translation(const char *text)
|
static int needs_translation(const char *text)
|
||||||
{
|
{
|
||||||
if (!ai_cfg.enabled || !translate_enabled) return 0;
|
if (!ai_cfg.enabled || !translate_enabled) return 0;
|
||||||
/* Skip very short messages (nicks, URLs, single words like "ok") */
|
/* Skip very short messages */
|
||||||
int words = 0;
|
int words = 0;
|
||||||
for (const char *p = text; *p; p++)
|
for (const char *p = text; *p; p++)
|
||||||
if (*p == ' ') words++;
|
if (*p == ' ') words++;
|
||||||
if (words < 1 && strlen(text) < 6) return 0;
|
if (words < 1 && strlen(text) < 6) return 0;
|
||||||
|
|
||||||
|
/* Pre-filter: if text contains common words from skip languages, skip */
|
||||||
|
static const struct { const char *lang; const char *words[23]; } lang_words[] = {
|
||||||
|
{"swedish", {"jag", "och", "att", "det", "inte", "var",
|
||||||
|
"som", "för", "med", "har", "den", "kan",
|
||||||
|
"ska", "till", "eller", "men", "där",
|
||||||
|
"när", "från", "ett", "en", "ta",
|
||||||
|
NULL}},
|
||||||
|
{"english", {"the", "and", "that", "this", "with",
|
||||||
|
"have", "was", "are", "you", "not",
|
||||||
|
"from", "but", "for", "can", NULL}},
|
||||||
|
{NULL, {NULL}}
|
||||||
|
};
|
||||||
|
char lower[1024];
|
||||||
|
snprintf(lower, sizeof(lower), "%s", text);
|
||||||
|
for (char *p = lower; *p; p++)
|
||||||
|
if (*p >= 'A' && *p <= 'Z') *p += 32;
|
||||||
|
int hits = 0;
|
||||||
|
for (int i = 0; lang_words[i].lang; i++) {
|
||||||
|
if (!strstr(ai_cfg.skip_langs, lang_words[i].lang)) continue;
|
||||||
|
for (int j = 0; lang_words[i].words[j]; j++) {
|
||||||
|
const char *w = lang_words[i].words[j];
|
||||||
|
size_t wlen = strlen(w);
|
||||||
|
char *p = lower;
|
||||||
|
while ((p = strstr(p, w)) != NULL) {
|
||||||
|
/* Check word boundaries (non-alpha on both sides) */
|
||||||
|
int before_ok = (p == lower) ||
|
||||||
|
!(*(p-1) >= 'a' && *(p-1) <= 'z');
|
||||||
|
int after_ok = !p[wlen] ||
|
||||||
|
!(p[wlen] >= 'a' && p[wlen] <= 'z');
|
||||||
|
if (before_ok && after_ok) { hits++; break; }
|
||||||
|
p++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int threshold = (words < 4) ? 1 : 2;
|
||||||
|
if (hits >= threshold) return 0;
|
||||||
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user