Dual-translation similarity check to detect already-understood languages

- Ask LLM for both English and skip-language translations (one call)
- Compare skip-language result to original text (word overlap)
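- Suppress the translation if 50% or more of the original's words (longer than 2 characters) appear in the skip-language result, i.e. the message was already in a skip language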
- Add "på" (UTF-8 encoded) to the Swedish word filter
- Eliminates false translations of Swedish with tech jargon
This commit is contained in:
2026-04-30 22:01:46 +02:00
parent 6f1c8d1f5b
commit 57d8a0bea9
+62 -11
View File
@@ -124,6 +124,7 @@ static struct {
int level; /* window to display in */ int level; /* window to display in */
pid_t pid; pid_t pid;
char target[128]; /* channel/nick to echo translation to */ char target[128]; /* channel/nick to echo translation to */
char original[512]; /* original text for similarity check */
} translate_pending[MAX_TRANSLATE]; } translate_pending[MAX_TRANSLATE];
static int translate_count = 0; static int translate_count = 0;
@@ -203,6 +204,7 @@ static int needs_translation(const char *text)
"dig", "du", "vi", "de", "sig", "dig", "du", "vi", "de", "sig",
"hade", "sedan", "bara", "hade", "sedan", "bara",
"\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r", "\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r",
"p\xc3\xa5",
NULL}}, NULL}},
{"english", {"the", "and", "that", "this", "with", {"english", {"the", "and", "that", "this", "with",
"have", "was", "are", "you", "not", "have", "was", "are", "you", "not",
@@ -283,10 +285,11 @@ static void translate_async(const char *text, int level, const char *target)
char body[2048]; char body[2048];
snprintf(body, sizeof(body), snprintf(body, sizeof(body),
"{\"model\":\"%s\",\"messages\":[" "{\"model\":\"%s\",\"messages\":["
"{\"role\":\"system\",\"content\":\"You are a language filter for IRC. Respond with ONLY the word SKIP or a translation. Rules: If the sentence structure and grammar is %s, respond SKIP. Code snippets, function names, URLs, and technical jargon inside a sentence do NOT change the language. Only translate if the grammar itself is in another language. Translate to %s.\"}," "{\"role\":\"system\",\"content\":\"Translate the message to both %s and %s. Format: %s translation|||%s translation. Nothing else.\"},"
"{\"role\":\"user\",\"content\":\"%s\"}" "{\"role\":\"user\",\"content\":\"%s\"}"
"],\"stream\":false,\"tool_choice\":\"none\"}", "],\"stream\":false,\"tool_choice\":\"none\"}",
ai_cfg.model, ai_cfg.skip_langs, ai_cfg.target_lang, escaped); ai_cfg.model, ai_cfg.target_lang, ai_cfg.skip_langs,
ai_cfg.target_lang, ai_cfg.skip_langs, escaped);
char req[4096]; char req[4096];
int rlen = snprintf(req, sizeof(req), int rlen = snprintf(req, sizeof(req),
@@ -349,6 +352,9 @@ static void translate_async(const char *text, int level, const char *target)
snprintf(translate_pending[translate_count].target, snprintf(translate_pending[translate_count].target,
sizeof(translate_pending[translate_count].target), sizeof(translate_pending[translate_count].target),
"%s", target ? target : ""); "%s", target ? target : "");
snprintf(translate_pending[translate_count].original,
sizeof(translate_pending[translate_count].original),
"%s", text);
translate_count++; translate_count++;
} }
@@ -1528,25 +1534,70 @@ int main(int argc, char *argv[])
/* Strip trailing whitespace */ /* Strip trailing whitespace */
while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' ')) while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' '))
tbuf[--n] = '\0'; tbuf[--n] = '\0';
/* Skip if LLM says it's already in a skip language */ if (n == 0) goto skip_translate;
if (n == 0 ||
strcasecmp(tbuf, "SKIP") == 0 || /* Parse "english|||swedish" format */
strncasecmp(tbuf, "SKIP", 4) == 0) { char *sep = strstr(tbuf, "|||");
/* do nothing */ char *english = tbuf;
} else { char *skip_part = NULL;
if (sep) {
*sep = '\0';
skip_part = sep + 3;
/* Strip leading space */
while (*skip_part == ' ') skip_part++;
}
/* Compare skip_part to original — if similar, suppress */
int suppress = 0;
if (skip_part && translate_pending[ti].original[0]) {
/* Simple word overlap: count matching words */
char orig_lower[512], skip_lower[512];
snprintf(orig_lower, sizeof(orig_lower), "%s",
translate_pending[ti].original);
snprintf(skip_lower, sizeof(skip_lower), "%s", skip_part);
for (char *p = orig_lower; *p; p++)
if (*p >= 'A' && *p <= 'Z') *p += 32;
for (char *p = skip_lower; *p; p++)
if (*p >= 'A' && *p <= 'Z') *p += 32;
/* Count words in original that appear in skip translation */
int total = 0, matches = 0;
char *tok = strtok(orig_lower, " ,.!?:;");
while (tok) {
if (strlen(tok) > 2) {
total++;
if (strstr(skip_lower, tok)) matches++;
}
tok = strtok(NULL, " ,.!?:;");
}
if (total > 0 && matches * 100 / total >= 50)
suppress = 1;
}
/* Also check old SKIP response */
if (strcasecmp(english, "SKIP") == 0 ||
strncasecmp(english, "SKIP", 4) == 0)
suppress = 1;
if (!suppress && english[0]) {
/* Strip trailing whitespace from english */
size_t elen = strlen(english);
while (elen > 0 && (english[elen-1] == ' ' ||
english[elen-1] == '\n'))
english[--elen] = '\0';
wprintf(translate_pending[ti].level, wprintf(translate_pending[ti].level,
" \033[3m%s\033[0m\n", tbuf); " \033[3m%s\033[0m\n", english);
if (translate_pending[ti].target[0]) { if (translate_pending[ti].target[0]) {
char pfx[IRC_MAX]; char pfx[IRC_MAX];
snprintf(pfx, sizeof(pfx), "PRIVMSG %s :", snprintf(pfx, sizeof(pfx), "PRIVMSG %s :",
translate_pending[ti].target); translate_pending[ti].target);
irc_send_converted(pfx, irc_send_converted(pfx,
(unsigned char *)tbuf, strlen(tbuf)); (unsigned char *)english, strlen(english));
wprintf(translate_pending[ti].level, wprintf(translate_pending[ti].level,
"<%s> %s\n", nick, tbuf); "<%s> %s\n", nick, english);
} }
} }
} }
skip_translate:
translate_count--; translate_count--;
if (ti < translate_count) if (ti < translate_count)
translate_pending[ti] = translate_pending[translate_count]; translate_pending[ti] = translate_pending[translate_count];