From 57d8a0bea9ef4470be20ec0bfad70397a0f46b3c Mon Sep 17 00:00:00 2001 From: Anders Holck Date: Thu, 30 Apr 2026 22:01:46 +0200 Subject: [PATCH] Dual-translation similarity check to detect already-understood languages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Ask LLM for both English and skip-language translations (one call) - Compare skip-language result to original text (word overlap) - Suppress if 50%+ words match (was already in skip language) - Added på (UTF-8) to Swedish word filter - Eliminates false translations of Swedish with tech jargon --- main.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 11 deletions(-) diff --git a/main.c b/main.c index 5ab0c69..108bbda 100644 --- a/main.c +++ b/main.c @@ -124,6 +124,7 @@ static struct { int level; /* window to display in */ pid_t pid; char target[128]; /* channel/nick to echo translation to */ + char original[512]; /* original text for similarity check */ } translate_pending[MAX_TRANSLATE]; static int translate_count = 0; @@ -203,6 +204,7 @@ static int needs_translation(const char *text) "dig", "du", "vi", "de", "sig", "hade", "sedan", "bara", "\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r", + "p\xc3\xa5", NULL}}, {"english", {"the", "and", "that", "this", "with", "have", "was", "are", "you", "not", @@ -283,10 +285,11 @@ static void translate_async(const char *text, int level, const char *target) char body[2048]; snprintf(body, sizeof(body), "{\"model\":\"%s\",\"messages\":[" - "{\"role\":\"system\",\"content\":\"You are a language filter for IRC. Respond with ONLY the word SKIP or a translation. Rules: If the sentence structure and grammar is %s, respond SKIP. Code snippets, function names, URLs, and technical jargon inside a sentence do NOT change the language. Only translate if the grammar itself is in another language. Translate to %s.\"}," + "{\"role\":\"system\",\"content\":\"Translate the message to both %s and %s. Format: %s translation|||%s translation. Nothing else.\"}," "{\"role\":\"user\",\"content\":\"%s\"}" "],\"stream\":false,\"tool_choice\":\"none\"}", - ai_cfg.model, ai_cfg.skip_langs, ai_cfg.target_lang, escaped); + ai_cfg.model, ai_cfg.target_lang, ai_cfg.skip_langs, + ai_cfg.target_lang, ai_cfg.skip_langs, escaped); char req[4096]; int rlen = snprintf(req, sizeof(req), @@ -349,6 +352,9 @@ static void translate_async(const char *text, int level, const char *target) snprintf(translate_pending[translate_count].target, sizeof(translate_pending[translate_count].target), "%s", target ? target : ""); + snprintf(translate_pending[translate_count].original, + sizeof(translate_pending[translate_count].original), + "%s", text); translate_count++; } @@ -1528,25 +1534,70 @@ int main(int argc, char *argv[]) /* Strip trailing whitespace */ while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' ')) tbuf[--n] = '\0'; - /* Skip if LLM says it's already in a skip language */ - if (n == 0 || - strcasecmp(tbuf, "SKIP") == 0 || - strncasecmp(tbuf, "SKIP", 4) == 0) { - /* do nothing */ - } else { + if (n == 0) goto skip_translate; + + /* Parse "english|||swedish" format */ + char *sep = strstr(tbuf, "|||"); + char *english = tbuf; + char *skip_part = NULL; + if (sep) { + *sep = '\0'; + skip_part = sep + 3; + /* Strip leading space */ + while (*skip_part == ' ') skip_part++; + } + + /* Compare skip_part to original — if similar, suppress */ + int suppress = 0; + if (skip_part && translate_pending[ti].original[0]) { + /* Simple word overlap: count matching words */ + char orig_lower[512], skip_lower[512]; + snprintf(orig_lower, sizeof(orig_lower), "%s", + translate_pending[ti].original); + snprintf(skip_lower, sizeof(skip_lower), "%s", skip_part); + for (char *p = orig_lower; *p; p++) + if (*p >= 'A' && *p <= 'Z') *p += 32; + for (char *p = skip_lower; *p; p++) + if (*p >= 'A' && *p <= 'Z') *p += 32; + /* Count words in original that appear in skip translation */ + int total = 0, matches = 0; + char *tok = strtok(orig_lower, " ,.!?:;"); + while (tok) { + if (strlen(tok) > 2) { + total++; + if (strstr(skip_lower, tok)) matches++; + } + tok = strtok(NULL, " ,.!?:;"); + } + if (total > 0 && matches * 100 / total >= 50) + suppress = 1; + } + + /* Also check old SKIP response */ + if (strcasecmp(english, "SKIP") == 0 || + strncasecmp(english, "SKIP", 4) == 0) + suppress = 1; + + if (!suppress && english[0]) { + /* Strip trailing whitespace from english */ + size_t elen = strlen(english); + while (elen > 0 && (english[elen-1] == ' ' || + english[elen-1] == '\n')) + english[--elen] = '\0'; wprintf(translate_pending[ti].level, - " \033[3m%s\033[0m\n", tbuf); + " \033[3m%s\033[0m\n", english); if (translate_pending[ti].target[0]) { char pfx[IRC_MAX]; snprintf(pfx, sizeof(pfx), "PRIVMSG %s :", translate_pending[ti].target); irc_send_converted(pfx, - (unsigned char *)tbuf, strlen(tbuf)); + (unsigned char *)english, strlen(english)); wprintf(translate_pending[ti].level, - "<%s> %s\n", nick, tbuf); + "<%s> %s\n", nick, english); } } } + skip_translate: translate_count--; if (ti < translate_count) translate_pending[ti] = translate_pending[translate_count];