Dual-translation similarity check to detect already-understood languages

- Ask the LLM for both the English and the skip-language translation in one call
- Compare the skip-language result to the original text (word overlap)
- Suppress the translation if 50% or more of the words match (the message was already in the skip language)
- Added på (UTF-8) to the Swedish word filter
- Eliminates false translations of Swedish messages that contain tech jargon
2026-04-30 22:01:46 +02:00
parent 6f1c8d1f5b
commit 57d8a0bea9
1 changed file with 62 additions and 11 deletions
@@ -124,6 +124,7 @@ static struct {
 	int level;    /* window to display in */
 	pid_t pid;
 	char target[128]; /* channel/nick to echo translation to */
 	char original[512]; /* original text for similarity check */
 } translate_pending[MAX_TRANSLATE];
 static int translate_count = 0;
@@ -203,6 +204,7 @@ static int needs_translation(const char *text)
 		             "dig", "du", "vi", "de", "sig",
 		             "hade", "sedan", "bara",
 		             "\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r",
 		             "p\xc3\xa5",
 		             NULL}},
 		{"english", {"the", "and", "that", "this", "with",
 		             "have", "was", "are", "you", "not",
@@ -283,10 +285,11 @@ static void translate_async(const char *text, int level, const char *target)
 		char body[2048];
 		snprintf(body, sizeof(body),
 			"{\"model\":\"%s\",\"messages\":["
-			"{\"role\":\"system\",\"content\":\"You are a language filter for IRC. Respond with ONLY the word SKIP or a translation. Rules: If the sentence structure and grammar is %s, respond SKIP. Code snippets, function names, URLs, and technical jargon inside a sentence do NOT change the language. Only translate if the grammar itself is in another language. Translate to %s.\"},"
+			"{\"role\":\"system\",\"content\":\"Translate the message to both %s and %s. Format: %s translation|||%s translation. Nothing else.\"},"
 			"{\"role\":\"user\",\"content\":\"%s\"}"
 			"],\"stream\":false,\"tool_choice\":\"none\"}",
-			ai_cfg.model, ai_cfg.skip_langs, ai_cfg.target_lang, escaped);
+			ai_cfg.model, ai_cfg.target_lang, ai_cfg.skip_langs,
 			ai_cfg.target_lang, ai_cfg.skip_langs, escaped);
 		char req[4096];
 		int rlen = snprintf(req, sizeof(req),
@@ -349,6 +352,9 @@ static void translate_async(const char *text, int level, const char *target)
 	snprintf(translate_pending[translate_count].target,
 	         sizeof(translate_pending[translate_count].target),
 	         "%s", target ? target : "");
 	snprintf(translate_pending[translate_count].original,
 	         sizeof(translate_pending[translate_count].original),
 	         "%s", text);
 	translate_count++;
 }
@@ -1528,25 +1534,70 @@ int main(int argc, char *argv[])
 					/* Strip trailing whitespace */
 					while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' '))
 						tbuf[--n] = '\0';
-					/* Skip if LLM says it's already in a skip language */
+					if (n == 0) goto skip_translate;
-					if (n == 0 ||
+
-					    strcasecmp(tbuf, "SKIP") == 0 ||
+					/* Parse "english|||swedish" format */
-					    strncasecmp(tbuf, "SKIP", 4) == 0) {
+					char *sep = strstr(tbuf, "|||");
-						/* do nothing */
+					char *english = tbuf;
-					} else {
+					char *skip_part = NULL;
 					if (sep) {
 						*sep = '\0';
 						skip_part = sep + 3;
 						/* Strip leading space */
 						while (*skip_part == ' ') skip_part++;
 					}
 					/* Compare skip_part to original — if similar, suppress */
 					int suppress = 0;
 					if (skip_part && translate_pending[ti].original[0]) {
 						/* Simple word overlap: count matching words */
 						char orig_lower[512], skip_lower[512];
 						snprintf(orig_lower, sizeof(orig_lower), "%s",
 						         translate_pending[ti].original);
 						snprintf(skip_lower, sizeof(skip_lower), "%s", skip_part);
 						for (char *p = orig_lower; *p; p++)
 							if (*p >= 'A' && *p <= 'Z') *p += 32;
 						for (char *p = skip_lower; *p; p++)
 							if (*p >= 'A' && *p <= 'Z') *p += 32;
 						/* Count words in original that appear in skip translation */
 						int total = 0, matches = 0;
 						char *tok = strtok(orig_lower, " ,.!?:;");
 						while (tok) {
 							if (strlen(tok) > 2) {
 								total++;
 								if (strstr(skip_lower, tok)) matches++;
 							}
 							tok = strtok(NULL, " ,.!?:;");
 						}
 						if (total > 0 && matches * 100 / total >= 50)
 							suppress = 1;
 					}
 					/* Also check old SKIP response */
 					if (strcasecmp(english, "SKIP") == 0 ||
 					    strncasecmp(english, "SKIP", 4) == 0)
 						suppress = 1;
 					if (!suppress && english[0]) {
 						/* Strip trailing whitespace from english */
 						size_t elen = strlen(english);
 						while (elen > 0 && (english[elen-1] == ' ' ||
 						       english[elen-1] == '\n'))
 							english[--elen] = '\0';
 						wprintf(translate_pending[ti].level,
-						        "  \033[3m%s\033[0m\n", tbuf);
+						        "  \033[3m%s\033[0m\n", english);
 						if (translate_pending[ti].target[0]) {
 							char pfx[IRC_MAX];
 							snprintf(pfx, sizeof(pfx), "PRIVMSG %s :",
 							         translate_pending[ti].target);
 							irc_send_converted(pfx,
-							    (unsigned char *)tbuf, strlen(tbuf));
+							    (unsigned char *)english, strlen(english));
 							wprintf(translate_pending[ti].level,
-							        "<%s> %s\n", nick, tbuf);
+							        "<%s> %s\n", nick, english);
 						}
 					}
 				}
 				skip_translate:
 				translate_count--;
 				if (ti < translate_count)
 					translate_pending[ti] = translate_pending[translate_count];