Dual-translation similarity check to detect already-understood languages
- Ask the LLM for both the English and the skip-language translation in a single call.
- Compare the skip-language result to the original text using word overlap.
- Suppress the translation if 50% or more of the words match (the message was already in the skip language).
- Add "på" (UTF-8) to the Swedish word filter.
- Eliminate false translations of Swedish text that contains tech jargon.
This commit is contained in:
@@ -124,6 +124,7 @@ static struct {
|
|||||||
int level; /* window to display in */
|
int level; /* window to display in */
|
||||||
pid_t pid;
|
pid_t pid;
|
||||||
char target[128]; /* channel/nick to echo translation to */
|
char target[128]; /* channel/nick to echo translation to */
|
||||||
|
char original[512]; /* original text for similarity check */
|
||||||
} translate_pending[MAX_TRANSLATE];
|
} translate_pending[MAX_TRANSLATE];
|
||||||
static int translate_count = 0;
|
static int translate_count = 0;
|
||||||
|
|
||||||
@@ -203,6 +204,7 @@ static int needs_translation(const char *text)
|
|||||||
"dig", "du", "vi", "de", "sig",
|
"dig", "du", "vi", "de", "sig",
|
||||||
"hade", "sedan", "bara",
|
"hade", "sedan", "bara",
|
||||||
"\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r",
|
"\xc3\xa4r", "\xc3\xa5", "f\xc3\xb6r",
|
||||||
|
"p\xc3\xa5",
|
||||||
NULL}},
|
NULL}},
|
||||||
{"english", {"the", "and", "that", "this", "with",
|
{"english", {"the", "and", "that", "this", "with",
|
||||||
"have", "was", "are", "you", "not",
|
"have", "was", "are", "you", "not",
|
||||||
@@ -283,10 +285,11 @@ static void translate_async(const char *text, int level, const char *target)
|
|||||||
char body[2048];
|
char body[2048];
|
||||||
snprintf(body, sizeof(body),
|
snprintf(body, sizeof(body),
|
||||||
"{\"model\":\"%s\",\"messages\":["
|
"{\"model\":\"%s\",\"messages\":["
|
||||||
"{\"role\":\"system\",\"content\":\"You are a language filter for IRC. Respond with ONLY the word SKIP or a translation. Rules: If the sentence structure and grammar is %s, respond SKIP. Code snippets, function names, URLs, and technical jargon inside a sentence do NOT change the language. Only translate if the grammar itself is in another language. Translate to %s.\"},"
|
"{\"role\":\"system\",\"content\":\"Translate the message to both %s and %s. Format: %s translation|||%s translation. Nothing else.\"},"
|
||||||
"{\"role\":\"user\",\"content\":\"%s\"}"
|
"{\"role\":\"user\",\"content\":\"%s\"}"
|
||||||
"],\"stream\":false,\"tool_choice\":\"none\"}",
|
"],\"stream\":false,\"tool_choice\":\"none\"}",
|
||||||
ai_cfg.model, ai_cfg.skip_langs, ai_cfg.target_lang, escaped);
|
ai_cfg.model, ai_cfg.target_lang, ai_cfg.skip_langs,
|
||||||
|
ai_cfg.target_lang, ai_cfg.skip_langs, escaped);
|
||||||
|
|
||||||
char req[4096];
|
char req[4096];
|
||||||
int rlen = snprintf(req, sizeof(req),
|
int rlen = snprintf(req, sizeof(req),
|
||||||
@@ -349,6 +352,9 @@ static void translate_async(const char *text, int level, const char *target)
|
|||||||
snprintf(translate_pending[translate_count].target,
|
snprintf(translate_pending[translate_count].target,
|
||||||
sizeof(translate_pending[translate_count].target),
|
sizeof(translate_pending[translate_count].target),
|
||||||
"%s", target ? target : "");
|
"%s", target ? target : "");
|
||||||
|
snprintf(translate_pending[translate_count].original,
|
||||||
|
sizeof(translate_pending[translate_count].original),
|
||||||
|
"%s", text);
|
||||||
translate_count++;
|
translate_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1528,25 +1534,70 @@ int main(int argc, char *argv[])
|
|||||||
/* Strip trailing whitespace */
|
/* Strip trailing whitespace */
|
||||||
while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' '))
|
while (n > 0 && (tbuf[n-1] == '\n' || tbuf[n-1] == '\r' || tbuf[n-1] == ' '))
|
||||||
tbuf[--n] = '\0';
|
tbuf[--n] = '\0';
|
||||||
/* Skip if LLM says it's already in a skip language */
|
if (n == 0) goto skip_translate;
|
||||||
if (n == 0 ||
|
|
||||||
strcasecmp(tbuf, "SKIP") == 0 ||
|
/* Parse "english|||swedish" format */
|
||||||
strncasecmp(tbuf, "SKIP", 4) == 0) {
|
char *sep = strstr(tbuf, "|||");
|
||||||
/* do nothing */
|
char *english = tbuf;
|
||||||
} else {
|
char *skip_part = NULL;
|
||||||
|
if (sep) {
|
||||||
|
*sep = '\0';
|
||||||
|
skip_part = sep + 3;
|
||||||
|
/* Strip leading space */
|
||||||
|
while (*skip_part == ' ') skip_part++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compare skip_part to original — if similar, suppress */
|
||||||
|
int suppress = 0;
|
||||||
|
if (skip_part && translate_pending[ti].original[0]) {
|
||||||
|
/* Simple word overlap: count matching words */
|
||||||
|
char orig_lower[512], skip_lower[512];
|
||||||
|
snprintf(orig_lower, sizeof(orig_lower), "%s",
|
||||||
|
translate_pending[ti].original);
|
||||||
|
snprintf(skip_lower, sizeof(skip_lower), "%s", skip_part);
|
||||||
|
for (char *p = orig_lower; *p; p++)
|
||||||
|
if (*p >= 'A' && *p <= 'Z') *p += 32;
|
||||||
|
for (char *p = skip_lower; *p; p++)
|
||||||
|
if (*p >= 'A' && *p <= 'Z') *p += 32;
|
||||||
|
/* Count words in original that appear in skip translation */
|
||||||
|
int total = 0, matches = 0;
|
||||||
|
char *tok = strtok(orig_lower, " ,.!?:;");
|
||||||
|
while (tok) {
|
||||||
|
if (strlen(tok) > 2) {
|
||||||
|
total++;
|
||||||
|
if (strstr(skip_lower, tok)) matches++;
|
||||||
|
}
|
||||||
|
tok = strtok(NULL, " ,.!?:;");
|
||||||
|
}
|
||||||
|
if (total > 0 && matches * 100 / total >= 50)
|
||||||
|
suppress = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Also check old SKIP response */
|
||||||
|
if (strcasecmp(english, "SKIP") == 0 ||
|
||||||
|
strncasecmp(english, "SKIP", 4) == 0)
|
||||||
|
suppress = 1;
|
||||||
|
|
||||||
|
if (!suppress && english[0]) {
|
||||||
|
/* Strip trailing whitespace from english */
|
||||||
|
size_t elen = strlen(english);
|
||||||
|
while (elen > 0 && (english[elen-1] == ' ' ||
|
||||||
|
english[elen-1] == '\n'))
|
||||||
|
english[--elen] = '\0';
|
||||||
wprintf(translate_pending[ti].level,
|
wprintf(translate_pending[ti].level,
|
||||||
" \033[3m%s\033[0m\n", tbuf);
|
" \033[3m%s\033[0m\n", english);
|
||||||
if (translate_pending[ti].target[0]) {
|
if (translate_pending[ti].target[0]) {
|
||||||
char pfx[IRC_MAX];
|
char pfx[IRC_MAX];
|
||||||
snprintf(pfx, sizeof(pfx), "PRIVMSG %s :",
|
snprintf(pfx, sizeof(pfx), "PRIVMSG %s :",
|
||||||
translate_pending[ti].target);
|
translate_pending[ti].target);
|
||||||
irc_send_converted(pfx,
|
irc_send_converted(pfx,
|
||||||
(unsigned char *)tbuf, strlen(tbuf));
|
(unsigned char *)english, strlen(english));
|
||||||
wprintf(translate_pending[ti].level,
|
wprintf(translate_pending[ti].level,
|
||||||
"<%s> %s\n", nick, tbuf);
|
"<%s> %s\n", nick, english);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
skip_translate:
|
||||||
translate_count--;
|
translate_count--;
|
||||||
if (ti < translate_count)
|
if (ti < translate_count)
|
||||||
translate_pending[ti] = translate_pending[translate_count];
|
translate_pending[ti] = translate_pending[translate_count];
|
||||||
|
|||||||
Reference in New Issue
Block a user