From 0e608c587103d5ca0fe4deeca3e36749ca5cf463 Mon Sep 17 00:00:00 2001
From: Yo'av Moshe
Date: Sat, 20 Apr 2024 11:49:56 +0200
Subject: [PATCH] allow multiple models and avoid translation if not needed

---
 Pipfile                |  3 +++
 src/documents/tasks.py | 51 +++++++++++++++++++++++++++++++++---------
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/Pipfile b/Pipfile
index 034812088..1bcb1f012 100644
--- a/Pipfile
+++ b/Pipfile
@@ -94,3 +94,6 @@ types-Pygments = "*"
 types-colorama = "*"
 types-psycopg2 = "*"
 types-setuptools = "*"
+
+[translation]
+bergamot = "0.4.5"
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 359a436e6..4aaa219bd 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -13,6 +13,8 @@ from django.conf import settings
 from django.db import transaction
 from django.db.models.signals import post_save
 from filelock import FileLock
+from langdetect import LangDetectException
+from langdetect import detect
 from whoosh.writing import AsyncWriter
 
 from documents import index
@@ -52,17 +54,46 @@ logger = logging.getLogger("paperless.tasks")
 def translate_content(content):
+    """
+    Translate ``content`` into ``settings.TRANSLATION_TARGET_LANGUAGE``.
+
+    The source language is detected with langdetect. The first entry of the
+    comma-separated ``settings.TRANSLATION_MODELS`` whose name starts with the
+    detected language code is used for the translation.
+
+    Returns the translated text, or an empty string if the content is already
+    in the target language, its language cannot be detected, or no configured
+    model matches.
+    """
     import bergamot
 
-    service = bergamot.Service(bergamot.ServiceConfig())
+    try:
+        original_language = detect(content)
+    except LangDetectException:
+        # langdetect raises when the text has no usable features (e.g. empty)
+        return ""
 
-    model = service.modelFromConfigPath(
-        bergamot.REPOSITORY.modelConfigPath("browsermt", settings.TRANSLATION_MODEL),
-    )
-    result = service.translate(
-        model,
-        bergamot.VectorString([content]),
-        bergamot.ResponseOptions(),
-    )
-    return next(r.target.text for r in result)
+    # Avoid translating if we already have the target language
+    if original_language == settings.TRANSLATION_TARGET_LANGUAGE:
+        return ""
+
+    model_names = [name.strip() for name in settings.TRANSLATION_MODELS.split(",")]
+    for model_name in model_names:
+        # Model names start with the source language (e.g. "de-en-tiny"), so
+        # match on that prefix. A substring test would also pick models that
+        # translate *into* the detected language.
+        if not model_name.startswith(f"{original_language}-"):
+            continue
+
+        service = bergamot.Service(bergamot.ServiceConfig())
+        model = service.modelFromConfigPath(
+            bergamot.REPOSITORY.modelConfigPath("browsermt", model_name),
+        )
+        result = service.translate(
+            model,
+            bergamot.VectorString([content]),
+            bergamot.ResponseOptions(),
+        )
+        return next((r.target.text for r in result), "")
+    return ""
 
 
 @shared_task