change translation engine to Bergamot so it will run locally

This commit is contained in:
Yo'av Moshe 2024-04-19 23:57:04 +02:00
parent 84d072b763
commit 2996430b66

View File

@ -6,7 +6,6 @@ from pathlib import Path
from tempfile import TemporaryDirectory from tempfile import TemporaryDirectory
from typing import Optional from typing import Optional
import requests
import tqdm import tqdm
from celery import Task from celery import Task
from celery import shared_task from celery import shared_task
@ -51,26 +50,19 @@ logger = logging.getLogger("paperless.tasks")
def translate_content(content): def translate_content(content):
import bergamot
headers = { service = bergamot.Service(bergamot.ServiceConfig())
"Authorization": "DeepL-Auth-Key " + settings.DEEPL_TOKEN,
"Content-Type": "application/json",
}
json_data = { model = service.modelFromConfigPath(
"text": [ bergamot.REPOSITORY.modelConfigPath("browsermt", settings.TRANSLATION_MODEL),
content,
],
"target_lang": settings.TRANSLATION_TARGET,
}
response = requests.post(
"https://api-free.deepl.com/v2/translate",
headers=headers,
json=json_data,
) )
result = service.translate(
return response.json()["translations"][0]["text"] model,
bergamot.VectorString([content]),
bergamot.ResponseOptions(),
)
return next(r.target.text for r in result)
@shared_task @shared_task
@ -268,10 +260,13 @@ def update_document_archive_file(document_id):
) )
oldDocument = Document.objects.get(pk=document.pk) oldDocument = Document.objects.get(pk=document.pk)
content = parser.get_text() content = parser.get_text()
translation = (
translate_content(content) if settings.TRANSLATION_MODEL else ""
)
Document.objects.filter(pk=document.pk).update( Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum, archive_checksum=checksum,
content=content, content=content,
translation=translate_content(content), translation=translation,
archive_filename=document.archive_filename, archive_filename=document.archive_filename,
) )
newDocument = Document.objects.get(pk=document.pk) newDocument = Document.objects.get(pk=document.pk)