diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 8913b1b6f..25b58b3c9 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -41,6 +41,8 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME from documents.utils import copy_file_with_basic_stats from paperless import version from paperless.db import GnuPG +from paperless.models import CommonSettings +from paperless.models import OcrSettings from paperless_mail.models import MailAccount from paperless_mail.models import MailRule @@ -291,6 +293,14 @@ class Command(BaseCommand): serializers.serialize("json", CustomField.objects.all()), ) + manifest += json.loads( + serializers.serialize("json", CommonSettings.objects.all()), + ) + + manifest += json.loads( + serializers.serialize("json", OcrSettings.objects.all()), + ) + # These are treated specially and included in the per-document manifest # if that setting is enabled. 
Otherwise, they are just exported to the bulk # manifest diff --git a/src/paperless/migrations/0001_initial.py b/src/paperless/migrations/0001_initial.py index db6617a6e..efe1ecd99 100644 --- a/src/paperless/migrations/0001_initial.py +++ b/src/paperless/migrations/0001_initial.py @@ -9,7 +9,7 @@ def _create_singleton(apps, schema_editor): """ Creates the first and only instance of the settings models """ - for model_name in ["CommonSettings", "OcrSettings", "TextSettings", "TikaSettings"]: + for model_name in ["CommonSettings", "OcrSettings"]: settings_model = apps.get_model("paperless", model_name) settings_model.objects.create() @@ -189,60 +189,5 @@ class Migration(migrations.Migration): "verbose_name": "ocr settings", }, ), - migrations.CreateModel( - name="TextSettings", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "thumbnail_font_name", - models.CharField( - blank=True, - max_length=64, - null=True, - verbose_name="Sets the output PDF type", - ), - ), - ], - options={ - "abstract": False, - }, - ), - migrations.CreateModel( - name="TikaSettings", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "tika_url", - models.URLField(blank=True, null=True, verbose_name="Tika URL"), - ), - ( - "gotenberg_url", - models.URLField( - blank=True, - null=True, - verbose_name="Gotenberg URL", - ), - ), - ], - options={ - "abstract": False, - }, - ), migrations.RunPython(_create_singleton, migrations.RunPython.noop), ] diff --git a/src/paperless/models.py b/src/paperless/models.py index 2b0ecb056..643aeb6fd 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -146,40 +146,3 @@ class OcrSettings(AbstractSingletonModel): def __str__(self) -> str: return "OcrSettings" - - -class TextSettings(AbstractSingletonModel): - """ - Settings for the text parser - """ - - 
thumbnail_font_name = models.CharField( - verbose_name=_("Sets the output PDF type"), - null=True, - blank=True, - max_length=64, - ) - - -class TikaSettings(AbstractSingletonModel): - """ - Settings for the Tika parser - """ - - tika_url = models.URLField( - verbose_name=_("Tika URL"), - null=True, - blank=True, - ) - gotenberg_url = models.URLField( - verbose_name=_("Gotenberg URL"), - null=True, - blank=True, - ) - - -class ConsumerSettings(AbstractSingletonModel): - delete_duplicates = models.BooleanField( - verbose_name=_("Delete duplicate consumer files"), - null=True, - ) diff --git a/src/paperless/serialisers.py b/src/paperless/serialisers.py index 496a06c5f..473c7ee6b 100644 --- a/src/paperless/serialisers.py +++ b/src/paperless/serialisers.py @@ -3,6 +3,7 @@ from django.contrib.auth.models import Permission from django.contrib.auth.models import User from rest_framework import serializers +from paperless.models import CommonSettings from paperless.models import OcrSettings @@ -117,6 +118,12 @@ class ProfileSerializer(serializers.ModelSerializer): ) +class CommonSettingsSerializer(serializers.ModelSerializer): + class Meta: + model = CommonSettings + fields = "__all__" + + class OcrSettingsSerializer(serializers.ModelSerializer): class Meta: model = OcrSettings diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 67fddbee0..35e2bd989 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -34,9 +34,11 @@ from documents.views import TasksViewSet from documents.views import UiSettingsView from documents.views import UnifiedSearchViewSet from paperless.consumers import StatusConsumer +from paperless.views import CommonSettingsViewSet from paperless.views import FaviconView from paperless.views import GenerateAuthTokenView from paperless.views import GroupViewSet +from paperless.views import OcrSettingsViewSet from paperless.views import ProfileView from paperless.views import UserViewSet from paperless_mail.views import MailAccountTestView
@@ -59,6 +61,8 @@ api_router.register(r"mail_rules", MailRuleViewSet) api_router.register(r"share_links", ShareLinkViewSet) api_router.register(r"consumption_templates", ConsumptionTemplateViewSet) api_router.register(r"custom_fields", CustomFieldViewSet) +api_router.register(r"common_settings", CommonSettingsViewSet) +api_router.register(r"ocr_settings", OcrSettingsViewSet) urlpatterns = [ diff --git a/src/paperless/views.py b/src/paperless/views.py index 5d04b6bf4..4229d3836 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -18,7 +18,9 @@ from rest_framework.viewsets import ModelViewSet from documents.permissions import PaperlessObjectPermissions from paperless.filters import GroupFilterSet from paperless.filters import UserFilterSet +from paperless.models import CommonSettings from paperless.models import OcrSettings +from paperless.serialisers import CommonSettingsSerializer from paperless.serialisers import GroupSerializer from paperless.serialisers import OcrSettingsSerializer from paperless.serialisers import ProfileSerializer @@ -164,6 +166,15 @@ class GenerateAuthTokenView(GenericAPIView): ) +class CommonSettingsViewSet(ModelViewSet): + model = CommonSettings + + queryset = CommonSettings.objects + + serializer_class = CommonSettingsSerializer + permission_classes = (IsAuthenticated,) + + class OcrSettingsViewSet(ModelViewSet): model = OcrSettings diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 70b926432..e4e56fb8c 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -3,6 +3,7 @@ import re import subprocess import tempfile from pathlib import Path +from typing import TYPE_CHECKING from typing import Optional from django.conf import settings @@ -11,7 +12,7 @@ from PIL import Image from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf -from paperless_tesseract.models import 
OcrSettings as OcrSettingModel +from paperless.models import OcrSettings as OcrSettingModel from paperless_tesseract.setting_schema import OcrSetting from paperless_tesseract.setting_schema import get_ocr_settings @@ -71,7 +72,7 @@ class RasterisedDocumentParser(DocumentParser): self.logging_group, ) - def is_image(self, mime_type): + def is_image(self, mime_type) -> bool: return mime_type in [ "image/png", "image/jpeg", @@ -81,7 +82,7 @@ class RasterisedDocumentParser(DocumentParser): "image/webp", ] - def has_alpha(self, image): + def has_alpha(self, image) -> bool: with Image.open(image) as im: return im.mode in ("RGBA", "LA") @@ -96,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser): ], ) - def get_dpi(self, image): + def get_dpi(self, image) -> Optional[int]: try: with Image.open(image) as im: x, y = im.info["dpi"] @@ -105,7 +106,7 @@ class RasterisedDocumentParser(DocumentParser): self.log.warning(f"Error while getting DPI from image {image}: {e}") return None - def calculate_a4_dpi(self, image): + def calculate_a4_dpi(self, image) -> Optional[int]: try: with Image.open(image) as im: width, height = im.size @@ -118,7 +119,11 @@ class RasterisedDocumentParser(DocumentParser): self.log.warning(f"Error while calculating DPI for image {image}: {e}") return None - def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path): + def extract_text( + self, + sidecar_file: Optional[Path], + pdf_file: Path, + ) -> Optional[str]: # When re-doing OCR, the sidecar contains ONLY the new text, not # the whole text, so do not utilize it in that case if ( @@ -179,7 +184,8 @@ class RasterisedDocumentParser(DocumentParser): sidecar_file, safe_fallback=False, ): - assert isinstance(self.parser_settings, OcrSetting) + if TYPE_CHECKING: + assert isinstance(self.parser_settings, OcrSetting) ocrmypdf_args = { "input_file": input_file, "output_file": output_file, diff --git a/src/paperless_tesseract/setting_schema.py b/src/paperless_tesseract/setting_schema.py index 
5a9527734..907241ebd 100644 --- a/src/paperless_tesseract/setting_schema.py +++ b/src/paperless_tesseract/setting_schema.py @@ -4,7 +4,8 @@ from typing import Optional from django.conf import settings -from paperless_tesseract.models import OcrSettings as OcrSettingModel +from paperless.models import CommonSettings +from paperless.models import OcrSettings as OcrSettingModel @dataclasses.dataclass(frozen=True) @@ -25,16 +26,20 @@ class OcrSetting: def get_ocr_settings() -> OcrSetting: - db_settings = OcrSettingModel.objects.all().first() + ocr_db_settings = OcrSettingModel.objects.all().first() # Workaround for a test where the migration hasn't run to create the single model - if db_settings is None: + if ocr_db_settings is None: OcrSettingModel.objects.create() - db_settings = OcrSettingModel.objects.all().first() - assert db_settings is not None + ocr_db_settings = OcrSettingModel.objects.all().first() + + cmn_db_settings = CommonSettings.objects.all().first() + if cmn_db_settings is None: + CommonSettings.objects.create() + cmn_db_settings = CommonSettings.objects.all().first() user_args = None - if db_settings.user_args: - user_args = db_settings.user_args + if ocr_db_settings.user_args: + user_args = ocr_db_settings.user_args elif settings.OCR_USER_ARGS is not None: try: user_args = json.loads(settings.OCR_USER_ARGS) @@ -42,23 +47,25 @@ def get_ocr_settings() -> OcrSetting: user_args = {} return OcrSetting( - pages=db_settings.pages or settings.OCR_PAGES, - language=db_settings.language or settings.OCR_LANGUAGE, - output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE, - mode=db_settings.mode or settings.OCR_MODE, + pages=ocr_db_settings.pages or settings.OCR_PAGES, + language=ocr_db_settings.language or settings.OCR_LANGUAGE, + output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE, + mode=ocr_db_settings.mode or settings.OCR_MODE, skip_archive_file=( - db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE + 
ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE ), - image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI, - clean=db_settings.unpaper_clean or settings.OCR_CLEAN, - deskew=db_settings.deskew or settings.OCR_DESKEW, - rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES, + image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI, + clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN, + deskew=ocr_db_settings.deskew or settings.OCR_DESKEW, + rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES, rotate_threshold=( - db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD + ocr_db_settings.rotate_pages_threshold + or settings.OCR_ROTATE_PAGES_THRESHOLD ), - max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS, + max_image_pixel=ocr_db_settings.max_image_pixels + or settings.OCR_MAX_IMAGE_PIXELS, color_conversion_strategy=( - db_settings.color_conversion_strategy + ocr_db_settings.color_conversion_strategy or settings.OCR_COLOR_CONVERSION_STRATEGY ), user_args=user_args, diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 1d7a1cafb..f64cb69f0 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -2,7 +2,6 @@ import os import shutil import tempfile import uuid -from contextlib import AbstractContextManager from pathlib import Path from unittest import mock @@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin from paperless_tesseract.parsers import RasterisedDocumentParser from paperless_tesseract.parsers import post_process_text -image_to_string_calls = [] - - -def fake_convert(input_file, output_file, **kwargs): - with open(input_file) as f: - lines = f.readlines() - - for i, line in enumerate(lines): - with open(output_file % i, "w") as f2: - f2.write(line.strip()) - - -class FakeImageFile(AbstractContextManager): - def 
__init__(self, fname): - self.fname = fname - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - def __enter__(self): - return os.path.basename(self.fname) - class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): SAMPLE_FILES = Path(__file__).resolve().parent / "samples" diff --git a/src/paperless_tesseract/tests/test_parser_custom_settings.py b/src/paperless_tesseract/tests/test_parser_custom_settings.py new file mode 100644 index 000000000..f2c663b14 --- /dev/null +++ b/src/paperless_tesseract/tests/test_parser_custom_settings.py @@ -0,0 +1,120 @@ +from django.test import TestCase +from django.test import override_settings + +from documents.tests.utils import DirectoriesMixin +from documents.tests.utils import FileSystemAssertsMixin +from paperless.models import CommonSettings +from paperless.models import OcrSettings +from paperless_tesseract.parsers import RasterisedDocumentParser + + +class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase): + @staticmethod + def get_params(): + return RasterisedDocumentParser(None).construct_ocrmypdf_parameters( + input_file="input.pdf", + output_file="output.pdf", + sidecar_file="sidecar.txt", + mime_type="application/pdf", + safe_fallback=False, + ) + + def test_db_settings_ocr_pages(self): + with override_settings(OCR_PAGES=10): + instance = OcrSettings.objects.all().first() + instance.pages = 5 + instance.save() + + params = self.get_params() + self.assertEqual(params["pages"], "1-5") + + def test_db_settings_ocr_language(self): + with override_settings(OCR_LANGUAGE="eng+deu"): + instance = OcrSettings.objects.all().first() + instance.language = "fra+ita" + instance.save() + + params = self.get_params() + self.assertEqual(params["language"], "fra+ita") + + def test_db_settings_ocr_output_type(self): + with override_settings(OCR_OUTPUT_TYPE="pdfa-3"): + instance = CommonSettings.objects.all().first() + instance.output_type = CommonSettings.OutputTypeChoices.PDF_A +
instance.save() + + params = self.get_params() + self.assertEqual(params["output_type"], "pdfa") + + def test_db_settings_ocr_mode(self): + with override_settings(OCR_MODE="redo"): + instance = OcrSettings.objects.all().first() + instance.mode = OcrSettings.ModeChoices.SKIP + instance.save() + + params = self.get_params() + self.assertTrue(params["skip_text"]) + self.assertNotIn("redo_ocr", params) + self.assertNotIn("force_ocr", params) + + def test_db_settings_ocr_clean(self): + with override_settings(OCR_CLEAN="clean-final"): + instance = OcrSettings.objects.all().first() + instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN + instance.save() + + params = self.get_params() + self.assertTrue(params["clean"]) + self.assertNotIn("clean_final", params) + + with override_settings(OCR_CLEAN="clean-final"): + instance = OcrSettings.objects.all().first() + instance.unpaper_clean = OcrSettings.CleanChoices.FINAL + instance.save() + + params = self.get_params() + self.assertTrue(params["clean_final"]) + self.assertNotIn("clean", params) + + def test_db_settings_ocr_deskew(self): + with override_settings(OCR_DESKEW=False): + instance = OcrSettings.objects.all().first() + instance.deskew = True + instance.save() + + params = self.get_params() + self.assertTrue(params["deskew"]) + + def test_db_settings_ocr_rotate(self): + with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0): + instance = OcrSettings.objects.all().first() + instance.rotate_pages = True + instance.rotate_pages_threshold = 15.0 + instance.save() + + params = self.get_params() + self.assertTrue(params["rotate_pages"]) + self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0) + + def test_db_settings_ocr_max_pixels(self): + with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0): + instance = OcrSettings.objects.all().first() + instance.max_image_pixels = 1_000_000.0 + instance.save() + + params = self.get_params() + self.assertAlmostEqual(params["max_image_mpixels"], 1.0) 
+ + def test_db_settings_ocr_color_convert(self): + with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"): + instance = OcrSettings.objects.all().first() + instance.color_conversion_strategy = ( + OcrSettings.ColorConvertChoices.INDEPENDENT + ) + instance.save() + + params = self.get_params() + self.assertEqual( + params["color_conversion_strategy"], + "UseDeviceIndependentColor", + ) diff --git a/src/setup.cfg b/src/setup.cfg index 861ae452a..dc5e9e33a 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -18,6 +18,7 @@ omit = exclude_also = if settings.AUDIT_LOG_ENABLED: if AUDIT_LOG_ENABLED: + if TYPE_CHECKING: [mypy] plugins = mypy_django_plugin.main, mypy_drf_plugin.main, numpy.typing.mypy_plugin