More fixes and work

2023-12-14 14:41:27 -08:00 · 2023-12-14 14:41:27 -08:00 · 25cc7ada6b
commit 25cc7ada6b
parent a7753b1f89
11 changed files with 193 additions and 142 deletions
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@ -41,6 +41,8 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME
 from documents.utils import copy_file_with_basic_stats
 from paperless import version
 from paperless.db import GnuPG
+from paperless.models import CommonSettings
+from paperless.models import OcrSettings
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule

@ -291,6 +293,14 @@ class Command(BaseCommand):
                serializers.serialize("json", CustomField.objects.all()),
            )

+            manifest += json.loads(
+                serializers.serialize("json", CommonSettings.objects.all()),
+            )
+
+            manifest += json.loads(
+                serializers.serialize("json", OcrSettings.objects.all()),
+            )
+
            # These are treated specially and included in the per-document manifest
            # if that setting is enabled.  Otherwise, they are just exported to the bulk
            # manifest
--- a/src/paperless/migrations/0001_initial.py
+++ b/src/paperless/migrations/0001_initial.py
@ -9,7 +9,7 @@ def _create_singleton(apps, schema_editor):
    """
    Creates the first and only instance of the settings models
    """
-    for model_name in ["CommonSettings", "OcrSettings", "TextSettings", "TikaSettings"]:
+    for model_name in ["CommonSettings", "OcrSettings"]:
        settings_model = apps.get_model("paperless", model_name)
        settings_model.objects.create()

@ -189,60 +189,5 @@ class Migration(migrations.Migration):
                "verbose_name": "ocr settings",
            },
        ),
-        migrations.CreateModel(
-            name="TextSettings",
-            fields=[
-                (
-                    "id",
-                    models.AutoField(
-                        auto_created=True,
-                        primary_key=True,
-                        serialize=False,
-                        verbose_name="ID",
-                    ),
-                ),
-                (
-                    "thumbnail_font_name",
-                    models.CharField(
-                        blank=True,
-                        max_length=64,
-                        null=True,
-                        verbose_name="Sets the output PDF type",
-                    ),
-                ),
-            ],
-            options={
-                "abstract": False,
-            },
-        ),
-        migrations.CreateModel(
-            name="TikaSettings",
-            fields=[
-                (
-                    "id",
-                    models.AutoField(
-                        auto_created=True,
-                        primary_key=True,
-                        serialize=False,
-                        verbose_name="ID",
-                    ),
-                ),
-                (
-                    "tika_url",
-                    models.URLField(blank=True, null=True, verbose_name="Tika URL"),
-                ),
-                (
-                    "gotenberg_url",
-                    models.URLField(
-                        blank=True,
-                        null=True,
-                        verbose_name="Gotenberg URL",
-                    ),
-                ),
-            ],
-            options={
-                "abstract": False,
-            },
-        ),
        migrations.RunPython(_create_singleton, migrations.RunPython.noop),
    ]
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@ -146,40 +146,3 @@ class OcrSettings(AbstractSingletonModel):

    def __str__(self) -> str:
        return "OcrSettings"
-
-
-class TextSettings(AbstractSingletonModel):
-    """
-    Settings for the text parser
-    """
-
-    thumbnail_font_name = models.CharField(
-        verbose_name=_("Sets the output PDF type"),
-        null=True,
-        blank=True,
-        max_length=64,
-    )
-
-
-class TikaSettings(AbstractSingletonModel):
-    """
-    Settings for the Tika parser
-    """
-
-    tika_url = models.URLField(
-        verbose_name=_("Tika URL"),
-        null=True,
-        blank=True,
-    )
-    gotenberg_url = models.URLField(
-        verbose_name=_("Gotenberg URL"),
-        null=True,
-        blank=True,
-    )
-
-
-class ConsumerSettings(AbstractSingletonModel):
-    delete_duplicates = models.BooleanField(
-        verbose_name=_("Delete duplicate consumer files"),
-        null=True,
-    )
--- a/src/paperless/serialisers.py
+++ b/src/paperless/serialisers.py
@ -3,6 +3,7 @@ from django.contrib.auth.models import Permission
 from django.contrib.auth.models import User
 from rest_framework import serializers

+from paperless.models import CommonSettings
 from paperless.models import OcrSettings


@ -117,6 +118,12 @@ class ProfileSerializer(serializers.ModelSerializer):
        )


+class CommonSettingsSerializer(serializers.ModelSerializer):
+    """
+    Serializes every field of the CommonSettings model
+    """
+
+    class Meta:
+        model = CommonSettings
+        # "__all__" is the DRF sentinel for "include every model field";
+        # a list such as ["all"] is looked up as a field literally named "all"
+        fields = "__all__"
+
+
 class OcrSettingsSerializer(serializers.ModelSerializer):
    class Meta:
        model = OcrSettings
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@ -34,9 +34,11 @@ from documents.views import TasksViewSet
 from documents.views import UiSettingsView
 from documents.views import UnifiedSearchViewSet
 from paperless.consumers import StatusConsumer
+from paperless.views import CommonSettingsViewSet
 from paperless.views import FaviconView
 from paperless.views import GenerateAuthTokenView
 from paperless.views import GroupViewSet
+from paperless.views import OcrSettingsViewSet
 from paperless.views import ProfileView
 from paperless.views import UserViewSet
 from paperless_mail.views import MailAccountTestView
@ -59,6 +61,8 @@ api_router.register(r"mail_rules", MailRuleViewSet)
 api_router.register(r"share_links", ShareLinkViewSet)
 api_router.register(r"consumption_templates", ConsumptionTemplateViewSet)
 api_router.register(r"custom_fields", CustomFieldViewSet)
+api_router.register(r"common_settings", CommonSettingsViewSet)
+api_router.register(r"ocr_settings", OcrSettingsViewSet)


 urlpatterns = [
--- a/src/paperless/views.py
+++ b/src/paperless/views.py
@ -18,7 +18,9 @@ from rest_framework.viewsets import ModelViewSet
 from documents.permissions import PaperlessObjectPermissions
 from paperless.filters import GroupFilterSet
 from paperless.filters import UserFilterSet
+from paperless.models import CommonSettings
 from paperless.models import OcrSettings
+from paperless.serialisers import CommonSettingsSerializer
 from paperless.serialisers import GroupSerializer
 from paperless.serialisers import OcrSettingsSerializer
 from paperless.serialisers import ProfileSerializer
@ -164,6 +166,15 @@ class GenerateAuthTokenView(GenericAPIView):
        )


+class CommonSettingsViewSet(ModelViewSet):
+    """ModelViewSet over CommonSettings, restricted to authenticated users."""
+
+    permission_classes = (IsAuthenticated,)
+    serializer_class = CommonSettingsSerializer
+    queryset = CommonSettings.objects
+    model = CommonSettings
+
+
 class OcrSettingsViewSet(ModelViewSet):
    model = OcrSettings

--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@ -3,6 +3,7 @@ import re
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import TYPE_CHECKING
 from typing import Optional

 from django.conf import settings
@ -11,7 +12,7 @@ from PIL import Image
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
-from paperless_tesseract.models import OcrSettings as OcrSettingModel
+from paperless.models import OcrSettings as OcrSettingModel
 from paperless_tesseract.setting_schema import OcrSetting
 from paperless_tesseract.setting_schema import get_ocr_settings

@ -71,7 +72,7 @@ class RasterisedDocumentParser(DocumentParser):
            self.logging_group,
        )

-    def is_image(self, mime_type):
+    def is_image(self, mime_type) -> bool:
        return mime_type in [
            "image/png",
            "image/jpeg",
@ -81,7 +82,7 @@ class RasterisedDocumentParser(DocumentParser):
            "image/webp",
        ]

-    def has_alpha(self, image):
+    def has_alpha(self, image) -> bool:
        with Image.open(image) as im:
            return im.mode in ("RGBA", "LA")

@ -96,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
            ],
        )

-    def get_dpi(self, image):
+    def get_dpi(self, image) -> Optional[int]:
        try:
            with Image.open(image) as im:
                x, y = im.info["dpi"]
@ -105,7 +106,7 @@ class RasterisedDocumentParser(DocumentParser):
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
            return None

-    def calculate_a4_dpi(self, image):
+    def calculate_a4_dpi(self, image) -> Optional[int]:
        try:
            with Image.open(image) as im:
                width, height = im.size
@ -118,7 +119,11 @@ class RasterisedDocumentParser(DocumentParser):
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
            return None

-    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
+    def extract_text(
+        self,
+        sidecar_file: Optional[Path],
+        pdf_file: Path,
+    ) -> Optional[str]:
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
@ -179,7 +184,8 @@ class RasterisedDocumentParser(DocumentParser):
        sidecar_file,
        safe_fallback=False,
    ):
-        assert isinstance(self.parser_settings, OcrSetting)
+        if TYPE_CHECKING:
+            assert isinstance(self.parser_settings, OcrSetting)
        ocrmypdf_args = {
            "input_file": input_file,
            "output_file": output_file,
--- a/src/paperless_tesseract/setting_schema.py
+++ b/src/paperless_tesseract/setting_schema.py
@ -4,7 +4,8 @@ from typing import Optional

 from django.conf import settings

-from paperless_tesseract.models import OcrSettings as OcrSettingModel
+from paperless.models import CommonSettings
+from paperless.models import OcrSettings as OcrSettingModel


@dataclasses.dataclass(frozen=True)
@ -25,16 +26,20 @@ class OcrSetting:


 def get_ocr_settings() -> OcrSetting:
-    db_settings = OcrSettingModel.objects.all().first()
+    ocr_db_settings = OcrSettingModel.objects.all().first()
    # Workaround for a test where the migration hasn't run to create the single model
-    if db_settings is None:
+    if ocr_db_settings is None:
        OcrSettingModel.objects.create()
-        db_settings = OcrSettingModel.objects.all().first()
-    assert db_settings is not None
+        ocr_db_settings = OcrSettingModel.objects.all().first()
+
+    cmn_db_settings = CommonSettings.objects.all().first()
+    if cmn_db_settings is None:
+        CommonSettings.objects.create()
+        cmn_db_settings = CommonSettings.objects.all().first()

    user_args = None
-    if db_settings.user_args:
-        user_args = db_settings.user_args
+    if ocr_db_settings.user_args:
+        user_args = ocr_db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
@ -42,23 +47,25 @@ def get_ocr_settings() -> OcrSetting:
            user_args = {}

    return OcrSetting(
-        pages=db_settings.pages or settings.OCR_PAGES,
-        language=db_settings.language or settings.OCR_LANGUAGE,
-        output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
-        mode=db_settings.mode or settings.OCR_MODE,
+        pages=ocr_db_settings.pages or settings.OCR_PAGES,
+        language=ocr_db_settings.language or settings.OCR_LANGUAGE,
+        output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
+        mode=ocr_db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
-            db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
+            ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
-        image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
-        clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
-        deskew=db_settings.deskew or settings.OCR_DESKEW,
-        rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
+        image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
+        clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
+        deskew=ocr_db_settings.deskew or settings.OCR_DESKEW,
+        rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
        rotate_threshold=(
-            db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD
+            ocr_db_settings.rotate_pages_threshold
+            or settings.OCR_ROTATE_PAGES_THRESHOLD
        ),
-        max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
+        max_image_pixel=ocr_db_settings.max_image_pixels
+        or settings.OCR_MAX_IMAGE_PIXELS,
        color_conversion_strategy=(
-            db_settings.color_conversion_strategy
+            ocr_db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@ -2,7 +2,6 @@ import os
 import shutil
 import tempfile
 import uuid
-from contextlib import AbstractContextManager
 from pathlib import Path
 from unittest import mock

@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin
 from paperless_tesseract.parsers import RasterisedDocumentParser
 from paperless_tesseract.parsers import post_process_text

-image_to_string_calls = []
-
-
-def fake_convert(input_file, output_file, **kwargs):
-    with open(input_file) as f:
-        lines = f.readlines()
-
-    for i, line in enumerate(lines):
-        with open(output_file % i, "w") as f2:
-            f2.write(line.strip())
-
-
-class FakeImageFile(AbstractContextManager):
-    def __init__(self, fname):
-        self.fname = fname
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-    def __enter__(self):
-        return os.path.basename(self.fname)
-

 class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
--- a/src/paperless_tesseract/tests/test_parser_custom_settings.py
+++ b/src/paperless_tesseract/tests/test_parser_custom_settings.py
@ -0,0 +1,120 @@
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless.models import CommonSettings
+from paperless.models import OcrSettings
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    """
+    Checks that values stored in the database settings models win over the
+    matching Django settings when the OCRmyPDF parameters are constructed.
+
+    Each test overrides the Django setting with one value, stores a different
+    value in the database, and asserts the database value is the one used.
+    """
+
+    @staticmethod
+    def get_params():
+        """
+        Returns the OCRmyPDF parameters built by a newly constructed parser
+        """
+        return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
+            input_file="input.pdf",
+            output_file="output.pdf",
+            sidecar_file="sidecar.txt",
+            mime_type="application/pdf",
+            safe_fallback=False,
+        )
+
+    def test_db_settings_ocr_pages(self):
+        """
+        The database page limit (5) is used instead of OCR_PAGES (10)
+        """
+        with override_settings(OCR_PAGES=10):
+            instance = OcrSettings.objects.all().first()
+            instance.pages = 5
+            instance.save()
+
+            params = self.get_params()
+        self.assertEqual(params["pages"], "1-5")
+
+    def test_db_settings_ocr_language(self):
+        """
+        The database language is used instead of OCR_LANGUAGE
+        """
+        with override_settings(OCR_LANGUAGE="eng+deu"):
+            instance = OcrSettings.objects.all().first()
+            instance.language = "fra+ita"
+            instance.save()
+
+            params = self.get_params()
+        self.assertEqual(params["language"], "fra+ita")
+
+    def test_db_settings_ocr_output_type(self):
+        """
+        The database output type is used instead of OCR_OUTPUT_TYPE
+        """
+        # get_ocr_settings reads output_type from CommonSettings, so the value
+        # must be stored there, and the setting being overridden must be
+        # OCR_OUTPUT_TYPE for the assertion to distinguish the two sources
+        with override_settings(OCR_OUTPUT_TYPE="pdfa-3"):
+            instance = CommonSettings.objects.all().first()
+            instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
+            instance.save()
+
+            params = self.get_params()
+        self.assertEqual(params["output_type"], "pdfa")
+
+    def test_db_settings_ocr_mode(self):
+        """
+        The database mode (skip) is used instead of OCR_MODE (redo)
+        """
+        with override_settings(OCR_MODE="redo"):
+            instance = OcrSettings.objects.all().first()
+            instance.mode = OcrSettings.ModeChoices.SKIP
+            instance.save()
+
+            params = self.get_params()
+        self.assertTrue(params["skip_text"])
+        self.assertNotIn("redo_ocr", params)
+        self.assertNotIn("force_ocr", params)
+
+    def test_db_settings_ocr_clean(self):
+        """
+        The database clean choice is used instead of OCR_CLEAN, in both
+        directions
+        """
+        with override_settings(OCR_CLEAN="clean-final"):
+            instance = OcrSettings.objects.all().first()
+            instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
+            instance.save()
+
+            params = self.get_params()
+        self.assertTrue(params["clean"])
+        self.assertNotIn("clean_final", params)
+
+        # The override must differ from the database value, otherwise the
+        # assertions below would pass even if the database were ignored
+        with override_settings(OCR_CLEAN="clean"):
+            instance = OcrSettings.objects.all().first()
+            instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
+            instance.save()
+
+            params = self.get_params()
+        self.assertTrue(params["clean_final"])
+        self.assertNotIn("clean", params)
+
+    def test_db_settings_ocr_deskew(self):
+        """
+        The database deskew flag (True) is used instead of OCR_DESKEW (False)
+        """
+        with override_settings(OCR_DESKEW=False):
+            instance = OcrSettings.objects.all().first()
+            instance.deskew = True
+            instance.save()
+
+            params = self.get_params()
+        self.assertTrue(params["deskew"])
+
+    def test_db_settings_ocr_rotate(self):
+        """
+        The database rotation flag and threshold are used instead of
+        OCR_ROTATE_PAGES and OCR_ROTATE_PAGES_THRESHOLD
+        """
+        with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
+            instance = OcrSettings.objects.all().first()
+            instance.rotate_pages = True
+            instance.rotate_pages_threshold = 15.0
+            instance.save()
+
+            params = self.get_params()
+        self.assertTrue(params["rotate_pages"])
+        self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)
+
+    def test_db_settings_ocr_max_pixels(self):
+        """
+        The database pixel limit is used instead of OCR_MAX_IMAGE_PIXELS
+        """
+        with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
+            instance = OcrSettings.objects.all().first()
+            instance.max_image_pixels = 1_000_000.0
+            instance.save()
+
+            params = self.get_params()
+        # 1_000_000 stored pixels is expected to surface as 1.0 megapixels
+        self.assertAlmostEqual(params["max_image_mpixels"], 1.0)
+
+    def test_db_settings_ocr_color_convert(self):
+        """
+        The database color conversion strategy is used instead of
+        OCR_COLOR_CONVERSION_STRATEGY
+        """
+        with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
+            instance = OcrSettings.objects.all().first()
+            instance.color_conversion_strategy = (
+                OcrSettings.ColorConvertChoices.INDEPENDENT
+            )
+            instance.save()
+
+            params = self.get_params()
+        self.assertEqual(
+            params["color_conversion_strategy"],
+            "UseDeviceIndependentColor",
+        )
--- a/src/setup.cfg
+++ b/src/setup.cfg
@ -18,6 +18,7 @@ omit =
 exclude_also =
    if settings.AUDIT_LOG_ENABLED:
    if AUDIT_LOG_ENABLED:
+    if TYPE_CHECKING:

 [mypy]
 plugins = mypy_django_plugin.main, mypy_drf_plugin.main, numpy.typing.mypy_plugin