From 74e845974c2ff8d96a432bc60b4966694e2b7b95 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Tue, 19 Dec 2023 10:21:51 -0800 Subject: [PATCH] Updates to use a single configuration object for all settings --- .../management/commands/document_exporter.py | 9 +- src/documents/parsers.py | 2 +- .../tests/test_management_exporter.py | 16 ++-- src/paperless/config.py | 88 +++++++++++++++++ src/paperless/migrations/0001_initial.py | 32 ++----- src/paperless/models.py | 94 ++++++++++++------- src/paperless/serialisers.py | 13 +-- src/paperless/urls.py | 6 +- src/paperless/views.py | 23 ++--- src/paperless_tesseract/parsers.py | 82 +++++++--------- src/paperless_tesseract/setting_schema.py | 72 -------------- .../tests/test_parser_custom_settings.py | 39 ++++---- src/paperless_tika/parsers.py | 15 ++- 13 files changed, 242 insertions(+), 249 deletions(-) create mode 100644 src/paperless/config.py delete mode 100644 src/paperless_tesseract/setting_schema.py diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 25b58b3c9..bd5e322e3 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -41,8 +41,7 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME from documents.utils import copy_file_with_basic_stats from paperless import version from paperless.db import GnuPG -from paperless.models import CommonSettings -from paperless.models import OcrSettings +from paperless.models import ApplicationConfiguration from paperless_mail.models import MailAccount from paperless_mail.models import MailRule @@ -294,11 +293,7 @@ class Command(BaseCommand): ) manifest += json.loads( - serializers.serialize("json", CommonSettings.objects.all()), - ) - - manifest += json.loads( - serializers.serialize("json", OcrSettings.objects.all()), + serializers.serialize("json", 
ApplicationConfiguration.objects.all()), ) # These are treated specially and included in the per-document manifest diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 0989b0792..cb28c4298 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -320,7 +320,7 @@ class DocumentParser(LoggingMixin): def __init__(self, logging_group, progress_callback=None): super().__init__() self.logging_group = logging_group - self.parser_settings = self.get_settings() + self.settings = self.get_settings() os.makedirs(settings.SCRATCH_DIR, exist_ok=True) self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 99d141e46..898dfbc53 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): manifest = self._do_export(use_filename_format=use_filename_format) - self.assertEqual(len(manifest), 184) + self.assertEqual(len(manifest), 178) # dont include consumer or AnonymousUser users self.assertEqual( @@ -262,7 +262,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec") self.assertEqual(GroupObjectPermission.objects.count(), 1) self.assertEqual(UserObjectPermission.objects.count(), 1) - self.assertEqual(Permission.objects.count(), 132) + self.assertEqual(Permission.objects.count(), 128) messages = check_sanity() # everything is alright after the test self.assertEqual(len(messages), 0) @@ -694,15 +694,15 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): os.path.join(self.dirs.media_dir, "documents"), ) - self.assertEqual(ContentType.objects.count(), 33) - self.assertEqual(Permission.objects.count(), 132) + 
self.assertEqual(ContentType.objects.count(), 32) + self.assertEqual(Permission.objects.count(), 128) manifest = self._do_export() with paperless_environment(): self.assertEqual( len(list(filter(lambda e: e["model"] == "auth.permission", manifest))), - 132, + 128, ) # add 1 more to db to show objects are not re-created by import Permission.objects.create( @@ -710,7 +710,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): codename="test_perm", content_type_id=1, ) - self.assertEqual(Permission.objects.count(), 133) + self.assertEqual(Permission.objects.count(), 129) # will cause an import error self.user.delete() @@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): with self.assertRaises(IntegrityError): call_command("document_importer", "--no-progress-bar", self.target) - self.assertEqual(ContentType.objects.count(), 33) - self.assertEqual(Permission.objects.count(), 133) + self.assertEqual(ContentType.objects.count(), 32) + self.assertEqual(Permission.objects.count(), 129) diff --git a/src/paperless/config.py b/src/paperless/config.py new file mode 100644 index 000000000..cd55cf420 --- /dev/null +++ b/src/paperless/config.py @@ -0,0 +1,88 @@ +import dataclasses +import json +from typing import Optional + +from django.conf import settings + +from paperless.models import ApplicationConfiguration + + +@dataclasses.dataclass +class OutputTypeConfig: + """ + Almost all parsers care about the chosen PDF output format + """ + + output_type: str = dataclasses.field(init=False) + + @staticmethod + def _get_config_instance() -> ApplicationConfiguration: + app_config = ApplicationConfiguration.objects.all().first() + # Workaround for a test where the migration hasn't run to create the single model + if app_config is None: + ApplicationConfiguration.objects.create() + app_config = ApplicationConfiguration.objects.all().first() + return app_config + + def __post_init__(self) -> None: + app_config = 
self._get_config_instance() + + self.output_type = app_config.output_type or settings.OCR_OUTPUT_TYPE + + +@dataclasses.dataclass +class OcrConfig(OutputTypeConfig): + """ + Specific settings for the Tesseract based parser. Options generally + correspond almost directly to the OCRMyPDF options + """ + + pages: Optional[int] = dataclasses.field(init=False) + language: str = dataclasses.field(init=False) + mode: str = dataclasses.field(init=False) + skip_archive_file: str = dataclasses.field(init=False) + image_dpi: Optional[int] = dataclasses.field(init=False) + clean: str = dataclasses.field(init=False) + deskew: bool = dataclasses.field(init=False) + rotate: bool = dataclasses.field(init=False) + rotate_threshold: float = dataclasses.field(init=False) + max_image_pixel: Optional[float] = dataclasses.field(init=False) + color_conversion_strategy: str = dataclasses.field(init=False) + user_args: Optional[dict[str, str]] = dataclasses.field(init=False) + + def __post_init__(self) -> None: + super().__post_init__() + + app_config = self._get_config_instance() + + self.pages = app_config.pages or settings.OCR_PAGES + self.language = app_config.language or settings.OCR_LANGUAGE + self.mode = app_config.mode or settings.OCR_MODE + self.skip_archive_file = ( + app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE + ) + self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI + self.clean = app_config.unpaper_clean or settings.OCR_CLEAN + self.deskew = app_config.deskew or settings.OCR_DESKEW + self.rotate = app_config.rotate_pages or settings.OCR_ROTATE_PAGES + self.rotate_threshold = ( + app_config.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD + ) + self.max_image_pixel = ( + app_config.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS + ) + self.color_conversion_strategy = ( + app_config.color_conversion_strategy + or settings.OCR_COLOR_CONVERSION_STRATEGY + ) + + user_args = None + if app_config.user_args: + user_args = 
app_config.user_args + elif settings.OCR_USER_ARGS is not None: + try: + user_args = json.loads(settings.OCR_USER_ARGS) + except json.JSONDecodeError: + user_args = {} + + self.user_args = user_args diff --git a/src/paperless/migrations/0001_initial.py b/src/paperless/migrations/0001_initial.py index 710eb2798..96f06f6ca 100644 --- a/src/paperless/migrations/0001_initial.py +++ b/src/paperless/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.7 on 2023-12-14 17:12 +# Generated by Django 4.2.7 on 2023-12-19 17:51 import django.core.validators from django.db import migrations @@ -7,11 +7,10 @@ from django.db import models def _create_singleton(apps, schema_editor): """ - Creates the first and only instance of the settings models + Creates the first and only instance of the configuration model """ - for model_name in ["CommonSettings", "OcrSettings"]: - settings_model = apps.get_model("paperless", model_name) - settings_model.objects.create() + settings_model = apps.get_model("paperless", "ApplicationConfiguration") + settings_model.objects.create() class Migration(migrations.Migration): @@ -21,7 +20,7 @@ class Migration(migrations.Migration): operations = [ migrations.CreateModel( - name="CommonSettings", + name="ApplicationConfiguration", fields=[ ( "id", @@ -48,23 +47,6 @@ class Migration(migrations.Migration): verbose_name="Sets the output PDF type", ), ), - ], - options={ - "abstract": False, - }, - ), - migrations.CreateModel( - name="OcrSettings", - fields=[ - ( - "id", - models.AutoField( - auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), ( "pages", models.PositiveIntegerField( @@ -88,9 +70,9 @@ class Migration(migrations.Migration): blank=True, choices=[ ("skip", "skip"), - ("skip_noarchive", "skip_noarchive"), ("redo", "redo"), ("force", "force"), + ("skip_noarchive", "skip_noarchive"), ], max_length=16, null=True, @@ -186,7 +168,7 @@ class Migration(migrations.Migration): ), ], options={ - 
"verbose_name": "ocr settings", + "verbose_name": "paperless application settings", }, ), migrations.RunPython(_create_singleton, migrations.RunPython.noop), diff --git a/src/paperless/models.py b/src/paperless/models.py index 672177f6f..1edb460f9 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -17,18 +17,67 @@ class AbstractSingletonModel(models.Model): super().save(*args, **kwargs) -class CommonSettings(AbstractSingletonModel): +class OutputTypeChoices(models.TextChoices): + """ + Matches to --output-type + """ + + PDF = ("pdf", _("pdf")) + PDF_A = ("pdfa", _("pdfa")) + PDF_A1 = ("pdfa-1", _("pdfa-1")) + PDF_A2 = ("pdfa-2", _("pdfa-2")) + PDF_A3 = ("pdfa-3", _("pdfa-3")) + + +class ModeChoices(models.TextChoices): + """ + Matches to --skip-text, --redo-ocr, --force-ocr + and our own custom setting + """ + + SKIP = ("skip", _("skip")) + REDO = ("redo", _("redo")) + FORCE = ("force", _("force")) + SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive")) + + +class ArchiveFileChoices(models.TextChoices): + """ + Settings to control creation of an archive PDF file + """ + + NEVER = ("never", _("never")) + WITH_TEXT = ("with_text", _("with_text")) + ALWAYS = ("always", _("always")) + + +class CleanChoices(models.TextChoices): + """ + Matches to --clean, --clean-final + """ + + CLEAN = ("clean", _("clean")) + FINAL = ("clean-final", _("clean-final")) + NONE = ("none", _("none")) + + +class ColorConvertChoices(models.TextChoices): + """ + Refer to the Ghostscript documentation for valid options + """ + + UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged")) + RGB = ("RGB", _("RGB")) + INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor")) + GRAY = ("Gray", _("Gray")) + CMYK = ("CMYK", _("CMYK")) + + +class ApplicationConfiguration(AbstractSingletonModel): """ Settings which are common across more than 1 parser """ - class OutputTypeChoices(models.TextChoices): - PDF = ("pdf", _("pdf")) - PDF_A = ("pdfa", _("pdfa")) - 
PDF_A1 = ("pdfa-1", _("pdfa-1")) - PDF_A2 = ("pdfa-2", _("pdfa-2")) - PDF_A3 = ("pdfa-3", _("pdfa-3")) - output_type = models.CharField( verbose_name=_("Sets the output PDF type"), null=True, @@ -37,35 +86,10 @@ class CommonSettings(AbstractSingletonModel): choices=OutputTypeChoices.choices, ) - -class OcrSettings(AbstractSingletonModel): """ Settings for the Tesseract based OCR parser """ - class ModeChoices(models.TextChoices): - SKIP = ("skip", _("skip")) - SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive")) - REDO = ("redo", _("redo")) - FORCE = ("force", _("force")) - - class ArchiveFileChoices(models.TextChoices): - NEVER = ("never", _("never")) - WITH_TEXT = ("with_text", _("with_text")) - ALWAYS = ("always", _("always")) - - class CleanChoices(models.TextChoices): - CLEAN = ("clean", _("clean")) - FINAL = ("clean-final", _("clean-final")) - NONE = ("none", _("none")) - - class ColorConvertChoices(models.TextChoices): - UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged")) - RGB = ("RGB", _("RGB")) - INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor")) - GRAY = ("Gray", _("Gray")) - CMYK = ("CMYK", _("CMYK")) - pages = models.PositiveIntegerField( verbose_name=_("Do OCR from page 1 to this value"), null=True, @@ -142,7 +166,7 @@ class OcrSettings(AbstractSingletonModel): ) class Meta: - verbose_name = _("ocr settings") + verbose_name = _("paperless application settings") def __str__(self) -> str: - return "OcrSettings" + return "ApplicationConfiguration" diff --git a/src/paperless/serialisers.py b/src/paperless/serialisers.py index 4185e3bed..bf649df05 100644 --- a/src/paperless/serialisers.py +++ b/src/paperless/serialisers.py @@ -3,8 +3,7 @@ from django.contrib.auth.models import Permission from django.contrib.auth.models import User from rest_framework import serializers -from paperless.models import CommonSettings -from paperless.models import OcrSettings +from paperless.models import ApplicationConfiguration class 
ObfuscatedUserPasswordField(serializers.Field): @@ -118,13 +117,7 @@ class ProfileSerializer(serializers.ModelSerializer): ) -class CommonSettingsSerializer(serializers.ModelSerializer): +class ApplicationConfigurationSerializer(serializers.ModelSerializer): class Meta: - model = CommonSettings - fields = "__all__" - - -class OcrSettingsSerializer(serializers.ModelSerializer): - class Meta: - model = OcrSettings + model = ApplicationConfiguration fields = "__all__" diff --git a/src/paperless/urls.py b/src/paperless/urls.py index 35e2bd989..79602db00 100644 --- a/src/paperless/urls.py +++ b/src/paperless/urls.py @@ -34,11 +34,10 @@ from documents.views import TasksViewSet from documents.views import UiSettingsView from documents.views import UnifiedSearchViewSet from paperless.consumers import StatusConsumer -from paperless.views import CommonSettingsViewSet +from paperless.views import ApplicationConfigurationViewSet from paperless.views import FaviconView from paperless.views import GenerateAuthTokenView from paperless.views import GroupViewSet -from paperless.views import OcrSettingsViewSet from paperless.views import ProfileView from paperless.views import UserViewSet from paperless_mail.views import MailAccountTestView @@ -61,8 +60,7 @@ api_router.register(r"mail_rules", MailRuleViewSet) api_router.register(r"share_links", ShareLinkViewSet) api_router.register(r"consumption_templates", ConsumptionTemplateViewSet) api_router.register(r"custom_fields", CustomFieldViewSet) -api_router.register(r"common_settings", CommonSettingsViewSet) -api_router.register(r"ocr_settings", OcrSettingsViewSet) +api_router.register(r"config", ApplicationConfigurationViewSet) urlpatterns = [ diff --git a/src/paperless/views.py b/src/paperless/views.py index 4229d3836..da8cddb22 100644 --- a/src/paperless/views.py +++ b/src/paperless/views.py @@ -18,11 +18,9 @@ from rest_framework.viewsets import ModelViewSet from documents.permissions import PaperlessObjectPermissions from 
paperless.filters import GroupFilterSet from paperless.filters import UserFilterSet -from paperless.models import CommonSettings -from paperless.models import OcrSettings -from paperless.serialisers import CommonSettingsSerializer +from paperless.models import ApplicationConfiguration +from paperless.serialisers import ApplicationConfigurationSerializer from paperless.serialisers import GroupSerializer -from paperless.serialisers import OcrSettingsSerializer from paperless.serialisers import ProfileSerializer from paperless.serialisers import UserSerializer @@ -166,19 +164,10 @@ class GenerateAuthTokenView(GenericAPIView): ) -class CommonSettingsViewSet(ModelViewSet): - model = CommonSettings +class ApplicationConfigurationViewSet(ModelViewSet): + model = ApplicationConfiguration - queryset = CommonSettings.objects + queryset = ApplicationConfiguration.objects - serializer_class = CommonSettingsSerializer - permission_classes = (IsAuthenticated,) - - -class OcrSettingsViewSet(ModelViewSet): - model = OcrSettings - - queryset = OcrSettings.objects - - serializer_class = OcrSettingsSerializer + serializer_class = ApplicationConfigurationSerializer permission_classes = (IsAuthenticated,) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index e4e56fb8c..4172a5752 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -12,9 +12,10 @@ from PIL import Image from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf -from paperless.models import OcrSettings as OcrSettingModel -from paperless_tesseract.setting_schema import OcrSetting -from paperless_tesseract.setting_schema import get_ocr_settings +from paperless.config import OcrConfig +from paperless.models import ArchiveFileChoices +from paperless.models import CleanChoices +from paperless.models import ModeChoices class NoTextFoundException(Exception): @@ -33,8 +34,8 
@@ class RasterisedDocumentParser(DocumentParser): logging_name = "paperless.parsing.tesseract" - def get_settings(self) -> OcrSetting: - return get_ocr_settings() + def get_settings(self) -> OcrConfig: + return OcrConfig() def extract_metadata(self, document_path, mime_type): result = [] @@ -129,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser): if ( sidecar_file is not None and os.path.isfile(sidecar_file) - and self.parser_settings.mode != "redo" + and self.settings.mode != "redo" ): text = self.read_file_handle_unicode_errors(sidecar_file) @@ -185,7 +186,7 @@ class RasterisedDocumentParser(DocumentParser): safe_fallback=False, ): if TYPE_CHECKING: - assert isinstance(self.parser_settings, OcrSetting) + assert isinstance(self.settings, OcrConfig) ocrmypdf_args = { "input_file": input_file, "output_file": output_file, @@ -193,55 +194,47 @@ class RasterisedDocumentParser(DocumentParser): # processes via the task library. "use_threads": True, "jobs": settings.THREADS_PER_WORKER, - "language": self.parser_settings.language, - "output_type": self.parser_settings.output_type, + "language": self.settings.language, + "output_type": self.settings.output_type, "progress_bar": False, } if "pdfa" in ocrmypdf_args["output_type"]: ocrmypdf_args[ "color_conversion_strategy" - ] = self.parser_settings.color_conversion_strategy + ] = self.settings.color_conversion_strategy - if ( - self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE - or safe_fallback - ): + if self.settings.mode == ModeChoices.FORCE or safe_fallback: ocrmypdf_args["force_ocr"] = True - elif self.parser_settings.mode in { - OcrSettingModel.ModeChoices.SKIP, - OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE, + elif self.settings.mode in { + ModeChoices.SKIP, + ModeChoices.SKIP_NO_ARCHIVE, }: ocrmypdf_args["skip_text"] = True - elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO: + elif self.settings.mode == ModeChoices.REDO: ocrmypdf_args["redo_ocr"] = True else: - raise 
ParseError(f"Invalid ocr mode: {self.parser_settings.mode}") + raise ParseError(f"Invalid ocr mode: {self.settings.mode}") - if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN: + if self.settings.clean == CleanChoices.CLEAN: ocrmypdf_args["clean"] = True - elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL: - if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO: + elif self.settings.clean == CleanChoices.FINAL: + if self.settings.mode == ModeChoices.REDO: ocrmypdf_args["clean"] = True else: # --clean-final is not compatible with --redo-ocr ocrmypdf_args["clean_final"] = True - if ( - self.parser_settings.deskew - and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO - ): + if self.settings.deskew and self.settings.mode != ModeChoices.REDO: # --deskew is not compatible with --redo-ocr ocrmypdf_args["deskew"] = True - if self.parser_settings.rotate: + if self.settings.rotate: ocrmypdf_args["rotate_pages"] = True - ocrmypdf_args[ - "rotate_pages_threshold" - ] = self.parser_settings.rotate_threshold + ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold - if self.parser_settings.pages is not None: - ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}" + if self.settings.pages is not None: + ocrmypdf_args["pages"] = f"1-{self.settings.pages}" else: # sidecar is incompatible with pages ocrmypdf_args["sidecar"] = sidecar_file @@ -260,8 +253,8 @@ class RasterisedDocumentParser(DocumentParser): if dpi: self.log.debug(f"Detected DPI for image {input_file}: {dpi}") ocrmypdf_args["image_dpi"] = dpi - elif self.parser_settings.image_dpi is not None: - ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi + elif self.settings.image_dpi is not None: + ocrmypdf_args["image_dpi"] = self.settings.image_dpi elif a4_dpi: ocrmypdf_args["image_dpi"] = a4_dpi else: @@ -275,18 +268,18 @@ class RasterisedDocumentParser(DocumentParser): f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may 
fail", ) - if self.parser_settings.user_args is not None: + if self.settings.user_args is not None: try: - ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args} + ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args} except Exception as e: self.log.warning( f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " f"they will not be used. Error: {e}", ) - if self.parser_settings.max_image_pixel is not None: + if self.settings.max_image_pixel is not None: # Convert pixels to mega-pixels and provide to ocrmypdf - max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0 + max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0 if max_pixels_mpixels > 0: self.log.debug( f"Calculated {max_pixels_mpixels} megapixels for OCR", @@ -318,11 +311,11 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and the user doesn't want an archive, # we're done here skip_archive_for_text = ( - self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE - or self.parser_settings.skip_archive_file + self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE + or self.settings.skip_archive_file in { - OcrSettingModel.ArchiveFileChoices.WITH_TEXT, - OcrSettingModel.ArchiveFileChoices.ALWAYS, + ArchiveFileChoices.WITH_TEXT, + ArchiveFileChoices.ALWAYS, } ) if skip_archive_for_text and original_has_text: @@ -353,10 +346,7 @@ class RasterisedDocumentParser(DocumentParser): self.log.debug(f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - if ( - self.parser_settings.skip_archive_file - != OcrSettingModel.ArchiveFileChoices.ALWAYS - ): + if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/setting_schema.py b/src/paperless_tesseract/setting_schema.py deleted file mode 100644 index 907241ebd..000000000 --- a/src/paperless_tesseract/setting_schema.py +++ /dev/null @@ -1,72 +0,0 
@@ -import dataclasses -import json -from typing import Optional - -from django.conf import settings - -from paperless.models import CommonSettings -from paperless.models import OcrSettings as OcrSettingModel - - -@dataclasses.dataclass(frozen=True) -class OcrSetting: - pages: Optional[int] - language: str - output_type: str - mode: str - skip_archive_file: str - image_dpi: Optional[int] - clean: str - deskew: bool - rotate: bool - rotate_threshold: float - max_image_pixel: Optional[float] - color_conversion_strategy: str - user_args: Optional[dict[str, str]] - - -def get_ocr_settings() -> OcrSetting: - ocr_db_settings = OcrSettingModel.objects.all().first() - # Workaround for a test where the migration hasn't run to create the single model - if ocr_db_settings is None: - OcrSettingModel.objects.create() - ocr_db_settings = OcrSettingModel.objects.all().first() - - cmn_db_settings = CommonSettings.objects.all().first() - if cmn_db_settings is None: - CommonSettings.objects.create() - cmn_db_settings = CommonSettings.objects.all().first() - - user_args = None - if ocr_db_settings.user_args: - user_args = ocr_db_settings.user_args - elif settings.OCR_USER_ARGS is not None: - try: - user_args = json.loads(settings.OCR_USER_ARGS) - except json.JSONDecodeError: - user_args = {} - - return OcrSetting( - pages=ocr_db_settings.pages or settings.OCR_PAGES, - language=ocr_db_settings.language or settings.OCR_LANGUAGE, - output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE, - mode=ocr_db_settings.mode or settings.OCR_MODE, - skip_archive_file=( - ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE - ), - image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI, - clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN, - deskew=ocr_db_settings.deskew or settings.OCR_DESKEW, - rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES, - rotate_threshold=( - ocr_db_settings.rotate_pages_threshold - or 
settings.OCR_ROTATE_PAGES_THRESHOLD - ), - max_image_pixel=ocr_db_settings.max_image_pixels - or settings.OCR_MAX_IMAGE_PIXELS, - color_conversion_strategy=( - ocr_db_settings.color_conversion_strategy - or settings.OCR_COLOR_CONVERSION_STRATEGY - ), - user_args=user_args, - ) diff --git a/src/paperless_tesseract/tests/test_parser_custom_settings.py b/src/paperless_tesseract/tests/test_parser_custom_settings.py index f2c663b14..90f198fa0 100644 --- a/src/paperless_tesseract/tests/test_parser_custom_settings.py +++ b/src/paperless_tesseract/tests/test_parser_custom_settings.py @@ -3,8 +3,11 @@ from django.test import override_settings from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin -from paperless.models import CommonSettings -from paperless.models import OcrSettings +from paperless.models import ApplicationConfiguration +from paperless.models import CleanChoices +from paperless.models import ColorConvertChoices +from paperless.models import ModeChoices +from paperless.models import OutputTypeChoices from paperless_tesseract.parsers import RasterisedDocumentParser @@ -21,7 +24,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_pages(self): with override_settings(OCR_PAGES=10): - instance = OcrSettings.objects.all().first() + instance = ApplicationConfiguration.objects.all().first() instance.pages = 5 instance.save() @@ -30,7 +33,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_language(self): with override_settings(OCR_LANGUAGE="eng+deu"): - instance = OcrSettings.objects.all().first() + instance = ApplicationConfiguration.objects.all().first() instance.language = "fra+ita" instance.save() @@ -39,8 +42,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_output_type(self): with override_settings(OCR_LANGUAGE="pdfa-3"): - instance 
= OcrSettings.objects.all().first() - instance.output_type = CommonSettings.OutputTypeChoices.PDF_A + instance = ApplicationConfiguration.objects.all().first() + instance.output_type = OutputTypeChoices.PDF_A instance.save() params = self.get_params() @@ -48,8 +51,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_mode(self): with override_settings(OCR_MODE="redo"): - instance = OcrSettings.objects.all().first() - instance.mode = OcrSettings.ModeChoices.SKIP + instance = ApplicationConfiguration.objects.all().first() + instance.mode = ModeChoices.SKIP instance.save() params = self.get_params() @@ -59,8 +62,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_clean(self): with override_settings(OCR_CLEAN="clean-final"): - instance = OcrSettings.objects.all().first() - instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN + instance = ApplicationConfiguration.objects.all().first() + instance.unpaper_clean = CleanChoices.CLEAN instance.save() params = self.get_params() @@ -68,8 +71,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas self.assertNotIn("clean_final", params) with override_settings(OCR_CLEAN="clean-final"): - instance = OcrSettings.objects.all().first() - instance.unpaper_clean = OcrSettings.CleanChoices.FINAL + instance = ApplicationConfiguration.objects.all().first() + instance.unpaper_clean = CleanChoices.FINAL instance.save() params = self.get_params() @@ -78,7 +81,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_deskew(self): with override_settings(OCR_DESKEW=False): - instance = OcrSettings.objects.all().first() + instance = ApplicationConfiguration.objects.all().first() instance.deskew = True instance.save() @@ -87,7 +90,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def 
test_db_settings_ocr_rotate(self): with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0): - instance = OcrSettings.objects.all().first() + instance = ApplicationConfiguration.objects.all().first() instance.rotate_pages = True instance.rotate_pages_threshold = 15.0 instance.save() @@ -98,7 +101,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_max_pixels(self): with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0): - instance = OcrSettings.objects.all().first() + instance = ApplicationConfiguration.objects.all().first() instance.max_image_pixels = 1_000_000.0 instance.save() @@ -107,10 +110,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas def test_db_settings_ocr_color_convert(self): with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"): - instance = OcrSettings.objects.all().first() - instance.color_conversion_strategy = ( - OcrSettings.ColorConvertChoices.INDEPENDENT - ) + instance = ApplicationConfiguration.objects.all().first() + instance.color_conversion_strategy = ColorConvertChoices.INDEPENDENT instance.save() params = self.get_params() diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index ed9996039..4c07b5de3 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -10,6 +10,8 @@ from tika_client import TikaClient from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf +from paperless.config import OutputTypeConfig +from paperless.models import OutputTypeChoices class TikaDocumentParser(DocumentParser): @@ -91,11 +93,14 @@ class TikaDocumentParser(DocumentParser): timeout=settings.CELERY_TASK_TIME_LIMIT, ) as client, client.libre_office.to_pdf() as route: # Set the output format of the resulting PDF - if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}: + if 
settings.OCR_OUTPUT_TYPE in { + OutputTypeChoices.PDF_A, + OutputTypeChoices.PDF_A2, + }: route.pdf_format(PdfAFormat.A2b) - elif settings.OCR_OUTPUT_TYPE == "pdfa-1": + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: route.pdf_format(PdfAFormat.A1a) - elif settings.OCR_OUTPUT_TYPE == "pdfa-3": + elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3: route.pdf_format(PdfAFormat.A3b) route.convert(document_path) @@ -112,8 +117,8 @@ class TikaDocumentParser(DocumentParser): f"Error while converting document to PDF: {err}", ) from err - def get_settings(self): + def get_settings(self) -> OutputTypeConfig: """ This parser does not implement additional settings yet """ - return None + return OutputTypeConfig()