diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 6a0d1ec02..5d6fe7f65 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -420,7 +420,7 @@ class Consumer(LoggingMixin): document_parser: DocumentParser = parser_class( self.logging_group, - progress_callback, + progress_callback=progress_callback, ) self.log.debug(f"Parser: {type(document_parser).__name__}") diff --git a/src/documents/parsers.py b/src/documents/parsers.py index aa2645e07..89fafdb82 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -125,8 +125,10 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar if not options: return None + best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0] + # Return the parser with the highest weight. - return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"] + return best_parser["parser"] def run_convert( @@ -318,6 +320,7 @@ class DocumentParser(LoggingMixin): def __init__(self, logging_group, progress_callback=None): super().__init__() self.logging_group = logging_group + self.parser_settings = self.get_settings() os.makedirs(settings.SCRATCH_DIR, exist_ok=True) self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR) @@ -330,6 +333,10 @@ class DocumentParser(LoggingMixin): if self.progress_callback: self.progress_callback(current_progress, max_progress) + def get_settings(self): + # return None + raise NotImplementedError + def read_file_handle_unicode_errors(self, filepath: Path) -> str: """ Helper utility for reading from a file, and handling a problem with its diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 30986aaa0..0c4831932 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -50,6 +50,12 @@ def __get_boolean(key: str, default: str = "NO") -> bool: return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true")) +def __get_optional_boolean(key: str) 
-> Optional[bool]: + if key in os.environ: + return __get_boolean(key) + return None + + def __get_int(key: str, default: int) -> int: """ Return an integer value based on the environment variable or a default @@ -57,6 +63,12 @@ def __get_int(key: str, default: int) -> int: return int(os.getenv(key, default)) +def __get_optional_int(key: str) -> Optional[int]: + if key in os.environ: + return __get_int(key, -1) + return None + + def __get_float(key: str, default: float) -> float: """ Return an integer value based on the environment variable or a default @@ -64,6 +76,12 @@ def __get_float(key: str, default: float) -> float: return float(os.getenv(key, default)) +def __get_optional_float(key: str) -> Optional[float]: + if key in os.environ: + return __get_float(key, -1) + return None + + def __get_path( key: str, default: Optional[Union[PathLike, str]] = None, @@ -796,11 +814,10 @@ CONSUMER_BARCODE_STRING: Final[str] = os.getenv( "PATCHT", ) -consumer_barcode_scanner_tmp: Final[str] = os.getenv( +CONSUMER_BARCODE_SCANNER: Final[str] = os.getenv( "PAPERLESS_CONSUMER_BARCODE_SCANNER", "PYZBAR", -) -CONSUMER_BARCODE_SCANNER = consumer_barcode_scanner_tmp.upper() +).upper() CONSUMER_ENABLE_ASN_BARCODE: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE", @@ -811,15 +828,12 @@ CONSUMER_ASN_BARCODE_PREFIX: Final[str] = os.getenv( "ASN", ) - -CONSUMER_BARCODE_UPSCALE: Final[float] = float( - os.getenv("PAPERLESS_CONSUMER_BARCODE_UPSCALE", 0.0), +CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float( + "PAPERLESS_CONSUMER_BARCODE_UPSCALE", + 0.0, ) - -CONSUMER_BARCODE_DPI: Final[str] = int( - os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300), -) +CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300) CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean( "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED", @@ -834,7 +848,7 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean( 
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT", ) -OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0)) +OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES") # The default language that tesseract will attempt to use when parsing # documents. It should be a 3-letter language code consistent with ISO 639. @@ -848,28 +862,29 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never") -OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") +OCR_IMAGE_DPI = __get_optional_int("PAPERLESS_OCR_IMAGE_DPI") OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") -OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW", "true") +OCR_DESKEW: Final[bool] = __get_boolean("PAPERLESS_OCR_DESKEW", "true") -OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true") +OCR_ROTATE_PAGES: Final[bool] = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true") -OCR_ROTATE_PAGES_THRESHOLD = float( - os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0), +OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float( + "PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", + 12.0, ) -OCR_MAX_IMAGE_PIXELS: Optional[int] = None -if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None: - OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS")) +OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int( + "PAPERLESS_OCR_MAX_IMAGE_PIXELS", +) OCR_COLOR_CONVERSION_STRATEGY = os.getenv( "PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY", "RGB", ) -OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") +OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS") # GNUPG needs a home directory for some reason GNUPG_HOME = os.getenv("HOME", "/tmp") diff --git a/src/paperless_tesseract/migrations/0001_initial.py b/src/paperless_tesseract/migrations/0001_initial.py new file mode 100644 index 000000000..9cd546ea0 --- /dev/null +++ b/src/paperless_tesseract/migrations/0001_initial.py @@ -0,0 +1,132 @@ +# Generated by 
Django 4.2.7 on 2023-12-07 22:52 + +import django.core.validators +from django.db import migrations +from django.db import models + + +def _create_singleton(apps, schema_editor): + settings_model = apps.get_model("paperless_tesseract", "OcrSettings") + settings_model.objects.create() + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="OcrSettings", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("pages", models.PositiveIntegerField(blank=True, null=True)), + ("language", models.CharField(blank=True, max_length=32, null=True)), + ( + "output_type", + models.CharField( + blank=True, + choices=[ + ("pdf", "pdf"), + ("pdfa", "pdfa"), + ("pdfa-1", "pdfa-1"), + ("pdfa-2", "pdfa-2"), + ("pdfa-3", "pdfa-3"), + ], + max_length=8, + null=True, + ), + ), + ( + "mode", + models.CharField( + blank=True, + choices=[ + ("skip", "skip"), + ("skip_noarchive", "skip_noarchive"), + ("redo", "redo"), + ("force", "force"), + ], + max_length=8, + null=True, + ), + ), + ( + "skip_archive_file", + models.CharField( + blank=True, + choices=[ + ("never", "never"), + ("with_text", "with_text"), + ("always", "always"), + ], + max_length=16, + null=True, + ), + ), + ("image_dpi", models.PositiveIntegerField(null=True)), + ( + "unpaper_clean", + models.CharField( + blank=True, + choices=[ + ("clean", "clean"), + ("clean-final", "clean-final"), + ("none", "none"), + ], + max_length=16, + null=True, + ), + ), + ("deskew", models.BooleanField(null=True)), + ("rotate_pages", models.BooleanField(null=True)), + ( + "rotate_pages_threshold", + models.FloatField( + null=True, + validators=[django.core.validators.MinValueValidator(0.0)], + ), + ), + ( + "max_image_pixels", + models.FloatField( + null=True, + validators=[ + django.core.validators.MinValueValidator(1000000.0), + ], + ), + ), + ( + "color_conversion_strategy", + 
models.CharField( + blank=True, + choices=[ + ("LeaveColorUnchanged", "LeaveColorUnchanged"), + ("RGB", "RGB"), + ("UseDeviceIndependentColor", "UseDeviceIndependentColor"), + ("Gray", "Gray"), + ("CMYK", "CMYK"), + ], + max_length=32, + null=True, + ), + ), + ("user_args", models.JSONField(blank=True, null=True)), + ], + options={ + "verbose_name": "ocr settings", + }, + ), + migrations.RunPython( + code=_create_singleton, + reverse_code=migrations.RunPython.noop, + ), + ] diff --git a/src/paperless_tesseract/models.py b/src/paperless_tesseract/models.py index 8ec29aee3..f43930635 100644 --- a/src/paperless_tesseract/models.py +++ b/src/paperless_tesseract/models.py @@ -1,7 +1,10 @@ +from django.core.exceptions import ValidationError from django.core.validators import MinValueValidator from django.db import models from django.utils.translation import gettext_lazy as _ +DEFAULT_SINGLETON_INSTANCE_ID = 1 + class OcrSettings(models.Model): class OutputTypeChoices(models.TextChoices): @@ -12,49 +15,100 @@ class OcrSettings(models.Model): PDF_A3 = ("pdfa-3", _("pdfa-3")) class ModeChoices(models.TextChoices): - SKIP = ("skip", _("pdf")) - REDO = ("redo", _("pdfa")) - FORCE = ("force", _("pdfa-1")) + SKIP = ("skip", _("skip")) + SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive")) + REDO = ("redo", _("redo")) + FORCE = ("force", _("force")) class ArchiveFileChoices(models.TextChoices): - NEVER = ("never", _("pdf")) - WITH_TEXT = ("with_text", _("pdfa")) - ALWAYS = ("always", _("pdfa-1")) + NEVER = ("never", _("never")) + WITH_TEXT = ("with_text", _("with_text")) + ALWAYS = ("always", _("always")) + + class CleanChoices(models.TextChoices): + CLEAN = ("clean", _("clean")) + FINAL = ("clean-final", _("clean-final")) + NONE = ("none", _("none")) + + class ColorConvertChoices(models.TextChoices): + UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged")) + RGB = ("RGB", _("RGB")) + INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor")) + GRAY 
= ("Gray", _("Gray")) + CMYK = ("CMYK", _("CMYK")) + + pages = models.PositiveIntegerField(null=True, blank=True) - pages = models.PositiveIntegerField(null=True) language = models.CharField(null=True, blank=True, max_length=32) + output_type = models.CharField( - max_length=10, + null=True, + blank=True, + max_length=8, choices=OutputTypeChoices.choices, - default=OutputTypeChoices.PDF_A, ) + mode = models.CharField( - max_length=50, + null=True, + blank=True, + max_length=8, choices=ModeChoices.choices, - default=ModeChoices.SKIP, ) + skip_archive_file = models.CharField( - max_length=50, + null=True, + blank=True, + max_length=16, choices=ArchiveFileChoices.choices, - default=ArchiveFileChoices.NEVER, ) + image_dpi = models.PositiveIntegerField(null=True) - clean = models.CharField(null=True, blank=True) - deskew = models.BooleanField(default=True) - rotate_pages = models.BooleanField(default=True) + + unpaper_clean = models.CharField( + null=True, + blank=True, + max_length=16, + choices=CleanChoices.choices, + ) + + deskew = models.BooleanField(null=True) + + rotate_pages = models.BooleanField(null=True) + rotate_pages_threshold = models.FloatField( - default=12.0, + null=True, validators=[MinValueValidator(0.0)], ) - max_image_pixel = models.PositiveBigIntegerField( + + max_image_pixels = models.FloatField( null=True, validators=[MinValueValidator(1_000_000.0)], ) - color_conversion_strategy = models.CharField(blank=True, null=True) - user_args = models.JSONField(blank=True, null=True) + + color_conversion_strategy = models.CharField( + blank=True, + null=True, + max_length=32, + choices=ColorConvertChoices.choices, + ) + + user_args = models.JSONField(null=True) class Meta: verbose_name = _("ocr settings") def __str__(self) -> str: return "" + + def save(self, *args, **kwargs): + if not self.pk and OcrSettings.objects.exists(): + # Without the self.pk check, this error would also be raised + # when updating the instance that already exists. + raise ValidationError( + 
"There is can be only one JuicerBaseSettings instance", + ) + return super().save(*args, **kwargs) + + @classmethod + def object(cls): + return cls._default_manager.all().first() # Since only one item diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 46d106bd7..70b926432 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,4 +1,3 @@ -import json import os import re import subprocess @@ -12,6 +11,9 @@ from PIL import Image from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import make_thumbnail_from_pdf +from paperless_tesseract.models import OcrSettings as OcrSettingModel +from paperless_tesseract.setting_schema import OcrSetting +from paperless_tesseract.setting_schema import get_ocr_settings class NoTextFoundException(Exception): @@ -30,6 +32,9 @@ class RasterisedDocumentParser(DocumentParser): logging_name = "paperless.parsing.tesseract" + def get_settings(self) -> OcrSetting: + return get_ocr_settings() + def extract_metadata(self, document_path, mime_type): result = [] if mime_type == "application/pdf": @@ -119,7 +124,7 @@ class RasterisedDocumentParser(DocumentParser): if ( sidecar_file is not None and os.path.isfile(sidecar_file) - and settings.OCR_MODE != "redo" + and self.parser_settings.mode != "redo" ): text = self.read_file_handle_unicode_errors(sidecar_file) @@ -174,6 +179,7 @@ class RasterisedDocumentParser(DocumentParser): sidecar_file, safe_fallback=False, ): + assert isinstance(self.parser_settings, OcrSetting) ocrmypdf_args = { "input_file": input_file, "output_file": output_file, @@ -181,46 +187,55 @@ class RasterisedDocumentParser(DocumentParser): # processes via the task library. 
"use_threads": True, "jobs": settings.THREADS_PER_WORKER, - "language": settings.OCR_LANGUAGE, - "output_type": settings.OCR_OUTPUT_TYPE, + "language": self.parser_settings.language, + "output_type": self.parser_settings.output_type, "progress_bar": False, } if "pdfa" in ocrmypdf_args["output_type"]: ocrmypdf_args[ "color_conversion_strategy" - ] = settings.OCR_COLOR_CONVERSION_STRATEGY + ] = self.parser_settings.color_conversion_strategy - if settings.OCR_MODE == "force" or safe_fallback: + if ( + self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE + or safe_fallback + ): ocrmypdf_args["force_ocr"] = True - elif settings.OCR_MODE in ["skip", "skip_noarchive"]: + elif self.parser_settings.mode in { + OcrSettingModel.ModeChoices.SKIP, + OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE, + }: ocrmypdf_args["skip_text"] = True - elif settings.OCR_MODE == "redo": + elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO: ocrmypdf_args["redo_ocr"] = True else: - raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}") + raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}") - if settings.OCR_CLEAN == "clean": + if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN: ocrmypdf_args["clean"] = True - elif settings.OCR_CLEAN == "clean-final": - if settings.OCR_MODE == "redo": + elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL: + if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO: ocrmypdf_args["clean"] = True else: # --clean-final is not compatible with --redo-ocr ocrmypdf_args["clean_final"] = True - if settings.OCR_DESKEW and settings.OCR_MODE != "redo": + if ( + self.parser_settings.deskew + and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO + ): # --deskew is not compatible with --redo-ocr ocrmypdf_args["deskew"] = True - if settings.OCR_ROTATE_PAGES: + if self.parser_settings.rotate: ocrmypdf_args["rotate_pages"] = True ocrmypdf_args[ "rotate_pages_threshold" - ] = 
settings.OCR_ROTATE_PAGES_THRESHOLD + ] = self.parser_settings.rotate_threshold - if settings.OCR_PAGES > 0: - ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}" + if self.parser_settings.pages is not None: + ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}" else: # sidecar is incompatible with pages ocrmypdf_args["sidecar"] = sidecar_file @@ -239,8 +254,8 @@ class RasterisedDocumentParser(DocumentParser): if dpi: self.log.debug(f"Detected DPI for image {input_file}: {dpi}") ocrmypdf_args["image_dpi"] = dpi - elif settings.OCR_IMAGE_DPI: - ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI + elif self.parser_settings.image_dpi is not None: + ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi elif a4_dpi: ocrmypdf_args["image_dpi"] = a4_dpi else: @@ -254,19 +269,18 @@ class RasterisedDocumentParser(DocumentParser): f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail", ) - if settings.OCR_USER_ARGS: + if self.parser_settings.user_args is not None: try: - user_args = json.loads(settings.OCR_USER_ARGS) - ocrmypdf_args = {**ocrmypdf_args, **user_args} + ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args} except Exception as e: self.log.warning( f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " f"they will not be used. 
Error: {e}", ) - if settings.OCR_MAX_IMAGE_PIXELS is not None: + if self.parser_settings.max_image_pixel is not None: # Convert pixels to mega-pixels and provide to ocrmypdf - max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 + max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0 if max_pixels_mpixels > 0: self.log.debug( f"Calculated {max_pixels_mpixels} megapixels for OCR", @@ -298,8 +312,12 @@ class RasterisedDocumentParser(DocumentParser): # If the original has text, and the user doesn't want an archive, # we're done here skip_archive_for_text = ( - settings.OCR_MODE == "skip_noarchive" - or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] + self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE + or self.parser_settings.skip_archive_file + in { + OcrSettingModel.ArchiveFileChoices.WITH_TEXT, + OcrSettingModel.ArchiveFileChoices.ALWAYS, + } ) if skip_archive_for_text and original_has_text: self.log.debug("Document has text, skipping OCRmyPDF entirely.") @@ -329,7 +347,10 @@ class RasterisedDocumentParser(DocumentParser): self.log.debug(f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - if settings.OCR_SKIP_ARCHIVE_FILE != "always": + if ( + self.parser_settings.skip_archive_file + != OcrSettingModel.ArchiveFileChoices.ALWAYS + ): self.archive_path = archive_path self.text = self.extract_text(sidecar_file, archive_path) diff --git a/src/paperless_tesseract/setting_schema.py b/src/paperless_tesseract/setting_schema.py new file mode 100644 index 000000000..0008f8c41 --- /dev/null +++ b/src/paperless_tesseract/setting_schema.py @@ -0,0 +1,58 @@ +import dataclasses +import json +from typing import Optional + +from django.conf import settings + +from paperless_tesseract.models import OcrSettings as OcrSettingModel + + +@dataclasses.dataclass(frozen=True) +class OcrSetting: + pages: Optional[int] + language: str + output_type: str + mode: str + skip_archive_file: str + image_dpi: Optional[int] + 
clean: str + deskew: bool + rotate: bool + rotate_threshold: float + max_image_pixel: Optional[float] + color_conversion_strategy: str + user_args: Optional[dict[str, str]] + + +def get_ocr_settings() -> OcrSetting: + db_settings = OcrSettingModel.objects.all().first() + assert db_settings is not None + + user_args = None + if db_settings.user_args: + user_args = db_settings.user_args + elif settings.OCR_USER_ARGS is not None: + user_args = json.loads(settings.OCR_USER_ARGS) + + return OcrSetting( + pages=db_settings.pages or settings.OCR_PAGES, + language=db_settings.language or settings.OCR_LANGUAGE, + output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE, + mode=db_settings.mode or settings.OCR_MODE, + skip_archive_file=( + db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE + ), + image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI, + clean=db_settings.unpaper_clean or settings.OCR_CLEAN, + deskew=db_settings.deskew or settings.OCR_DESKEW, + rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES, + rotate_threshold=( + db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD + ), + max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS, + color_conversion_strategy=( + db_settings.color_conversion_strategy + or settings.OCR_COLOR_CONVERSION_STRATEGY + ), + user_args=user_args, + ) diff --git a/src/paperless_tesseract/signals.py b/src/paperless_tesseract/signals.py index 7d6f6902f..dce2bf859 100644 --- a/src/paperless_tesseract/signals.py +++ b/src/paperless_tesseract/signals.py @@ -5,8 +5,10 @@ def get_parser(*args, **kwargs): def tesseract_consumer_declaration(sender, **kwargs): + from paperless_tesseract.parsers import RasterisedDocumentParser + return { - "parser": get_parser, + "parser": RasterisedDocumentParser, "weight": 0, "mime_types": { "application/pdf": ".pdf", diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 
6a60ac3b7..1d7a1cafb 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -769,43 +769,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertEqual(params["sidecar"], "sidecar.txt") with override_settings(OCR_CLEAN="none"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("clean", params) self.assertNotIn("clean_final", params) with override_settings(OCR_CLEAN="clean"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertTrue(params["clean"]) self.assertNotIn("clean_final", params) with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertTrue(params["clean_final"]) self.assertNotIn("clean", params) with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertTrue(params["clean"]) self.assertNotIn("clean_final", params) with override_settings(OCR_DESKEW=True, OCR_MODE="skip"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertTrue(params["deskew"]) with override_settings(OCR_DESKEW=True, OCR_MODE="redo"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("deskew", params) with override_settings(OCR_DESKEW=False, OCR_MODE="skip"): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("deskew", params) with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertIn("max_image_mpixels", params) 
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4) with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0): + parser = RasterisedDocumentParser(None) params = parser.construct_ocrmypdf_parameters("", "", "", "") self.assertNotIn("max_image_mpixels", params)