At least partially working for the tesseract parser

This commit is contained in:
Trenton H 2023-12-07 15:45:50 -08:00
parent 9867db9616
commit 30281bd593
9 changed files with 368 additions and 70 deletions

View File

@ -420,7 +420,7 @@ class Consumer(LoggingMixin):
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
progress_callback=progress_callback,
)
self.log.debug(f"Parser: {type(document_parser).__name__}")

View File

@ -125,8 +125,10 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
if not options:
return None
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
# Return the parser with the highest weight.
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
return best_parser["parser"]
def run_convert(
@ -318,6 +320,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
self.parser_settings = self.get_settings()
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@ -330,6 +333,10 @@ class DocumentParser(LoggingMixin):
if self.progress_callback:
self.progress_callback(current_progress, max_progress)
def get_settings(self):
    """
    Return the settings object used by this parser.

    The base implementation always raises. __init__ stores the result of
    this call as self.parser_settings, so a subclass has to override it.
    """
    raise NotImplementedError
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
"""
Helper utility for reading from a file, and handling a problem with its

View File

@ -50,6 +50,12 @@ def __get_boolean(key: str, default: str = "NO") -> bool:
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
def __get_optional_boolean(key: str) -> Optional[bool]:
    """
    Return a boolean value based on the environment variable, or None when
    the variable is not set at all
    """
    if key not in os.environ:
        return None
    return __get_boolean(key)
def __get_int(key: str, default: int) -> int:
"""
Return an integer value based on the environment variable or a default
@ -57,6 +63,12 @@ def __get_int(key: str, default: int) -> int:
return int(os.getenv(key, default))
def __get_optional_int(key: str) -> Optional[int]:
    """
    Return an integer value based on the environment variable, or None when
    the variable is not set at all
    """
    if key not in os.environ:
        return None
    # The -1 default is only a placeholder, the key is known to be present
    return __get_int(key, -1)
def __get_float(key: str, default: float) -> float:
"""
Return a float value based on the environment variable or a default
@ -64,6 +76,12 @@ def __get_float(key: str, default: float) -> float:
return float(os.getenv(key, default))
def __get_optional_float(key: str) -> Optional[float]:
    """
    Return a float value based on the environment variable, or None when
    the variable is not set at all
    """
    if key not in os.environ:
        return None
    # The -1 default is only a placeholder, the key is known to be present
    return __get_float(key, -1)
def __get_path(
key: str,
default: Optional[Union[PathLike, str]] = None,
@ -796,11 +814,10 @@ CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PATCHT",
)
consumer_barcode_scanner_tmp: Final[str] = os.getenv(
CONSUMER_BARCODE_SCANNER: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_SCANNER",
"PYZBAR",
)
CONSUMER_BARCODE_SCANNER = consumer_barcode_scanner_tmp.upper()
).upper()
CONSUMER_ENABLE_ASN_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE",
@ -811,15 +828,12 @@ CONSUMER_ASN_BARCODE_PREFIX: Final[str] = os.getenv(
"ASN",
)
CONSUMER_BARCODE_UPSCALE: Final[float] = float(
os.getenv("PAPERLESS_CONSUMER_BARCODE_UPSCALE", 0.0),
CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
"PAPERLESS_CONSUMER_BARCODE_UPSCALE",
0.0,
)
CONSUMER_BARCODE_DPI: Final[str] = int(
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
@ -834,7 +848,7 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
@ -848,28 +862,29 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_IMAGE_DPI = __get_optional_int("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_DESKEW: Final[bool] = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES: Final[bool] = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float(
"PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD",
12.0,
)
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
)
OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
"PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
"RGB",
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")

View File

@ -0,0 +1,132 @@
# Generated by Django 4.2.7 on 2023-12-07 22:52
import django.core.validators
from django.db import migrations
from django.db import models
def _create_singleton(apps, schema_editor):
    """
    Insert one OcrSettings row, passing no field values to create().
    """
    apps.get_model("paperless_tesseract", "OcrSettings").objects.create()
class Migration(migrations.Migration):
    """
    Create the OcrSettings table and insert its single row.
    """

    initial = True
    dependencies = []
    operations = [
        migrations.CreateModel(
            name="OcrSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Every setting column below is nullable
                ("pages", models.PositiveIntegerField(blank=True, null=True)),
                ("language", models.CharField(blank=True, max_length=32, null=True)),
                (
                    "output_type",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("pdf", "pdf"),
                            ("pdfa", "pdfa"),
                            ("pdfa-1", "pdfa-1"),
                            ("pdfa-2", "pdfa-2"),
                            ("pdfa-3", "pdfa-3"),
                        ],
                        max_length=8,
                        null=True,
                    ),
                ),
                (
                    "mode",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("skip", "skip"),
                            ("skip_noarchive", "skip_noarchive"),
                            ("redo", "redo"),
                            ("force", "force"),
                        ],
                        max_length=8,
                        null=True,
                    ),
                ),
                (
                    "skip_archive_file",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("never", "never"),
                            ("with_text", "with_text"),
                            ("always", "always"),
                        ],
                        max_length=16,
                        null=True,
                    ),
                ),
                ("image_dpi", models.PositiveIntegerField(null=True)),
                (
                    "unpaper_clean",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("clean", "clean"),
                            ("clean-final", "clean-final"),
                            ("none", "none"),
                        ],
                        max_length=16,
                        null=True,
                    ),
                ),
                ("deskew", models.BooleanField(null=True)),
                ("rotate_pages", models.BooleanField(null=True)),
                (
                    "rotate_pages_threshold",
                    models.FloatField(
                        null=True,
                        validators=[django.core.validators.MinValueValidator(0.0)],
                    ),
                ),
                (
                    "max_image_pixels",
                    models.FloatField(
                        null=True,
                        validators=[
                            django.core.validators.MinValueValidator(1000000.0),
                        ],
                    ),
                ),
                (
                    "color_conversion_strategy",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("LeaveColorUnchanged", "LeaveColorUnchanged"),
                            ("RGB", "RGB"),
                            ("UseDeviceIndependentColor", "UseDeviceIndependentColor"),
                            ("Gray", "Gray"),
                            ("CMYK", "CMYK"),
                        ],
                        max_length=32,
                        null=True,
                    ),
                ),
                ("user_args", models.JSONField(blank=True, null=True)),
            ],
            options={
                "verbose_name": "ocr settings",
            },
        ),
        # Insert the single settings row once the table exists; reversing
        # this step is a no-op
        migrations.RunPython(
            code=_create_singleton,
            reverse_code=migrations.RunPython.noop,
        ),
    ]

View File

@ -1,7 +1,10 @@
from django.core.exceptions import ValidationError
from django.core.validators import MinValueValidator
from django.db import models
from django.utils.translation import gettext_lazy as _
DEFAULT_SINGLETON_INSTANCE_ID = 1
class OcrSettings(models.Model):
class OutputTypeChoices(models.TextChoices):
@ -12,49 +15,100 @@ class OcrSettings(models.Model):
PDF_A3 = ("pdfa-3", _("pdfa-3"))
class ModeChoices(models.TextChoices):
SKIP = ("skip", _("pdf"))
REDO = ("redo", _("pdfa"))
FORCE = ("force", _("pdfa-1"))
SKIP = ("skip", _("skip"))
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
REDO = ("redo", _("redo"))
FORCE = ("force", _("force"))
class ArchiveFileChoices(models.TextChoices):
NEVER = ("never", _("pdf"))
WITH_TEXT = ("with_text", _("pdfa"))
ALWAYS = ("always", _("pdfa-1"))
NEVER = ("never", _("never"))
WITH_TEXT = ("with_text", _("with_text"))
ALWAYS = ("always", _("always"))
class CleanChoices(models.TextChoices):
    """
    Allowed values for the unpaper_clean field
    """

    CLEAN = ("clean", _("clean"))
    FINAL = ("clean-final", _("clean-final"))
    NONE = ("none", _("none"))
class ColorConvertChoices(models.TextChoices):
    """
    Allowed values for the color_conversion_strategy field
    """

    UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged"))
    RGB = ("RGB", _("RGB"))
    INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor"))
    GRAY = ("Gray", _("Gray"))
    CMYK = ("CMYK", _("CMYK"))
pages = models.PositiveIntegerField(null=True, blank=True)
pages = models.PositiveIntegerField(null=True)
language = models.CharField(null=True, blank=True, max_length=32)
output_type = models.CharField(
max_length=10,
null=True,
blank=True,
max_length=8,
choices=OutputTypeChoices.choices,
default=OutputTypeChoices.PDF_A,
)
mode = models.CharField(
max_length=50,
null=True,
blank=True,
max_length=8,
choices=ModeChoices.choices,
default=ModeChoices.SKIP,
)
skip_archive_file = models.CharField(
max_length=50,
null=True,
blank=True,
max_length=16,
choices=ArchiveFileChoices.choices,
default=ArchiveFileChoices.NEVER,
)
image_dpi = models.PositiveIntegerField(null=True)
clean = models.CharField(null=True, blank=True)
deskew = models.BooleanField(default=True)
rotate_pages = models.BooleanField(default=True)
unpaper_clean = models.CharField(
null=True,
blank=True,
max_length=16,
choices=CleanChoices.choices,
)
deskew = models.BooleanField(null=True)
rotate_pages = models.BooleanField(null=True)
rotate_pages_threshold = models.FloatField(
default=12.0,
null=True,
validators=[MinValueValidator(0.0)],
)
max_image_pixel = models.PositiveBigIntegerField(
max_image_pixels = models.FloatField(
null=True,
validators=[MinValueValidator(1_000_000.0)],
)
color_conversion_strategy = models.CharField(blank=True, null=True)
user_args = models.JSONField(blank=True, null=True)
color_conversion_strategy = models.CharField(
blank=True,
null=True,
max_length=32,
choices=ColorConvertChoices.choices,
)
user_args = models.JSONField(null=True)
class Meta:
verbose_name = _("ocr settings")
def __str__(self) -> str:
    """
    Return a fixed, human-readable label for the settings row.
    """
    # An empty string renders as a blank entry wherever the object is
    # displayed, so give the row a recognisable name instead.
    return "OcrSettings"
def save(self, *args, **kwargs):
    """
    Save this row, refusing to create a second OcrSettings row.

    Raises:
        ValidationError: If this instance has no primary key yet and an
            OcrSettings row already exists.
    """
    # The self.pk check keeps updates working: without it the error would
    # also be raised when saving the row that already exists.
    if not self.pk and OcrSettings.objects.exists():
        raise ValidationError(
            "There can be only one OcrSettings instance",
        )
    return super().save(*args, **kwargs)
@classmethod
def object(cls):
    """
    Return the first row yielded by the model's default manager.
    """
    # Only one row is expected to exist, so the first one is the one wanted
    all_rows = cls._default_manager.all()
    return all_rows.first()

View File

@ -1,4 +1,3 @@
import json
import os
import re
import subprocess
@ -12,6 +11,9 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless_tesseract.models import OcrSettings as OcrSettingModel
from paperless_tesseract.setting_schema import OcrSetting
from paperless_tesseract.setting_schema import get_ocr_settings
class NoTextFoundException(Exception):
@ -30,6 +32,9 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrSetting:
    """
    Return the OCR settings produced by get_ocr_settings()
    """
    return get_ocr_settings()
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":
@ -119,7 +124,7 @@ class RasterisedDocumentParser(DocumentParser):
if (
sidecar_file is not None
and os.path.isfile(sidecar_file)
and settings.OCR_MODE != "redo"
and self.parser_settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
@ -174,6 +179,7 @@ class RasterisedDocumentParser(DocumentParser):
sidecar_file,
safe_fallback=False,
):
assert isinstance(self.parser_settings, OcrSetting)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,
@ -181,46 +187,55 @@ class RasterisedDocumentParser(DocumentParser):
# processes via the task library.
"use_threads": True,
"jobs": settings.THREADS_PER_WORKER,
"language": settings.OCR_LANGUAGE,
"output_type": settings.OCR_OUTPUT_TYPE,
"language": self.parser_settings.language,
"output_type": self.parser_settings.output_type,
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = settings.OCR_COLOR_CONVERSION_STRATEGY
] = self.parser_settings.color_conversion_strategy
if settings.OCR_MODE == "force" or safe_fallback:
if (
self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE
or safe_fallback
):
ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
elif self.parser_settings.mode in {
OcrSettingModel.ModeChoices.SKIP,
OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
else:
raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}")
if settings.OCR_CLEAN == "clean":
if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN:
ocrmypdf_args["clean"] = True
elif settings.OCR_CLEAN == "clean-final":
if settings.OCR_MODE == "redo":
elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL:
if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
ocrmypdf_args["clean"] = True
else:
# --clean-final is not compatible with --redo-ocr
ocrmypdf_args["clean_final"] = True
if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
if (
self.parser_settings.deskew
and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO
):
# --deskew is not compatible with --redo-ocr
ocrmypdf_args["deskew"] = True
if settings.OCR_ROTATE_PAGES:
if self.parser_settings.rotate:
ocrmypdf_args["rotate_pages"] = True
ocrmypdf_args[
"rotate_pages_threshold"
] = settings.OCR_ROTATE_PAGES_THRESHOLD
] = self.parser_settings.rotate_threshold
if settings.OCR_PAGES > 0:
ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
if self.parser_settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}"
else:
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
@ -239,8 +254,8 @@ class RasterisedDocumentParser(DocumentParser):
if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi
elif settings.OCR_IMAGE_DPI:
ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
elif self.parser_settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi
elif a4_dpi:
ocrmypdf_args["image_dpi"] = a4_dpi
else:
@ -254,19 +269,18 @@ class RasterisedDocumentParser(DocumentParser):
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
)
if settings.OCR_USER_ARGS:
if self.parser_settings.user_args is not None:
try:
user_args = json.loads(settings.OCR_USER_ARGS)
ocrmypdf_args = {**ocrmypdf_args, **user_args}
ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args}
except Exception as e:
self.log.warning(
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}",
)
if settings.OCR_MAX_IMAGE_PIXELS is not None:
if self.parser_settings.max_image_pixel is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0
if max_pixels_mpixels > 0:
self.log.debug(
f"Calculated {max_pixels_mpixels} megapixels for OCR",
@ -298,8 +312,12 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE
or self.parser_settings.skip_archive_file
in {
OcrSettingModel.ArchiveFileChoices.WITH_TEXT,
OcrSettingModel.ArchiveFileChoices.ALWAYS,
}
)
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
@ -329,7 +347,10 @@ class RasterisedDocumentParser(DocumentParser):
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
if (
self.parser_settings.skip_archive_file
!= OcrSettingModel.ArchiveFileChoices.ALWAYS
):
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -0,0 +1,58 @@
import dataclasses
import json
from typing import Optional
from django.conf import settings
from paperless_tesseract.models import OcrSettings as OcrSettingModel
@dataclasses.dataclass(frozen=True)
class OcrSetting:
    """
    Immutable set of OCR options, as assembled by get_ocr_settings().
    """

    # Optional values: None is a permitted value for these fields
    pages: Optional[int]
    language: str
    output_type: str
    mode: str
    skip_archive_file: str
    image_dpi: Optional[int]
    # Filled from the model's unpaper_clean field or settings.OCR_CLEAN
    clean: str
    deskew: bool
    # Filled from the model's rotate_pages field or settings.OCR_ROTATE_PAGES
    rotate: bool
    rotate_threshold: float
    # Filled from the model's max_image_pixels field or
    # settings.OCR_MAX_IMAGE_PIXELS
    max_image_pixel: Optional[float]
    color_conversion_strategy: str
    # Taken from the database JSON field, or parsed from the
    # settings.OCR_USER_ARGS JSON string
    user_args: Optional[dict[str, str]]
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings.

    A value stored on the database settings row takes precedence over the
    matching ``settings.OCR_*`` value, which is used as the fallback.

    Returns:
        An OcrSetting holding the merged values.

    Raises:
        AssertionError: If the database holds no settings row.
    """
    import logging

    def _db_or_env(db_value, env_value):
        # Only NULL in the database means "not configured". An explicit
        # False or 0 must win over the environment value; a plain ``or``
        # would discard it and make it impossible to e.g. disable deskew.
        return env_value if db_value is None else db_value

    db_settings = OcrSettingModel.objects.all().first()
    assert db_settings is not None

    user_args = None
    if db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS:
        # A malformed value must not abort parser construction; warn and
        # carry on without the extra arguments.
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
        except json.JSONDecodeError as e:
            logging.getLogger("paperless.parsing.tesseract").warning(
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used. Error: {e}",
            )

    pages = _db_or_env(db_settings.pages, settings.OCR_PAGES)
    if pages is not None and pages <= 0:
        # Zero or less means no page limit; passing it on would produce
        # the invalid page range "1-0".
        pages = None

    return OcrSetting(
        pages=pages,
        # The string fields keep ``or`` so that an empty string stored in
        # the database also falls back to the environment value.
        language=db_settings.language or settings.OCR_LANGUAGE,
        output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
        mode=db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
            db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
        image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
        clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
        deskew=_db_or_env(db_settings.deskew, settings.OCR_DESKEW),
        rotate=_db_or_env(db_settings.rotate_pages, settings.OCR_ROTATE_PAGES),
        rotate_threshold=_db_or_env(
            db_settings.rotate_pages_threshold,
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
        color_conversion_strategy=(
            db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
    )

View File

@ -5,8 +5,10 @@ def get_parser(*args, **kwargs):
def tesseract_consumer_declaration(sender, **kwargs):
from paperless_tesseract.parsers import RasterisedDocumentParser
return {
"parser": get_parser,
"parser": RasterisedDocumentParser,
"weight": 0,
"mime_types": {
"application/pdf": ".pdf",

View File

@ -769,43 +769,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(params["sidecar"], "sidecar.txt")
with override_settings(OCR_CLEAN="none"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("clean", params)
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean_final"])
self.assertNotIn("clean", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["deskew"])
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertIn("max_image_mpixels", params)
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("max_image_mpixels", params)