At least partially working for the tesseract parser
This commit is contained in:
parent
9867db9616
commit
30281bd593
@ -420,7 +420,7 @@ class Consumer(LoggingMixin):
|
||||
|
||||
document_parser: DocumentParser = parser_class(
|
||||
self.logging_group,
|
||||
progress_callback,
|
||||
progress_callback=progress_callback,
|
||||
)
|
||||
|
||||
self.log.debug(f"Parser: {type(document_parser).__name__}")
|
||||
|
@ -125,8 +125,10 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
|
||||
if not options:
|
||||
return None
|
||||
|
||||
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
|
||||
|
||||
# Return the parser with the highest weight.
|
||||
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
|
||||
return best_parser["parser"]
|
||||
|
||||
|
||||
def run_convert(
|
||||
@ -318,6 +320,7 @@ class DocumentParser(LoggingMixin):
|
||||
def __init__(self, logging_group, progress_callback=None):
|
||||
super().__init__()
|
||||
self.logging_group = logging_group
|
||||
self.parser_settings = self.get_settings()
|
||||
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
|
||||
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
|
||||
|
||||
@ -330,6 +333,10 @@ class DocumentParser(LoggingMixin):
|
||||
if self.progress_callback:
|
||||
self.progress_callback(current_progress, max_progress)
|
||||
|
||||
def get_settings(self):
    """
    Return the settings object for this parser.

    __init__ stores the result on self.parser_settings, so a concrete
    parser has to override this; the base implementation always raises.
    """
    raise NotImplementedError
|
||||
|
||||
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
|
||||
"""
|
||||
Helper utility for reading from a file, and handling a problem with its
|
||||
|
@ -50,6 +50,12 @@ def __get_boolean(key: str, default: str = "NO") -> bool:
|
||||
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
|
||||
|
||||
|
||||
def __get_optional_boolean(key: str) -> Optional[bool]:
    """
    Return the boolean value of the environment variable, or None when the
    variable is not set at all
    """
    return __get_boolean(key) if key in os.environ else None
|
||||
|
||||
|
||||
def __get_int(key: str, default: int) -> int:
|
||||
"""
|
||||
Return an integer value based on the environment variable or a default
|
||||
@ -57,6 +63,12 @@ def __get_int(key: str, default: int) -> int:
|
||||
return int(os.getenv(key, default))
|
||||
|
||||
|
||||
def __get_optional_int(key: str) -> Optional[int]:
    """
    Return the integer value of the environment variable, or None when the
    variable is not set at all
    """
    if key not in os.environ:
        return None
    # The key is known to be present here, so the -1 fallback is only a
    # placeholder required by the helper's signature
    return __get_int(key, -1)
|
||||
|
||||
|
||||
def __get_float(key: str, default: float) -> float:
|
||||
"""
|
||||
Return a float value based on the environment variable or a default
|
||||
@ -64,6 +76,12 @@ def __get_float(key: str, default: float) -> float:
|
||||
return float(os.getenv(key, default))
|
||||
|
||||
|
||||
def __get_optional_float(key: str) -> Optional[float]:
    """
    Return the float value of the environment variable, or None when the
    variable is not set at all
    """
    if key not in os.environ:
        return None
    # The key is known to be present here, so the -1 fallback is only a
    # placeholder required by the helper's signature
    return __get_float(key, -1)
|
||||
|
||||
|
||||
def __get_path(
|
||||
key: str,
|
||||
default: Optional[Union[PathLike, str]] = None,
|
||||
@ -796,11 +814,10 @@ CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
|
||||
"PATCHT",
|
||||
)
|
||||
|
||||
consumer_barcode_scanner_tmp: Final[str] = os.getenv(
|
||||
CONSUMER_BARCODE_SCANNER: Final[str] = os.getenv(
|
||||
"PAPERLESS_CONSUMER_BARCODE_SCANNER",
|
||||
"PYZBAR",
|
||||
)
|
||||
CONSUMER_BARCODE_SCANNER = consumer_barcode_scanner_tmp.upper()
|
||||
).upper()
|
||||
|
||||
CONSUMER_ENABLE_ASN_BARCODE: Final[bool] = __get_boolean(
|
||||
"PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE",
|
||||
@ -811,15 +828,12 @@ CONSUMER_ASN_BARCODE_PREFIX: Final[str] = os.getenv(
|
||||
"ASN",
|
||||
)
|
||||
|
||||
|
||||
CONSUMER_BARCODE_UPSCALE: Final[float] = float(
|
||||
os.getenv("PAPERLESS_CONSUMER_BARCODE_UPSCALE", 0.0),
|
||||
CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
|
||||
"PAPERLESS_CONSUMER_BARCODE_UPSCALE",
|
||||
0.0,
|
||||
)
|
||||
|
||||
|
||||
CONSUMER_BARCODE_DPI: Final[str] = int(
|
||||
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
|
||||
)
|
||||
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
|
||||
|
||||
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
|
||||
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
|
||||
@ -834,7 +848,7 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
|
||||
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
|
||||
)
|
||||
|
||||
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
|
||||
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
|
||||
|
||||
# The default language that tesseract will attempt to use when parsing
|
||||
# documents. It should be a 3-letter language code consistent with ISO 639.
|
||||
@ -848,28 +862,29 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
|
||||
|
||||
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
|
||||
|
||||
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
|
||||
OCR_IMAGE_DPI = __get_optional_int("PAPERLESS_OCR_IMAGE_DPI")
|
||||
|
||||
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
|
||||
|
||||
OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
|
||||
OCR_DESKEW: Final[bool] = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
|
||||
|
||||
OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
|
||||
OCR_ROTATE_PAGES: Final[bool] = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
|
||||
|
||||
OCR_ROTATE_PAGES_THRESHOLD = float(
|
||||
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
|
||||
OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float(
|
||||
"PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD",
|
||||
12.0,
|
||||
)
|
||||
|
||||
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
|
||||
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
|
||||
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
|
||||
OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
|
||||
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
|
||||
)
|
||||
|
||||
OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
|
||||
"PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
|
||||
"RGB",
|
||||
)
|
||||
|
||||
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
|
||||
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS")
|
||||
|
||||
# GNUPG needs a home directory for some reason
|
||||
GNUPG_HOME = os.getenv("HOME", "/tmp")
|
||||
|
132
src/paperless_tesseract/migrations/0001_initial.py
Normal file
132
src/paperless_tesseract/migrations/0001_initial.py
Normal file
@ -0,0 +1,132 @@
|
||||
# Generated by Django 4.2.7 on 2023-12-07 22:52
|
||||
|
||||
import django.core.validators
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
|
||||
|
||||
def _create_singleton(apps, schema_editor):
    """
    Data migration step: insert one OcrSettings row without supplying any
    field values.
    """
    apps.get_model("paperless_tesseract", "OcrSettings").objects.create()
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """
    Create the OcrSettings table and insert its single row.

    Every setting column is nullable; the row is created without values.
    """

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="OcrSettings",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                ("pages", models.PositiveIntegerField(blank=True, null=True)),
                ("language", models.CharField(blank=True, max_length=32, null=True)),
                (
                    "output_type",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("pdf", "pdf"),
                            ("pdfa", "pdfa"),
                            ("pdfa-1", "pdfa-1"),
                            ("pdfa-2", "pdfa-2"),
                            ("pdfa-3", "pdfa-3"),
                        ],
                        max_length=8,
                        null=True,
                    ),
                ),
                (
                    "mode",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("skip", "skip"),
                            ("skip_noarchive", "skip_noarchive"),
                            ("redo", "redo"),
                            ("force", "force"),
                        ],
                        # Must hold the longest choice: "skip_noarchive" is
                        # 14 characters, so a width of 8 cannot store it.
                        max_length=16,
                        null=True,
                    ),
                ),
                (
                    "skip_archive_file",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("never", "never"),
                            ("with_text", "with_text"),
                            ("always", "always"),
                        ],
                        max_length=16,
                        null=True,
                    ),
                ),
                ("image_dpi", models.PositiveIntegerField(null=True)),
                (
                    "unpaper_clean",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("clean", "clean"),
                            ("clean-final", "clean-final"),
                            ("none", "none"),
                        ],
                        max_length=16,
                        null=True,
                    ),
                ),
                ("deskew", models.BooleanField(null=True)),
                ("rotate_pages", models.BooleanField(null=True)),
                (
                    "rotate_pages_threshold",
                    models.FloatField(
                        null=True,
                        validators=[django.core.validators.MinValueValidator(0.0)],
                    ),
                ),
                (
                    "max_image_pixels",
                    models.FloatField(
                        null=True,
                        validators=[
                            django.core.validators.MinValueValidator(1000000.0),
                        ],
                    ),
                ),
                (
                    "color_conversion_strategy",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("LeaveColorUnchanged", "LeaveColorUnchanged"),
                            ("RGB", "RGB"),
                            ("UseDeviceIndependentColor", "UseDeviceIndependentColor"),
                            ("Gray", "Gray"),
                            ("CMYK", "CMYK"),
                        ],
                        max_length=32,
                        null=True,
                    ),
                ),
                ("user_args", models.JSONField(blank=True, null=True)),
            ],
            options={
                "verbose_name": "ocr settings",
            },
        ),
        # Insert the singleton row; nothing to undo on reverse because the
        # table itself is dropped when CreateModel is reversed.
        migrations.RunPython(
            code=_create_singleton,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
|
@ -1,7 +1,10 @@
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.core.validators import MinValueValidator
|
||||
from django.db import models
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
DEFAULT_SINGLETON_INSTANCE_ID = 1
|
||||
|
||||
|
||||
class OcrSettings(models.Model):
|
||||
class OutputTypeChoices(models.TextChoices):
|
||||
@ -12,49 +15,100 @@ class OcrSettings(models.Model):
|
||||
PDF_A3 = ("pdfa-3", _("pdfa-3"))
|
||||
|
||||
class ModeChoices(models.TextChoices):
|
||||
SKIP = ("skip", _("pdf"))
|
||||
REDO = ("redo", _("pdfa"))
|
||||
FORCE = ("force", _("pdfa-1"))
|
||||
SKIP = ("skip", _("skip"))
|
||||
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
|
||||
REDO = ("redo", _("redo"))
|
||||
FORCE = ("force", _("force"))
|
||||
|
||||
class ArchiveFileChoices(models.TextChoices):
|
||||
NEVER = ("never", _("pdf"))
|
||||
WITH_TEXT = ("with_text", _("pdfa"))
|
||||
ALWAYS = ("always", _("pdfa-1"))
|
||||
NEVER = ("never", _("never"))
|
||||
WITH_TEXT = ("with_text", _("with_text"))
|
||||
ALWAYS = ("always", _("always"))
|
||||
|
||||
class CleanChoices(models.TextChoices):
|
||||
CLEAN = ("clean", _("clean"))
|
||||
FINAL = ("clean-final", _("clean-final"))
|
||||
NONE = ("none", _("none"))
|
||||
|
||||
class ColorConvertChoices(models.TextChoices):
|
||||
UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged"))
|
||||
RGB = ("RGB", _("RGB"))
|
||||
INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor"))
|
||||
GRAY = ("Gray", _("Gray"))
|
||||
CMYK = ("CMYK", _("CMYK"))
|
||||
|
||||
pages = models.PositiveIntegerField(null=True, blank=True)
|
||||
|
||||
pages = models.PositiveIntegerField(null=True)
|
||||
language = models.CharField(null=True, blank=True, max_length=32)
|
||||
|
||||
output_type = models.CharField(
|
||||
max_length=10,
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=8,
|
||||
choices=OutputTypeChoices.choices,
|
||||
default=OutputTypeChoices.PDF_A,
|
||||
)
|
||||
|
||||
mode = models.CharField(
|
||||
max_length=50,
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=8,
|
||||
choices=ModeChoices.choices,
|
||||
default=ModeChoices.SKIP,
|
||||
)
|
||||
|
||||
skip_archive_file = models.CharField(
|
||||
max_length=50,
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=16,
|
||||
choices=ArchiveFileChoices.choices,
|
||||
default=ArchiveFileChoices.NEVER,
|
||||
)
|
||||
|
||||
image_dpi = models.PositiveIntegerField(null=True)
|
||||
clean = models.CharField(null=True, blank=True)
|
||||
deskew = models.BooleanField(default=True)
|
||||
rotate_pages = models.BooleanField(default=True)
|
||||
|
||||
unpaper_clean = models.CharField(
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=16,
|
||||
choices=CleanChoices.choices,
|
||||
)
|
||||
|
||||
deskew = models.BooleanField(null=True)
|
||||
|
||||
rotate_pages = models.BooleanField(null=True)
|
||||
|
||||
rotate_pages_threshold = models.FloatField(
|
||||
default=12.0,
|
||||
null=True,
|
||||
validators=[MinValueValidator(0.0)],
|
||||
)
|
||||
max_image_pixel = models.PositiveBigIntegerField(
|
||||
|
||||
max_image_pixels = models.FloatField(
|
||||
null=True,
|
||||
validators=[MinValueValidator(1_000_000.0)],
|
||||
)
|
||||
color_conversion_strategy = models.CharField(blank=True, null=True)
|
||||
user_args = models.JSONField(blank=True, null=True)
|
||||
|
||||
color_conversion_strategy = models.CharField(
|
||||
blank=True,
|
||||
null=True,
|
||||
max_length=32,
|
||||
choices=ColorConvertChoices.choices,
|
||||
)
|
||||
|
||||
user_args = models.JSONField(null=True)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _("ocr settings")
|
||||
|
||||
def __str__(self) -> str:
    # An empty string leaves the instance with no visible label wherever
    # Django renders it, so return a fixed, recognisable name instead.
    return "OcrSettings"
|
||||
|
||||
def save(self, *args, **kwargs):
    """
    Save the settings row, refusing to create a second one.

    Raises:
        ValidationError: if this instance has no primary key yet and a
            row already exists in the table.
    """
    # Only inserts are blocked: an instance that already has a primary key
    # is an update of the existing row and must be allowed through.
    if not self.pk and OcrSettings.objects.exists():
        raise ValidationError(
            "There can be only one OcrSettings instance",
        )
    return super().save(*args, **kwargs)
|
||||
|
||||
@classmethod
def object(cls):
    """
    Return the first row of this model, or None when the table is empty.
    """
    # Only one row is expected, so "first" is the settings instance
    rows = cls._default_manager.all()
    return rows.first()
|
||||
|
@ -1,4 +1,3 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
@ -12,6 +11,9 @@ from PIL import Image
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless_tesseract.models import OcrSettings as OcrSettingModel
|
||||
from paperless_tesseract.setting_schema import OcrSetting
|
||||
from paperless_tesseract.setting_schema import get_ocr_settings
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
@ -30,6 +32,9 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
def get_settings(self) -> OcrSetting:
    """
    Return the OCR settings produced by get_ocr_settings()
    """
    return get_ocr_settings()
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
@ -119,7 +124,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and os.path.isfile(sidecar_file)
|
||||
and settings.OCR_MODE != "redo"
|
||||
and self.parser_settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
@ -174,6 +179,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
sidecar_file,
|
||||
safe_fallback=False,
|
||||
):
|
||||
assert isinstance(self.parser_settings, OcrSetting)
|
||||
ocrmypdf_args = {
|
||||
"input_file": input_file,
|
||||
"output_file": output_file,
|
||||
@ -181,46 +187,55 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# processes via the task library.
|
||||
"use_threads": True,
|
||||
"jobs": settings.THREADS_PER_WORKER,
|
||||
"language": settings.OCR_LANGUAGE,
|
||||
"output_type": settings.OCR_OUTPUT_TYPE,
|
||||
"language": self.parser_settings.language,
|
||||
"output_type": self.parser_settings.output_type,
|
||||
"progress_bar": False,
|
||||
}
|
||||
|
||||
if "pdfa" in ocrmypdf_args["output_type"]:
|
||||
ocrmypdf_args[
|
||||
"color_conversion_strategy"
|
||||
] = settings.OCR_COLOR_CONVERSION_STRATEGY
|
||||
] = self.parser_settings.color_conversion_strategy
|
||||
|
||||
if settings.OCR_MODE == "force" or safe_fallback:
|
||||
if (
|
||||
self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE
|
||||
or safe_fallback
|
||||
):
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
|
||||
elif self.parser_settings.mode in {
|
||||
OcrSettingModel.ModeChoices.SKIP,
|
||||
OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif settings.OCR_MODE == "redo":
|
||||
elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
else:
|
||||
raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
|
||||
raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}")
|
||||
|
||||
if settings.OCR_CLEAN == "clean":
|
||||
if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN:
|
||||
ocrmypdf_args["clean"] = True
|
||||
elif settings.OCR_CLEAN == "clean-final":
|
||||
if settings.OCR_MODE == "redo":
|
||||
elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL:
|
||||
if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
|
||||
ocrmypdf_args["clean"] = True
|
||||
else:
|
||||
# --clean-final is not compatible with --redo-ocr
|
||||
ocrmypdf_args["clean_final"] = True
|
||||
|
||||
if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
|
||||
if (
|
||||
self.parser_settings.deskew
|
||||
and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO
|
||||
):
|
||||
# --deskew is not compatible with --redo-ocr
|
||||
ocrmypdf_args["deskew"] = True
|
||||
|
||||
if settings.OCR_ROTATE_PAGES:
|
||||
if self.parser_settings.rotate:
|
||||
ocrmypdf_args["rotate_pages"] = True
|
||||
ocrmypdf_args[
|
||||
"rotate_pages_threshold"
|
||||
] = settings.OCR_ROTATE_PAGES_THRESHOLD
|
||||
] = self.parser_settings.rotate_threshold
|
||||
|
||||
if settings.OCR_PAGES > 0:
|
||||
ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
|
||||
if self.parser_settings.pages is not None:
|
||||
ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}"
|
||||
else:
|
||||
# sidecar is incompatible with pages
|
||||
ocrmypdf_args["sidecar"] = sidecar_file
|
||||
@ -239,8 +254,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if dpi:
|
||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||
ocrmypdf_args["image_dpi"] = dpi
|
||||
elif settings.OCR_IMAGE_DPI:
|
||||
ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
|
||||
elif self.parser_settings.image_dpi is not None:
|
||||
ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi
|
||||
elif a4_dpi:
|
||||
ocrmypdf_args["image_dpi"] = a4_dpi
|
||||
else:
|
||||
@ -254,19 +269,18 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
|
||||
)
|
||||
|
||||
if settings.OCR_USER_ARGS:
|
||||
if self.parser_settings.user_args is not None:
|
||||
try:
|
||||
user_args = json.loads(settings.OCR_USER_ARGS)
|
||||
ocrmypdf_args = {**ocrmypdf_args, **user_args}
|
||||
ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args}
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
|
||||
f"they will not be used. Error: {e}",
|
||||
)
|
||||
|
||||
if settings.OCR_MAX_IMAGE_PIXELS is not None:
|
||||
if self.parser_settings.max_image_pixel is not None:
|
||||
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
|
||||
max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0
|
||||
if max_pixels_mpixels > 0:
|
||||
self.log.debug(
|
||||
f"Calculated {max_pixels_mpixels} megapixels for OCR",
|
||||
@ -298,8 +312,12 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
settings.OCR_MODE == "skip_noarchive"
|
||||
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
|
||||
self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.parser_settings.skip_archive_file
|
||||
in {
|
||||
OcrSettingModel.ArchiveFileChoices.WITH_TEXT,
|
||||
OcrSettingModel.ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
|
||||
@ -329,7 +347,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
|
||||
if (
|
||||
self.parser_settings.skip_archive_file
|
||||
!= OcrSettingModel.ArchiveFileChoices.ALWAYS
|
||||
):
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
58
src/paperless_tesseract/setting_schema.py
Normal file
58
src/paperless_tesseract/setting_schema.py
Normal file
@ -0,0 +1,58 @@
|
||||
import dataclasses
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.models import OcrSettings as OcrSettingModel
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class OcrSetting:
    """
    Immutable snapshot of the OCR settings a parser works with.

    Instances are frozen, so the values cannot change after construction.
    """

    # Optional numeric limits; None means the value was not configured
    pages: Optional[int]
    language: str
    output_type: str
    mode: str
    skip_archive_file: str
    image_dpi: Optional[int]
    clean: str
    deskew: bool
    rotate: bool
    rotate_threshold: float
    max_image_pixel: Optional[float]
    color_conversion_strategy: str
    # Decoded JSON object; its values may be any JSON type, not only strings
    user_args: Optional[dict[str, object]]
|
||||
|
||||
|
||||
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings.

    Each value is taken from the OcrSettings database row when it is set
    there, and from the matching OCR_* Django setting otherwise.

    Returns:
        A frozen OcrSetting holding the merged values.
    """
    import logging

    def _db_or_default(db_value, default):
        # For booleans and floats None is the only "not set" marker: False
        # and 0.0 are real choices and must not fall through to the default.
        return default if db_value is None else db_value

    db_settings = OcrSettingModel.objects.all().first()
    assert db_settings is not None, "The OcrSettings row does not exist"

    user_args = None
    if db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
        except json.JSONDecodeError as e:
            # Bad JSON in the environment should not stop a parser from
            # being created; warn and carry on without the extra arguments.
            logging.getLogger("paperless.parsing.tesseract").warning(
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used. Error: {e}",
            )

    # Text and count fields keep the truthiness fallback: an empty string or
    # a zero stored in the row is treated the same as "not set".
    return OcrSetting(
        pages=db_settings.pages or settings.OCR_PAGES,
        language=db_settings.language or settings.OCR_LANGUAGE,
        output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
        mode=db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
            db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
        image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
        clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
        deskew=_db_or_default(db_settings.deskew, settings.OCR_DESKEW),
        rotate=_db_or_default(db_settings.rotate_pages, settings.OCR_ROTATE_PAGES),
        rotate_threshold=_db_or_default(
            db_settings.rotate_pages_threshold,
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
        color_conversion_strategy=(
            db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
    )
|
@ -5,8 +5,10 @@ def get_parser(*args, **kwargs):
|
||||
|
||||
|
||||
def tesseract_consumer_declaration(sender, **kwargs):
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"parser": RasterisedDocumentParser,
|
||||
"weight": 0,
|
||||
"mime_types": {
|
||||
"application/pdf": ".pdf",
|
||||
|
@ -769,43 +769,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
self.assertEqual(params["sidecar"], "sidecar.txt")
|
||||
|
||||
with override_settings(OCR_CLEAN="none"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("clean", params)
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean"])
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean_final"])
|
||||
self.assertNotIn("clean", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["clean"])
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertTrue(params["deskew"])
|
||||
|
||||
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("deskew", params)
|
||||
|
||||
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("deskew", params)
|
||||
|
||||
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertIn("max_image_mpixels", params)
|
||||
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
|
||||
|
||||
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
params = parser.construct_ocrmypdf_parameters("", "", "", "")
|
||||
self.assertNotIn("max_image_mpixels", params)
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user