Updates to use a single configuration object for all settings

This commit is contained in:
Trenton H
2023-12-19 10:21:51 -08:00
parent a6c8550db5
commit 74e845974c
13 changed files with 242 additions and 249 deletions

View File

@@ -12,9 +12,10 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OcrSettings as OcrSettingModel
from paperless_tesseract.setting_schema import OcrSetting
from paperless_tesseract.setting_schema import get_ocr_settings
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
class NoTextFoundException(Exception):
@@ -33,8 +34,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrSetting:
    """
    Return the OCR settings used by this parser.

    Delegates to get_ocr_settings() and returns its OcrSetting result
    unchanged.
    """
    return get_ocr_settings()
def get_settings(self) -> OcrConfig:
    """
    Return the OCR settings used by this parser.

    A new OcrConfig is constructed (with no arguments) on every call and
    returned as-is.
    """
    return OcrConfig()
def extract_metadata(self, document_path, mime_type):
result = []
@@ -129,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
if (
sidecar_file is not None
and os.path.isfile(sidecar_file)
and self.parser_settings.mode != "redo"
and self.settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
@@ -185,7 +186,7 @@ class RasterisedDocumentParser(DocumentParser):
safe_fallback=False,
):
if TYPE_CHECKING:
assert isinstance(self.parser_settings, OcrSetting)
assert isinstance(self.settings, OcrConfig)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,
@@ -193,55 +194,47 @@ class RasterisedDocumentParser(DocumentParser):
# processes via the task library.
"use_threads": True,
"jobs": settings.THREADS_PER_WORKER,
"language": self.parser_settings.language,
"output_type": self.parser_settings.output_type,
"language": self.settings.language,
"output_type": self.settings.output_type,
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = self.parser_settings.color_conversion_strategy
] = self.settings.color_conversion_strategy
if (
self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE
or safe_fallback
):
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif self.parser_settings.mode in {
OcrSettingModel.ModeChoices.SKIP,
OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE,
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
else:
raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}")
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN:
if self.settings.clean == CleanChoices.CLEAN:
ocrmypdf_args["clean"] = True
elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL:
if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
elif self.settings.clean == CleanChoices.FINAL:
if self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["clean"] = True
else:
# --clean-final is not compatible with --redo-ocr
ocrmypdf_args["clean_final"] = True
if (
self.parser_settings.deskew
and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO
):
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
# --deskew is not compatible with --redo-ocr
ocrmypdf_args["deskew"] = True
if self.parser_settings.rotate:
if self.settings.rotate:
ocrmypdf_args["rotate_pages"] = True
ocrmypdf_args[
"rotate_pages_threshold"
] = self.parser_settings.rotate_threshold
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
if self.parser_settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}"
if self.settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
else:
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
@@ -260,8 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi
elif self.parser_settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi
elif self.settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
elif a4_dpi:
ocrmypdf_args["image_dpi"] = a4_dpi
else:
@@ -275,18 +268,18 @@ class RasterisedDocumentParser(DocumentParser):
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
)
if self.parser_settings.user_args is not None:
if self.settings.user_args is not None:
try:
ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args}
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
except Exception as e:
self.log.warning(
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}",
)
if self.parser_settings.max_image_pixel is not None:
if self.settings.max_image_pixel is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
if max_pixels_mpixels > 0:
self.log.debug(
f"Calculated {max_pixels_mpixels} megapixels for OCR",
@@ -318,11 +311,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE
or self.parser_settings.skip_archive_file
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
or self.settings.skip_archive_file
in {
OcrSettingModel.ArchiveFileChoices.WITH_TEXT,
OcrSettingModel.ArchiveFileChoices.ALWAYS,
ArchiveFileChoices.WITH_TEXT,
ArchiveFileChoices.ALWAYS,
}
)
if skip_archive_for_text and original_has_text:
@@ -353,10 +346,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if (
self.parser_settings.skip_archive_file
!= OcrSettingModel.ArchiveFileChoices.ALWAYS
):
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@@ -1,72 +0,0 @@
import dataclasses
import json
from typing import Optional
from django.conf import settings
from paperless.models import CommonSettings
from paperless.models import OcrSettings as OcrSettingModel
@dataclasses.dataclass(frozen=True)
class OcrSetting:
    """
    Immutable (frozen) container for the resolved OCR settings.

    Instances are built by get_ocr_settings(), which fills every field from
    the database models with a fallback to the Django ``settings.OCR_*``
    values.  Field order is part of the constructor signature.
    """

    # Page limit; None when no limit is configured
    pages: Optional[int]
    # OCR language value
    language: str
    # Output type; resolved from CommonSettings rather than the OCR model
    output_type: str
    # OCR mode value
    mode: str
    # Archive file skipping choice
    skip_archive_file: str
    # Image DPI; None when not configured
    image_dpi: Optional[int]
    # Cleaning choice; sourced from the model's ``unpaper_clean`` field
    clean: str
    # Whether deskew is enabled
    deskew: bool
    # Whether page rotation is enabled; sourced from ``rotate_pages``
    rotate: bool
    # Rotation threshold; sourced from ``rotate_pages_threshold``
    rotate_threshold: float
    # Maximum image pixel count; sourced from ``max_image_pixels``
    max_image_pixel: Optional[float]
    # Color conversion strategy value
    color_conversion_strategy: str
    # Extra user supplied arguments; None when neither the database nor
    # settings.OCR_USER_ARGS provides any
    user_args: Optional[dict[str, str]]
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings from the database and Django settings.

    The first OcrSettingModel row and the first CommonSettings row are read;
    if either table is empty a row is created with the model defaults and
    then read back.  For every option the database value takes precedence
    and the matching ``settings.OCR_*`` value is the fallback.

    For the boolean options (``deskew``, ``rotate``) the fallback is used
    only when the database value is None, so an explicit False stored in the
    database overrides a True coming from settings.  All other options keep
    the ``or`` fallback, i.e. an empty or zero database value is treated the
    same as "not set".

    User arguments come from the database when present there; otherwise
    ``settings.OCR_USER_ARGS`` is parsed as JSON, and invalid JSON results
    in an empty dict.

    Returns:
        A frozen OcrSetting holding the resolved values.
    """
    ocr_db_settings = OcrSettingModel.objects.all().first()
    # Workaround for a test where the migration hasn't run to create the single model
    if ocr_db_settings is None:
        OcrSettingModel.objects.create()
        ocr_db_settings = OcrSettingModel.objects.all().first()

    cmn_db_settings = CommonSettings.objects.all().first()
    if cmn_db_settings is None:
        CommonSettings.objects.create()
        cmn_db_settings = CommonSettings.objects.all().first()

    user_args = None
    if ocr_db_settings.user_args:
        user_args = ocr_db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
        except json.JSONDecodeError:
            user_args = {}

    # Booleans must not use ``or``: a stored False is a real choice and has
    # to win over a True from settings.  Only None means "not configured".
    # NOTE(review): assumes the model fields hold None when unset - confirm
    # that both boolean fields are nullable.
    deskew = ocr_db_settings.deskew
    if deskew is None:
        deskew = settings.OCR_DESKEW

    rotate = ocr_db_settings.rotate_pages
    if rotate is None:
        rotate = settings.OCR_ROTATE_PAGES

    return OcrSetting(
        pages=ocr_db_settings.pages or settings.OCR_PAGES,
        language=ocr_db_settings.language or settings.OCR_LANGUAGE,
        output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
        mode=ocr_db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
            ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
        image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
        clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
        deskew=deskew,
        rotate=rotate,
        rotate_threshold=(
            ocr_db_settings.rotate_pages_threshold
            or settings.OCR_ROTATE_PAGES_THRESHOLD
        ),
        max_image_pixel=ocr_db_settings.max_image_pixels
        or settings.OCR_MAX_IMAGE_PIXELS,
        color_conversion_strategy=(
            ocr_db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
    )

View File

@@ -3,8 +3,11 @@ from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.models import ApplicationConfiguration
from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
from paperless_tesseract.parsers import RasterisedDocumentParser
@@ -21,7 +24,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_pages(self):
with override_settings(OCR_PAGES=10):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.pages = 5
instance.save()
@@ -30,7 +33,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_language(self):
with override_settings(OCR_LANGUAGE="eng+deu"):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.language = "fra+ita"
instance.save()
@@ -39,8 +42,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_output_type(self):
with override_settings(OCR_LANGUAGE="pdfa-3"):
instance = OcrSettings.objects.all().first()
instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
instance = ApplicationConfiguration.objects.all().first()
instance.output_type = OutputTypeChoices.PDF_A
instance.save()
params = self.get_params()
@@ -48,8 +51,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_mode(self):
with override_settings(OCR_MODE="redo"):
instance = OcrSettings.objects.all().first()
instance.mode = OcrSettings.ModeChoices.SKIP
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.SKIP
instance.save()
params = self.get_params()
@@ -59,8 +62,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_clean(self):
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.CLEAN
instance.save()
params = self.get_params()
@@ -68,8 +71,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.FINAL
instance.save()
params = self.get_params()
@@ -78,7 +81,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_deskew(self):
with override_settings(OCR_DESKEW=False):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.deskew = True
instance.save()
@@ -87,7 +90,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_rotate(self):
with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.rotate_pages = True
instance.rotate_pages_threshold = 15.0
instance.save()
@@ -98,7 +101,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_max_pixels(self):
with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.max_image_pixels = 1_000_000.0
instance.save()
@@ -107,10 +110,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_color_convert(self):
with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
instance = OcrSettings.objects.all().first()
instance.color_conversion_strategy = (
OcrSettings.ColorConvertChoices.INDEPENDENT
)
instance = ApplicationConfiguration.objects.all().first()
instance.color_conversion_strategy = ColorConvertChoices.INDEPENDENT
instance.save()
params = self.get_params()