Updates to use a single configuration object for all settings
This commit is contained in:
@@ -12,9 +12,10 @@ from PIL import Image
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless.models import OcrSettings as OcrSettingModel
|
||||
from paperless_tesseract.setting_schema import OcrSetting
|
||||
from paperless_tesseract.setting_schema import get_ocr_settings
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
|
||||
class NoTextFoundException(Exception):
|
||||
@@ -33,8 +34,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
|
||||
logging_name = "paperless.parsing.tesseract"
|
||||
|
||||
def get_settings(self) -> OcrSetting:
|
||||
return get_ocr_settings()
|
||||
def get_settings(self) -> OcrConfig:
|
||||
return OcrConfig()
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
@@ -129,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if (
|
||||
sidecar_file is not None
|
||||
and os.path.isfile(sidecar_file)
|
||||
and self.parser_settings.mode != "redo"
|
||||
and self.settings.mode != "redo"
|
||||
):
|
||||
text = self.read_file_handle_unicode_errors(sidecar_file)
|
||||
|
||||
@@ -185,7 +186,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
safe_fallback=False,
|
||||
):
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.parser_settings, OcrSetting)
|
||||
assert isinstance(self.settings, OcrConfig)
|
||||
ocrmypdf_args = {
|
||||
"input_file": input_file,
|
||||
"output_file": output_file,
|
||||
@@ -193,55 +194,47 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# processes via the task library.
|
||||
"use_threads": True,
|
||||
"jobs": settings.THREADS_PER_WORKER,
|
||||
"language": self.parser_settings.language,
|
||||
"output_type": self.parser_settings.output_type,
|
||||
"language": self.settings.language,
|
||||
"output_type": self.settings.output_type,
|
||||
"progress_bar": False,
|
||||
}
|
||||
|
||||
if "pdfa" in ocrmypdf_args["output_type"]:
|
||||
ocrmypdf_args[
|
||||
"color_conversion_strategy"
|
||||
] = self.parser_settings.color_conversion_strategy
|
||||
] = self.settings.color_conversion_strategy
|
||||
|
||||
if (
|
||||
self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE
|
||||
or safe_fallback
|
||||
):
|
||||
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
|
||||
ocrmypdf_args["force_ocr"] = True
|
||||
elif self.parser_settings.mode in {
|
||||
OcrSettingModel.ModeChoices.SKIP,
|
||||
OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE,
|
||||
elif self.settings.mode in {
|
||||
ModeChoices.SKIP,
|
||||
ModeChoices.SKIP_NO_ARCHIVE,
|
||||
}:
|
||||
ocrmypdf_args["skip_text"] = True
|
||||
elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
|
||||
elif self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["redo_ocr"] = True
|
||||
else:
|
||||
raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}")
|
||||
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
|
||||
|
||||
if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN:
|
||||
if self.settings.clean == CleanChoices.CLEAN:
|
||||
ocrmypdf_args["clean"] = True
|
||||
elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL:
|
||||
if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
|
||||
elif self.settings.clean == CleanChoices.FINAL:
|
||||
if self.settings.mode == ModeChoices.REDO:
|
||||
ocrmypdf_args["clean"] = True
|
||||
else:
|
||||
# --clean-final is not compatible with --redo-ocr
|
||||
ocrmypdf_args["clean_final"] = True
|
||||
|
||||
if (
|
||||
self.parser_settings.deskew
|
||||
and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO
|
||||
):
|
||||
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
|
||||
# --deskew is not compatible with --redo-ocr
|
||||
ocrmypdf_args["deskew"] = True
|
||||
|
||||
if self.parser_settings.rotate:
|
||||
if self.settings.rotate:
|
||||
ocrmypdf_args["rotate_pages"] = True
|
||||
ocrmypdf_args[
|
||||
"rotate_pages_threshold"
|
||||
] = self.parser_settings.rotate_threshold
|
||||
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
|
||||
|
||||
if self.parser_settings.pages is not None:
|
||||
ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}"
|
||||
if self.settings.pages is not None:
|
||||
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
|
||||
else:
|
||||
# sidecar is incompatible with pages
|
||||
ocrmypdf_args["sidecar"] = sidecar_file
|
||||
@@ -260,8 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
if dpi:
|
||||
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
|
||||
ocrmypdf_args["image_dpi"] = dpi
|
||||
elif self.parser_settings.image_dpi is not None:
|
||||
ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi
|
||||
elif self.settings.image_dpi is not None:
|
||||
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
|
||||
elif a4_dpi:
|
||||
ocrmypdf_args["image_dpi"] = a4_dpi
|
||||
else:
|
||||
@@ -275,18 +268,18 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
|
||||
)
|
||||
|
||||
if self.parser_settings.user_args is not None:
|
||||
if self.settings.user_args is not None:
|
||||
try:
|
||||
ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args}
|
||||
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
|
||||
except Exception as e:
|
||||
self.log.warning(
|
||||
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
|
||||
f"they will not be used. Error: {e}",
|
||||
)
|
||||
|
||||
if self.parser_settings.max_image_pixel is not None:
|
||||
if self.settings.max_image_pixel is not None:
|
||||
# Convert pixels to mega-pixels and provide to ocrmypdf
|
||||
max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0
|
||||
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
|
||||
if max_pixels_mpixels > 0:
|
||||
self.log.debug(
|
||||
f"Calculated {max_pixels_mpixels} megapixels for OCR",
|
||||
@@ -318,11 +311,11 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# If the original has text, and the user doesn't want an archive,
|
||||
# we're done here
|
||||
skip_archive_for_text = (
|
||||
self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.parser_settings.skip_archive_file
|
||||
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
|
||||
or self.settings.skip_archive_file
|
||||
in {
|
||||
OcrSettingModel.ArchiveFileChoices.WITH_TEXT,
|
||||
OcrSettingModel.ArchiveFileChoices.ALWAYS,
|
||||
ArchiveFileChoices.WITH_TEXT,
|
||||
ArchiveFileChoices.ALWAYS,
|
||||
}
|
||||
)
|
||||
if skip_archive_for_text and original_has_text:
|
||||
@@ -353,10 +346,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.debug(f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
if (
|
||||
self.parser_settings.skip_archive_file
|
||||
!= OcrSettingModel.ArchiveFileChoices.ALWAYS
|
||||
):
|
||||
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
import dataclasses
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings as OcrSettingModel
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class OcrSetting:
    """Immutable bundle of the effective OCR settings.

    Instances are built by ``get_ocr_settings()``, which fills every field
    from the database configuration rows, falling back to the Django
    setting of the corresponding name.
    """

    pages: Optional[int]
    language: str
    output_type: str
    mode: str
    skip_archive_file: str
    image_dpi: Optional[int]
    clean: str
    deskew: bool
    rotate: bool
    rotate_threshold: float
    max_image_pixel: Optional[float]
    color_conversion_strategy: str
    # Either the database value or the decoded OCR_USER_ARGS JSON.
    user_args: Optional[dict[str, str]]


def _db_flag_or_default(db_value: Optional[bool], default: bool) -> bool:
    """Return ``db_value`` unless it is ``None``, else ``default``.

    A plain ``db_value or default`` cannot express "explicitly disabled":
    a stored ``False`` would always lose against a ``True`` default.
    """
    return default if db_value is None else db_value


def get_ocr_settings() -> OcrSetting:
    """Merge the database OCR configuration with the Django settings.

    Reads the first ``OcrSettingModel`` and ``CommonSettings`` rows (creating
    them when absent) and returns an ``OcrSetting`` in which each field is
    the database value when set, otherwise the matching ``settings.OCR_*``
    value.

    Returns:
        The frozen ``OcrSetting`` describing the effective configuration.
    """
    ocr_db_settings = OcrSettingModel.objects.all().first()
    # Workaround for a test where the migration hasn't run to create the
    # single model. create() already returns the saved row, so there is no
    # need to query for it again.
    if ocr_db_settings is None:
        ocr_db_settings = OcrSettingModel.objects.create()

    cmn_db_settings = CommonSettings.objects.all().first()
    if cmn_db_settings is None:
        cmn_db_settings = CommonSettings.objects.create()

    # Database user arguments win; otherwise decode the JSON text from the
    # Django setting, degrading to "no extra arguments" if it is malformed.
    user_args = None
    if ocr_db_settings.user_args:
        user_args = ocr_db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
        except json.JSONDecodeError:
            user_args = {}

    return OcrSetting(
        # Non-boolean values keep the original "falsy means unset" fallback,
        # so an empty string or 0 in the database defers to the setting.
        pages=ocr_db_settings.pages or settings.OCR_PAGES,
        language=ocr_db_settings.language or settings.OCR_LANGUAGE,
        output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
        mode=ocr_db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
            ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
        image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
        clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
        # Booleans need an explicit None check so that False stored in the
        # database can switch off a feature the Django setting enables.
        # NOTE(review): assumes these model fields are nullable, with None
        # meaning "not configured" - confirm against paperless.models.
        deskew=_db_flag_or_default(ocr_db_settings.deskew, settings.OCR_DESKEW),
        rotate=_db_flag_or_default(
            ocr_db_settings.rotate_pages,
            settings.OCR_ROTATE_PAGES,
        ),
        rotate_threshold=(
            ocr_db_settings.rotate_pages_threshold
            or settings.OCR_ROTATE_PAGES_THRESHOLD
        ),
        max_image_pixel=(
            ocr_db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS
        ),
        color_conversion_strategy=(
            ocr_db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
    )
|
||||
@@ -3,8 +3,11 @@ from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings
|
||||
from paperless.models import ApplicationConfiguration
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ColorConvertChoices
|
||||
from paperless.models import ModeChoices
|
||||
from paperless.models import OutputTypeChoices
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
@@ -21,7 +24,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_pages(self):
|
||||
with override_settings(OCR_PAGES=10):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.pages = 5
|
||||
instance.save()
|
||||
|
||||
@@ -30,7 +33,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_language(self):
|
||||
with override_settings(OCR_LANGUAGE="eng+deu"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.language = "fra+ita"
|
||||
instance.save()
|
||||
|
||||
@@ -39,8 +42,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_output_type(self):
|
||||
with override_settings(OCR_LANGUAGE="pdfa-3"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.output_type = OutputTypeChoices.PDF_A
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
@@ -48,8 +51,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_mode(self):
|
||||
with override_settings(OCR_MODE="redo"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance.mode = OcrSettings.ModeChoices.SKIP
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.mode = ModeChoices.SKIP
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
@@ -59,8 +62,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_clean(self):
|
||||
with override_settings(OCR_CLEAN="clean-final"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.unpaper_clean = CleanChoices.CLEAN
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
@@ -68,8 +71,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
self.assertNotIn("clean_final", params)
|
||||
|
||||
with override_settings(OCR_CLEAN="clean-final"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.unpaper_clean = CleanChoices.FINAL
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
@@ -78,7 +81,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_deskew(self):
|
||||
with override_settings(OCR_DESKEW=False):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.deskew = True
|
||||
instance.save()
|
||||
|
||||
@@ -87,7 +90,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_rotate(self):
|
||||
with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.rotate_pages = True
|
||||
instance.rotate_pages_threshold = 15.0
|
||||
instance.save()
|
||||
@@ -98,7 +101,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_max_pixels(self):
|
||||
with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.max_image_pixels = 1_000_000.0
|
||||
instance.save()
|
||||
|
||||
@@ -107,10 +110,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
|
||||
|
||||
def test_db_settings_ocr_color_convert(self):
|
||||
with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
|
||||
instance = OcrSettings.objects.all().first()
|
||||
instance.color_conversion_strategy = (
|
||||
OcrSettings.ColorConvertChoices.INDEPENDENT
|
||||
)
|
||||
instance = ApplicationConfiguration.objects.all().first()
|
||||
instance.color_conversion_strategy = ColorConvertChoices.INDEPENDENT
|
||||
instance.save()
|
||||
|
||||
params = self.get_params()
|
||||
|
||||
Reference in New Issue
Block a user