Updates to use a single configuration object for all settings

This commit is contained in:
Trenton H 2023-12-19 10:21:51 -08:00
parent a6c8550db5
commit 74e845974c
13 changed files with 242 additions and 249 deletions

View File

@ -41,8 +41,7 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.models import ApplicationConfiguration
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@ -294,11 +293,7 @@ class Command(BaseCommand):
)
manifest += json.loads(
serializers.serialize("json", CommonSettings.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", OcrSettings.objects.all()),
serializers.serialize("json", ApplicationConfiguration.objects.all()),
)
# These are treated specially and included in the per-document manifest

View File

@ -320,7 +320,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
self.parser_settings = self.get_settings()
self.settings = self.get_settings()
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)

View File

@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 184)
self.assertEqual(len(manifest), 178)
# don't include consumer or AnonymousUser users
self.assertEqual(
@ -262,7 +262,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
self.assertEqual(GroupObjectPermission.objects.count(), 1)
self.assertEqual(UserObjectPermission.objects.count(), 1)
self.assertEqual(Permission.objects.count(), 132)
self.assertEqual(Permission.objects.count(), 128)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0)
@ -694,15 +694,15 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.media_dir, "documents"),
)
self.assertEqual(ContentType.objects.count(), 33)
self.assertEqual(Permission.objects.count(), 132)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 128)
manifest = self._do_export()
with paperless_environment():
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.permission", manifest))),
132,
128,
)
# add 1 more to db to show objects are not re-created by import
Permission.objects.create(
@ -710,7 +710,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
codename="test_perm",
content_type_id=1,
)
self.assertEqual(Permission.objects.count(), 133)
self.assertEqual(Permission.objects.count(), 129)
# will cause an import error
self.user.delete()
@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 33)
self.assertEqual(Permission.objects.count(), 133)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 129)

88
src/paperless/config.py Normal file
View File

@ -0,0 +1,88 @@
import dataclasses
import json
from typing import Optional
from django.conf import settings
from paperless.models import ApplicationConfiguration
@dataclasses.dataclass
class OutputTypeConfig:
    """
    Almost all parsers care about the chosen PDF output format.

    The value is resolved once, when the instance is created: the output type
    stored on the ApplicationConfiguration row wins when it is set, otherwise
    settings.OCR_OUTPUT_TYPE is used.
    """

    output_type: str = dataclasses.field(init=False)

    @staticmethod
    def _get_config_instance() -> ApplicationConfiguration:
        """
        Return the stored ApplicationConfiguration row, creating it if absent.
        """
        app_config = ApplicationConfiguration.objects.all().first()
        # Workaround for a test where the migration hasn't run to create the single model
        if app_config is None:
            # create() hands back the saved instance, so there is no need to
            # query the table a second time
            app_config = ApplicationConfiguration.objects.create()
        return app_config

    def __post_init__(self) -> None:
        """
        Populate output_type from the database, falling back to the settings file.
        """
        app_config = self._get_config_instance()
        # An unset (None) or blank value in the database defers to the settings file
        self.output_type = app_config.output_type or settings.OCR_OUTPUT_TYPE
@dataclasses.dataclass
class OcrConfig(OutputTypeConfig):
    """
    Specific settings for the Tesseract based parser. Options generally
    correspond almost directly to the OCRMyPDF options.

    Every field is resolved when the instance is created: a value stored on
    the ApplicationConfiguration row takes priority and the matching
    settings.OCR_* value is the fallback.
    """

    pages: Optional[int] = dataclasses.field(init=False)
    language: str = dataclasses.field(init=False)
    mode: str = dataclasses.field(init=False)
    skip_archive_file: str = dataclasses.field(init=False)
    image_dpi: Optional[int] = dataclasses.field(init=False)
    clean: str = dataclasses.field(init=False)
    deskew: bool = dataclasses.field(init=False)
    rotate: bool = dataclasses.field(init=False)
    rotate_threshold: float = dataclasses.field(init=False)
    max_image_pixel: Optional[float] = dataclasses.field(init=False)
    color_conversion_strategy: str = dataclasses.field(init=False)
    user_args: Optional[dict[str, str]] = dataclasses.field(init=False)

    def __post_init__(self) -> None:
        """
        Populate the OCR fields from the database, falling back to the settings file.
        """
        super().__post_init__()

        # The parent keeps the row it loaded in a local variable only, so it
        # has to be looked up again here.
        app_config = self._get_config_instance()

        self.pages = app_config.pages or settings.OCR_PAGES
        self.language = app_config.language or settings.OCR_LANGUAGE
        self.mode = app_config.mode or settings.OCR_MODE
        self.skip_archive_file = (
            app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        )
        self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
        self.clean = app_config.unpaper_clean or settings.OCR_CLEAN

        # Boolean options must not use "or": an explicit False stored in the
        # database would be discarded in favour of the settings file. Only an
        # unset value (taken to be None) defers to the settings file.
        self.deskew = (
            app_config.deskew if app_config.deskew is not None else settings.OCR_DESKEW
        )
        self.rotate = (
            app_config.rotate_pages
            if app_config.rotate_pages is not None
            else settings.OCR_ROTATE_PAGES
        )

        self.rotate_threshold = (
            app_config.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD
        )
        self.max_image_pixel = (
            app_config.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS
        )
        self.color_conversion_strategy = (
            app_config.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        )

        # User arguments: the database value is used as-is, while the
        # settings value is a JSON string which needs decoding first.
        user_args = None
        if app_config.user_args:
            user_args = app_config.user_args
        elif settings.OCR_USER_ARGS is not None:
            try:
                user_args = json.loads(settings.OCR_USER_ARGS)
            except json.JSONDecodeError:
                # Best effort: malformed JSON in the settings means no extra arguments
                user_args = {}
        self.user_args = user_args

View File

@ -1,4 +1,4 @@
# Generated by Django 4.2.7 on 2023-12-14 17:12
# Generated by Django 4.2.7 on 2023-12-19 17:51
import django.core.validators
from django.db import migrations
@ -7,10 +7,9 @@ from django.db import models
def _create_singleton(apps, schema_editor):
"""
Creates the first and only instance of the settings models
Creates the first and only instance of the configuration model
"""
for model_name in ["CommonSettings", "OcrSettings"]:
settings_model = apps.get_model("paperless", model_name)
settings_model = apps.get_model("paperless", "ApplicationConfiguration")
settings_model.objects.create()
@ -21,7 +20,7 @@ class Migration(migrations.Migration):
operations = [
migrations.CreateModel(
name="CommonSettings",
name="ApplicationConfiguration",
fields=[
(
"id",
@ -48,23 +47,6 @@ class Migration(migrations.Migration):
verbose_name="Sets the output PDF type",
),
),
],
options={
"abstract": False,
},
),
migrations.CreateModel(
name="OcrSettings",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"pages",
models.PositiveIntegerField(
@ -88,9 +70,9 @@ class Migration(migrations.Migration):
blank=True,
choices=[
("skip", "skip"),
("skip_noarchive", "skip_noarchive"),
("redo", "redo"),
("force", "force"),
("skip_noarchive", "skip_noarchive"),
],
max_length=16,
null=True,
@ -186,7 +168,7 @@ class Migration(migrations.Migration):
),
],
options={
"verbose_name": "ocr settings",
"verbose_name": "paperless application settings",
},
),
migrations.RunPython(_create_singleton, migrations.RunPython.noop),

View File

@ -17,18 +17,67 @@ class AbstractSingletonModel(models.Model):
super().save(*args, **kwargs)
class CommonSettings(AbstractSingletonModel):
class OutputTypeChoices(models.TextChoices):
"""
Settings which are common across more than 1 parser
Matches to --output-type
"""
class OutputTypeChoices(models.TextChoices):
PDF = ("pdf", _("pdf"))
PDF_A = ("pdfa", _("pdfa"))
PDF_A1 = ("pdfa-1", _("pdfa-1"))
PDF_A2 = ("pdfa-2", _("pdfa-2"))
PDF_A3 = ("pdfa-3", _("pdfa-3"))
class ModeChoices(models.TextChoices):
"""
Matches to --skip-text, --redo-ocr, --force-ocr
and our own custom setting
"""
SKIP = ("skip", _("skip"))
REDO = ("redo", _("redo"))
FORCE = ("force", _("force"))
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
class ArchiveFileChoices(models.TextChoices):
"""
Settings to control creation of an archive PDF file
"""
NEVER = ("never", _("never"))
WITH_TEXT = ("with_text", _("with_text"))
ALWAYS = ("always", _("always"))
class CleanChoices(models.TextChoices):
"""
Matches to --clean, --clean-final
"""
CLEAN = ("clean", _("clean"))
FINAL = ("clean-final", _("clean-final"))
NONE = ("none", _("none"))
class ColorConvertChoices(models.TextChoices):
"""
Refer to the Ghostscript documentation for valid options
"""
UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged"))
RGB = ("RGB", _("RGB"))
INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor"))
GRAY = ("Gray", _("Gray"))
CMYK = ("CMYK", _("CMYK"))
class ApplicationConfiguration(AbstractSingletonModel):
"""
Settings which are common across more than 1 parser
"""
output_type = models.CharField(
verbose_name=_("Sets the output PDF type"),
null=True,
@ -37,35 +86,10 @@ class CommonSettings(AbstractSingletonModel):
choices=OutputTypeChoices.choices,
)
class OcrSettings(AbstractSingletonModel):
"""
Settings for the Tesseract based OCR parser
"""
class ModeChoices(models.TextChoices):
SKIP = ("skip", _("skip"))
SKIP_NO_ARCHIVE = ("skip_noarchive", _("skip_noarchive"))
REDO = ("redo", _("redo"))
FORCE = ("force", _("force"))
class ArchiveFileChoices(models.TextChoices):
NEVER = ("never", _("never"))
WITH_TEXT = ("with_text", _("with_text"))
ALWAYS = ("always", _("always"))
class CleanChoices(models.TextChoices):
CLEAN = ("clean", _("clean"))
FINAL = ("clean-final", _("clean-final"))
NONE = ("none", _("none"))
class ColorConvertChoices(models.TextChoices):
UNCHANGED = ("LeaveColorUnchanged", _("LeaveColorUnchanged"))
RGB = ("RGB", _("RGB"))
INDEPENDENT = ("UseDeviceIndependentColor", _("UseDeviceIndependentColor"))
GRAY = ("Gray", _("Gray"))
CMYK = ("CMYK", _("CMYK"))
pages = models.PositiveIntegerField(
verbose_name=_("Do OCR from page 1 to this value"),
null=True,
@ -142,7 +166,7 @@ class OcrSettings(AbstractSingletonModel):
)
class Meta:
verbose_name = _("ocr settings")
verbose_name = _("paperless application settings")
def __str__(self) -> str:
return "OcrSettings"
return "ApplicationConfiguration"

View File

@ -3,8 +3,7 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from rest_framework import serializers
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.models import ApplicationConfiguration
class ObfuscatedUserPasswordField(serializers.Field):
@ -118,13 +117,7 @@ class ProfileSerializer(serializers.ModelSerializer):
)
class CommonSettingsSerializer(serializers.ModelSerializer):
class ApplicationConfigurationSerializer(serializers.ModelSerializer):
class Meta:
model = CommonSettings
fields = "__all__"
class OcrSettingsSerializer(serializers.ModelSerializer):
class Meta:
model = OcrSettings
model = ApplicationConfiguration
fields = "__all__"

View File

@ -34,11 +34,10 @@ from documents.views import TasksViewSet
from documents.views import UiSettingsView
from documents.views import UnifiedSearchViewSet
from paperless.consumers import StatusConsumer
from paperless.views import CommonSettingsViewSet
from paperless.views import ApplicationConfigurationViewSet
from paperless.views import FaviconView
from paperless.views import GenerateAuthTokenView
from paperless.views import GroupViewSet
from paperless.views import OcrSettingsViewSet
from paperless.views import ProfileView
from paperless.views import UserViewSet
from paperless_mail.views import MailAccountTestView
@ -61,8 +60,7 @@ api_router.register(r"mail_rules", MailRuleViewSet)
api_router.register(r"share_links", ShareLinkViewSet)
api_router.register(r"consumption_templates", ConsumptionTemplateViewSet)
api_router.register(r"custom_fields", CustomFieldViewSet)
api_router.register(r"common_settings", CommonSettingsViewSet)
api_router.register(r"ocr_settings", OcrSettingsViewSet)
api_router.register(r"config", ApplicationConfigurationViewSet)
urlpatterns = [

View File

@ -18,11 +18,9 @@ from rest_framework.viewsets import ModelViewSet
from documents.permissions import PaperlessObjectPermissions
from paperless.filters import GroupFilterSet
from paperless.filters import UserFilterSet
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.serialisers import CommonSettingsSerializer
from paperless.models import ApplicationConfiguration
from paperless.serialisers import ApplicationConfigurationSerializer
from paperless.serialisers import GroupSerializer
from paperless.serialisers import OcrSettingsSerializer
from paperless.serialisers import ProfileSerializer
from paperless.serialisers import UserSerializer
@ -166,19 +164,10 @@ class GenerateAuthTokenView(GenericAPIView):
)
class CommonSettingsViewSet(ModelViewSet):
model = CommonSettings
class ApplicationConfigurationViewSet(ModelViewSet):
model = ApplicationConfiguration
queryset = CommonSettings.objects
queryset = ApplicationConfiguration.objects
serializer_class = CommonSettingsSerializer
permission_classes = (IsAuthenticated,)
class OcrSettingsViewSet(ModelViewSet):
model = OcrSettings
queryset = OcrSettings.objects
serializer_class = OcrSettingsSerializer
serializer_class = ApplicationConfigurationSerializer
permission_classes = (IsAuthenticated,)

View File

@ -12,9 +12,10 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.models import OcrSettings as OcrSettingModel
from paperless_tesseract.setting_schema import OcrSetting
from paperless_tesseract.setting_schema import get_ocr_settings
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
class NoTextFoundException(Exception):
@ -33,8 +34,8 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrSetting:
return get_ocr_settings()
def get_settings(self) -> OcrConfig:
return OcrConfig()
def extract_metadata(self, document_path, mime_type):
result = []
@ -129,7 +130,7 @@ class RasterisedDocumentParser(DocumentParser):
if (
sidecar_file is not None
and os.path.isfile(sidecar_file)
and self.parser_settings.mode != "redo"
and self.settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
@ -185,7 +186,7 @@ class RasterisedDocumentParser(DocumentParser):
safe_fallback=False,
):
if TYPE_CHECKING:
assert isinstance(self.parser_settings, OcrSetting)
assert isinstance(self.settings, OcrConfig)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,
@ -193,55 +194,47 @@ class RasterisedDocumentParser(DocumentParser):
# processes via the task library.
"use_threads": True,
"jobs": settings.THREADS_PER_WORKER,
"language": self.parser_settings.language,
"output_type": self.parser_settings.output_type,
"language": self.settings.language,
"output_type": self.settings.output_type,
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = self.parser_settings.color_conversion_strategy
] = self.settings.color_conversion_strategy
if (
self.parser_settings.mode == OcrSettingModel.ModeChoices.FORCE
or safe_fallback
):
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif self.parser_settings.mode in {
OcrSettingModel.ModeChoices.SKIP,
OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE,
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
else:
raise ParseError(f"Invalid ocr mode: {self.parser_settings.mode}")
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
if self.parser_settings.clean == OcrSettingModel.CleanChoices.CLEAN:
if self.settings.clean == CleanChoices.CLEAN:
ocrmypdf_args["clean"] = True
elif self.parser_settings.clean == OcrSettingModel.CleanChoices.FINAL:
if self.parser_settings.mode == OcrSettingModel.ModeChoices.REDO:
elif self.settings.clean == CleanChoices.FINAL:
if self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["clean"] = True
else:
# --clean-final is not compatible with --redo-ocr
ocrmypdf_args["clean_final"] = True
if (
self.parser_settings.deskew
and self.parser_settings.mode != OcrSettingModel.ModeChoices.REDO
):
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
# --deskew is not compatible with --redo-ocr
ocrmypdf_args["deskew"] = True
if self.parser_settings.rotate:
if self.settings.rotate:
ocrmypdf_args["rotate_pages"] = True
ocrmypdf_args[
"rotate_pages_threshold"
] = self.parser_settings.rotate_threshold
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
if self.parser_settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.parser_settings.pages}"
if self.settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
else:
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
@ -260,8 +253,8 @@ class RasterisedDocumentParser(DocumentParser):
if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi
elif self.parser_settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.parser_settings.image_dpi
elif self.settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
elif a4_dpi:
ocrmypdf_args["image_dpi"] = a4_dpi
else:
@ -275,18 +268,18 @@ class RasterisedDocumentParser(DocumentParser):
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
)
if self.parser_settings.user_args is not None:
if self.settings.user_args is not None:
try:
ocrmypdf_args = {**ocrmypdf_args, **self.parser_settings.user_args}
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
except Exception as e:
self.log.warning(
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}",
)
if self.parser_settings.max_image_pixel is not None:
if self.settings.max_image_pixel is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = self.parser_settings.max_image_pixel / 1_000_000.0
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
if max_pixels_mpixels > 0:
self.log.debug(
f"Calculated {max_pixels_mpixels} megapixels for OCR",
@ -318,11 +311,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
self.parser_settings.mode == OcrSettingModel.ModeChoices.SKIP_NO_ARCHIVE
or self.parser_settings.skip_archive_file
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
or self.settings.skip_archive_file
in {
OcrSettingModel.ArchiveFileChoices.WITH_TEXT,
OcrSettingModel.ArchiveFileChoices.ALWAYS,
ArchiveFileChoices.WITH_TEXT,
ArchiveFileChoices.ALWAYS,
}
)
if skip_archive_for_text and original_has_text:
@ -353,10 +346,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if (
self.parser_settings.skip_archive_file
!= OcrSettingModel.ArchiveFileChoices.ALWAYS
):
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -1,72 +0,0 @@
import dataclasses
import json
from typing import Optional
from django.conf import settings
from paperless.models import CommonSettings
from paperless.models import OcrSettings as OcrSettingModel
@dataclasses.dataclass(frozen=True)
class OcrSetting:
    """
    Immutable bundle of the resolved settings for the Tesseract based parser.
    """

    # Page range and recognition language
    pages: Optional[int]
    language: str

    # Output format and processing mode
    output_type: str
    mode: str
    skip_archive_file: str

    # Image pre-processing
    image_dpi: Optional[int]
    clean: str
    deskew: bool
    rotate: bool
    rotate_threshold: float
    max_image_pixel: Optional[float]
    color_conversion_strategy: str

    # Additional arguments supplied by the user, if any
    user_args: Optional[dict[str, str]]
def get_ocr_settings() -> OcrSetting:
    """
    Build the OCR settings, preferring database values over the settings file.

    Each option is read from the stored OCR / common settings rows; when a
    value is not set there, the matching settings.OCR_* value is used.
    """
    ocr_db_settings = OcrSettingModel.objects.all().first()
    # Workaround for a test where the migration hasn't run to create the single model
    if ocr_db_settings is None:
        # create() returns the saved instance, no second query required
        ocr_db_settings = OcrSettingModel.objects.create()

    cmn_db_settings = CommonSettings.objects.all().first()
    if cmn_db_settings is None:
        cmn_db_settings = CommonSettings.objects.create()

    # The database value is used as-is, while the settings value is a JSON
    # string which needs decoding first.
    user_args = None
    if ocr_db_settings.user_args:
        user_args = ocr_db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
        except json.JSONDecodeError:
            # Best effort: malformed JSON in the settings means no extra arguments
            user_args = {}

    # Boolean options must not use "or": an explicit False stored in the
    # database would be discarded in favour of the settings file. Only an
    # unset value (taken to be None) defers to the settings file.
    deskew = ocr_db_settings.deskew
    if deskew is None:
        deskew = settings.OCR_DESKEW
    rotate = ocr_db_settings.rotate_pages
    if rotate is None:
        rotate = settings.OCR_ROTATE_PAGES

    return OcrSetting(
        pages=ocr_db_settings.pages or settings.OCR_PAGES,
        language=ocr_db_settings.language or settings.OCR_LANGUAGE,
        output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
        mode=ocr_db_settings.mode or settings.OCR_MODE,
        skip_archive_file=(
            ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        ),
        image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
        clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
        deskew=deskew,
        rotate=rotate,
        rotate_threshold=(
            ocr_db_settings.rotate_pages_threshold
            or settings.OCR_ROTATE_PAGES_THRESHOLD
        ),
        max_image_pixel=ocr_db_settings.max_image_pixels
        or settings.OCR_MAX_IMAGE_PIXELS,
        color_conversion_strategy=(
            ocr_db_settings.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        ),
        user_args=user_args,
    )

View File

@ -3,8 +3,11 @@ from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.models import ApplicationConfiguration
from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
from paperless_tesseract.parsers import RasterisedDocumentParser
@ -21,7 +24,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_pages(self):
with override_settings(OCR_PAGES=10):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.pages = 5
instance.save()
@ -30,7 +33,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_language(self):
with override_settings(OCR_LANGUAGE="eng+deu"):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.language = "fra+ita"
instance.save()
@ -39,8 +42,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_output_type(self):
with override_settings(OCR_LANGUAGE="pdfa-3"):
instance = OcrSettings.objects.all().first()
instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
instance = ApplicationConfiguration.objects.all().first()
instance.output_type = OutputTypeChoices.PDF_A
instance.save()
params = self.get_params()
@ -48,8 +51,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_mode(self):
with override_settings(OCR_MODE="redo"):
instance = OcrSettings.objects.all().first()
instance.mode = OcrSettings.ModeChoices.SKIP
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.SKIP
instance.save()
params = self.get_params()
@ -59,8 +62,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_clean(self):
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.CLEAN
instance.save()
params = self.get_params()
@ -68,8 +71,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.FINAL
instance.save()
params = self.get_params()
@ -78,7 +81,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_deskew(self):
with override_settings(OCR_DESKEW=False):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.deskew = True
instance.save()
@ -87,7 +90,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_rotate(self):
with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.rotate_pages = True
instance.rotate_pages_threshold = 15.0
instance.save()
@ -98,7 +101,7 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_max_pixels(self):
with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
instance = OcrSettings.objects.all().first()
instance = ApplicationConfiguration.objects.all().first()
instance.max_image_pixels = 1_000_000.0
instance.save()
@ -107,10 +110,8 @@ class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCas
def test_db_settings_ocr_color_convert(self):
with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
instance = OcrSettings.objects.all().first()
instance.color_conversion_strategy = (
OcrSettings.ColorConvertChoices.INDEPENDENT
)
instance = ApplicationConfiguration.objects.all().first()
instance.color_conversion_strategy = ColorConvertChoices.INDEPENDENT
instance.save()
params = self.get_params()

View File

@ -10,6 +10,8 @@ from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
class TikaDocumentParser(DocumentParser):
@ -91,11 +93,14 @@ class TikaDocumentParser(DocumentParser):
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.libre_office.to_pdf() as route:
# Set the output format of the resulting PDF
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
route.pdf_format(PdfAFormat.A1a)
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
route.pdf_format(PdfAFormat.A3b)
route.convert(document_path)
@ -112,8 +117,8 @@ class TikaDocumentParser(DocumentParser):
f"Error while converting document to PDF: {err}",
) from err
def get_settings(self):
def get_settings(self) -> OutputTypeConfig:
"""
This parser does not implement additional settings yet
"""
return None
return OutputTypeConfig()