More fixes and work

This commit is contained in:
Trenton H 2023-12-14 14:41:27 -08:00
parent a7753b1f89
commit 25cc7ada6b
11 changed files with 193 additions and 142 deletions

View File

@ -41,6 +41,8 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@ -291,6 +293,14 @@ class Command(BaseCommand):
serializers.serialize("json", CustomField.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", CommonSettings.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", OcrSettings.objects.all()),
)
# These are treated specially and included in the per-document manifest
# if that setting is enabled. Otherwise, they are just exported to the bulk
# manifest

View File

@ -9,7 +9,7 @@ def _create_singleton(apps, schema_editor):
"""
Creates the first and only instance of the settings models
"""
for model_name in ["CommonSettings", "OcrSettings", "TextSettings", "TikaSettings"]:
for model_name in ["CommonSettings", "OcrSettings"]:
settings_model = apps.get_model("paperless", model_name)
settings_model.objects.create()
@ -189,60 +189,5 @@ class Migration(migrations.Migration):
"verbose_name": "ocr settings",
},
),
migrations.CreateModel(
name="TextSettings",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"thumbnail_font_name",
models.CharField(
blank=True,
max_length=64,
null=True,
verbose_name="Sets the output PDF type",
),
),
],
options={
"abstract": False,
},
),
migrations.CreateModel(
name="TikaSettings",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"tika_url",
models.URLField(blank=True, null=True, verbose_name="Tika URL"),
),
(
"gotenberg_url",
models.URLField(
blank=True,
null=True,
verbose_name="Gotenberg URL",
),
),
],
options={
"abstract": False,
},
),
migrations.RunPython(_create_singleton, migrations.RunPython.noop),
]

View File

@ -146,40 +146,3 @@ class OcrSettings(AbstractSingletonModel):
def __str__(self) -> str:
return "OcrSettings"
class TextSettings(AbstractSingletonModel):
    """
    Options consumed by the text parser
    """

    # NOTE(review): the verbose name below describes the output PDF type,
    # which does not match the field name and looks copy-pasted. It is kept
    # verbatim because the migration creating this model carries the same
    # text - confirm and correct both together.
    thumbnail_font_name = models.CharField(
        max_length=64,
        blank=True,
        null=True,
        verbose_name=_("Sets the output PDF type"),
    )
class TikaSettings(AbstractSingletonModel):
    """
    Options consumed by the Tika parser: the optional URLs of the Tika and
    Gotenberg services
    """

    # Both URLs may be left empty (blank in forms, NULL in the database)
    tika_url = models.URLField(
        blank=True,
        null=True,
        verbose_name=_("Tika URL"),
    )

    gotenberg_url = models.URLField(
        blank=True,
        null=True,
        verbose_name=_("Gotenberg URL"),
    )
class ConsumerSettings(AbstractSingletonModel):
    """
    Options for the document consumer
    """

    # Nullable boolean: NULL is a permitted value in addition to True/False
    delete_duplicates = models.BooleanField(
        null=True,
        verbose_name=_("Delete duplicate consumer files"),
    )

View File

@ -3,6 +3,7 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from rest_framework import serializers
from paperless.models import CommonSettings
from paperless.models import OcrSettings
@ -117,6 +118,12 @@ class ProfileSerializer(serializers.ModelSerializer):
)
class CommonSettingsSerializer(serializers.ModelSerializer):
    """
    Serializes every field of the CommonSettings model
    """

    class Meta:
        model = CommonSettings
        # Django REST framework only treats the special string "__all__" as
        # "every model field". A list such as ["all"] is resolved as a field
        # literally named "all", which the model is not expected to have.
        fields = "__all__"
class OcrSettingsSerializer(serializers.ModelSerializer):
class Meta:
model = OcrSettings

View File

@ -34,9 +34,11 @@ from documents.views import TasksViewSet
from documents.views import UiSettingsView
from documents.views import UnifiedSearchViewSet
from paperless.consumers import StatusConsumer
from paperless.views import CommonSettingsViewSet
from paperless.views import FaviconView
from paperless.views import GenerateAuthTokenView
from paperless.views import GroupViewSet
from paperless.views import OcrSettingsViewSet
from paperless.views import ProfileView
from paperless.views import UserViewSet
from paperless_mail.views import MailAccountTestView
@ -59,6 +61,8 @@ api_router.register(r"mail_rules", MailRuleViewSet)
api_router.register(r"share_links", ShareLinkViewSet)
api_router.register(r"consumption_templates", ConsumptionTemplateViewSet)
api_router.register(r"custom_fields", CustomFieldViewSet)
api_router.register(r"common_settings", CommonSettingsViewSet)
api_router.register(r"ocr_settings", OcrSettingsViewSet)
urlpatterns = [

View File

@ -18,7 +18,9 @@ from rest_framework.viewsets import ModelViewSet
from documents.permissions import PaperlessObjectPermissions
from paperless.filters import GroupFilterSet
from paperless.filters import UserFilterSet
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless.serialisers import CommonSettingsSerializer
from paperless.serialisers import GroupSerializer
from paperless.serialisers import OcrSettingsSerializer
from paperless.serialisers import ProfileSerializer
@ -164,6 +166,15 @@ class GenerateAuthTokenView(GenericAPIView):
)
class CommonSettingsViewSet(ModelViewSet):
    """
    API endpoints for the CommonSettings model, serialized through
    CommonSettingsSerializer
    """

    # NOTE(review): authentication is the only requirement configured here,
    # so presumably any logged-in user may use every action of this viewset -
    # confirm that is intended for global settings.
    permission_classes = (IsAuthenticated,)
    serializer_class = CommonSettingsSerializer

    model = CommonSettings
    queryset = CommonSettings.objects
class OcrSettingsViewSet(ModelViewSet):
model = OcrSettings

View File

@ -3,6 +3,7 @@ import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Optional
from django.conf import settings
@ -11,7 +12,7 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless_tesseract.models import OcrSettings as OcrSettingModel
from paperless.models import OcrSettings as OcrSettingModel
from paperless_tesseract.setting_schema import OcrSetting
from paperless_tesseract.setting_schema import get_ocr_settings
@ -71,7 +72,7 @@ class RasterisedDocumentParser(DocumentParser):
self.logging_group,
)
def is_image(self, mime_type):
def is_image(self, mime_type) -> bool:
return mime_type in [
"image/png",
"image/jpeg",
@ -81,7 +82,7 @@ class RasterisedDocumentParser(DocumentParser):
"image/webp",
]
def has_alpha(self, image):
def has_alpha(self, image) -> bool:
with Image.open(image) as im:
return im.mode in ("RGBA", "LA")
@ -96,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
],
)
def get_dpi(self, image):
def get_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
x, y = im.info["dpi"]
@ -105,7 +106,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
def calculate_a4_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
width, height = im.size
@ -118,7 +119,11 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
def extract_text(
self,
sidecar_file: Optional[Path],
pdf_file: Path,
) -> Optional[str]:
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
@ -179,7 +184,8 @@ class RasterisedDocumentParser(DocumentParser):
sidecar_file,
safe_fallback=False,
):
assert isinstance(self.parser_settings, OcrSetting)
if TYPE_CHECKING:
assert isinstance(self.parser_settings, OcrSetting)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,

View File

@ -4,7 +4,8 @@ from typing import Optional
from django.conf import settings
from paperless_tesseract.models import OcrSettings as OcrSettingModel
from paperless.models import CommonSettings
from paperless.models import OcrSettings as OcrSettingModel
@dataclasses.dataclass(frozen=True)
@ -25,16 +26,20 @@ class OcrSetting:
def get_ocr_settings() -> OcrSetting:
db_settings = OcrSettingModel.objects.all().first()
ocr_db_settings = OcrSettingModel.objects.all().first()
# Workaround for a test where the migration hasn't run to create the single model
if db_settings is None:
if ocr_db_settings is None:
OcrSettingModel.objects.create()
db_settings = OcrSettingModel.objects.all().first()
assert db_settings is not None
ocr_db_settings = OcrSettingModel.objects.all().first()
cmn_db_settings = CommonSettings.objects.all().first()
if cmn_db_settings is None:
CommonSettings.objects.create()
cmn_db_settings = CommonSettings.objects.all().first()
user_args = None
if db_settings.user_args:
user_args = db_settings.user_args
if ocr_db_settings.user_args:
user_args = ocr_db_settings.user_args
elif settings.OCR_USER_ARGS is not None:
try:
user_args = json.loads(settings.OCR_USER_ARGS)
@ -42,23 +47,25 @@ def get_ocr_settings() -> OcrSetting:
user_args = {}
return OcrSetting(
pages=db_settings.pages or settings.OCR_PAGES,
language=db_settings.language or settings.OCR_LANGUAGE,
output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
mode=db_settings.mode or settings.OCR_MODE,
pages=ocr_db_settings.pages or settings.OCR_PAGES,
language=ocr_db_settings.language or settings.OCR_LANGUAGE,
output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
mode=ocr_db_settings.mode or settings.OCR_MODE,
skip_archive_file=(
db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
),
image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
deskew=db_settings.deskew or settings.OCR_DESKEW,
rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
deskew=ocr_db_settings.deskew or settings.OCR_DESKEW,
rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
rotate_threshold=(
db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD
ocr_db_settings.rotate_pages_threshold
or settings.OCR_ROTATE_PAGES_THRESHOLD
),
max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
max_image_pixel=ocr_db_settings.max_image_pixels
or settings.OCR_MAX_IMAGE_PIXELS,
color_conversion_strategy=(
db_settings.color_conversion_strategy
ocr_db_settings.color_conversion_strategy
or settings.OCR_COLOR_CONVERSION_STRATEGY
),
user_args=user_args,

View File

@ -2,7 +2,6 @@ import os
import shutil
import tempfile
import uuid
from contextlib import AbstractContextManager
from pathlib import Path
from unittest import mock
@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text
image_to_string_calls = []
def fake_convert(input_file, output_file, **kwargs):
    """
    Writes every line of input_file, stripped of surrounding whitespace, to
    its own file. The target name for line number i (starting at 0) is
    output_file % i. Additional keyword arguments are accepted and ignored.
    """
    with open(input_file) as source:
        stripped_lines = [raw_line.strip() for raw_line in source.readlines()]

    for page_index, page_text in enumerate(stripped_lines):
        with open(output_file % page_index, "w") as page_file:
            page_file.write(page_text)
class FakeImageFile(AbstractContextManager):
    """
    Context manager stand-in for an opened image: entering it yields only the
    final path component of the file name it was created with
    """

    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        _, final_component = os.path.split(self.fname)
        return final_component

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets exceptions propagate
        return None
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

View File

@ -0,0 +1,120 @@
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Checks that values stored on the database settings models end up in the
    OCRmyPDF arguments, even though the matching Django setting is overridden
    with a different value
    """

    @staticmethod
    def get_params():
        """
        Builds the OCRmyPDF argument dictionary for a fixed PDF input, using
        a newly constructed parser
        """
        return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
            input_file="input.pdf",
            output_file="output.pdf",
            sidecar_file="sidecar.txt",
            mime_type="application/pdf",
            safe_fallback=False,
        )

    def test_db_settings_ocr_pages(self):
        with override_settings(OCR_PAGES=10):
            instance = OcrSettings.objects.all().first()
            instance.pages = 5
            instance.save()

            params = self.get_params()
        self.assertEqual(params["pages"], "1-5")

    def test_db_settings_ocr_language(self):
        with override_settings(OCR_LANGUAGE="eng+deu"):
            instance = OcrSettings.objects.all().first()
            instance.language = "fra+ita"
            instance.save()

            params = self.get_params()
        self.assertEqual(params["language"], "fra+ita")

    def test_db_settings_ocr_output_type(self):
        # The setting under test is OCR_OUTPUT_TYPE, and the database value
        # belongs to CommonSettings (which also defines OutputTypeChoices).
        # Overriding OCR_LANGUAGE and assigning the attribute on OcrSettings
        # would leave the database value untested.
        with override_settings(OCR_OUTPUT_TYPE="pdfa-3"):
            instance = CommonSettings.objects.all().first()
            instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
            instance.save()

            params = self.get_params()
        self.assertEqual(params["output_type"], "pdfa")

    def test_db_settings_ocr_mode(self):
        with override_settings(OCR_MODE="redo"):
            instance = OcrSettings.objects.all().first()
            instance.mode = OcrSettings.ModeChoices.SKIP
            instance.save()

            params = self.get_params()
        self.assertTrue(params["skip_text"])
        self.assertNotIn("redo_ocr", params)
        self.assertNotIn("force_ocr", params)

    def test_db_settings_ocr_clean(self):
        with override_settings(OCR_CLEAN="clean-final"):
            instance = OcrSettings.objects.all().first()
            instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean"])
        self.assertNotIn("clean_final", params)

        # The Django setting is deliberately different from the database
        # value here; with both set to "clean-final" the assertions could not
        # tell which source was used.
        with override_settings(OCR_CLEAN="clean"):
            instance = OcrSettings.objects.all().first()
            instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean_final"])
        self.assertNotIn("clean", params)

    def test_db_settings_ocr_deskew(self):
        with override_settings(OCR_DESKEW=False):
            instance = OcrSettings.objects.all().first()
            instance.deskew = True
            instance.save()

            params = self.get_params()
        self.assertTrue(params["deskew"])

    def test_db_settings_ocr_rotate(self):
        with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
            instance = OcrSettings.objects.all().first()
            instance.rotate_pages = True
            instance.rotate_pages_threshold = 15.0
            instance.save()

            params = self.get_params()
        self.assertTrue(params["rotate_pages"])
        self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)

    def test_db_settings_ocr_max_pixels(self):
        with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
            instance = OcrSettings.objects.all().first()
            instance.max_image_pixels = 1_000_000.0
            instance.save()

            params = self.get_params()
        self.assertAlmostEqual(params["max_image_mpixels"], 1.0)

    def test_db_settings_ocr_color_convert(self):
        with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
            instance = OcrSettings.objects.all().first()
            instance.color_conversion_strategy = (
                OcrSettings.ColorConvertChoices.INDEPENDENT
            )
            instance.save()

            params = self.get_params()
        self.assertEqual(
            params["color_conversion_strategy"],
            "UseDeviceIndependentColor",
        )

View File

@ -18,6 +18,7 @@ omit =
exclude_also =
if settings.AUDIT_LOG_ENABLED:
if AUDIT_LOG_ENABLED:
if TYPE_CHECKING:
[mypy]
plugins = mypy_django_plugin.main, mypy_drf_plugin.main, numpy.typing.mypy_plugin