More fixes and work
This commit is contained in:
parent
a7753b1f89
commit
25cc7ada6b
@ -41,6 +41,8 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from paperless import version
|
||||
from paperless.db import GnuPG
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings
|
||||
from paperless_mail.models import MailAccount
|
||||
from paperless_mail.models import MailRule
|
||||
|
||||
@ -291,6 +293,14 @@ class Command(BaseCommand):
|
||||
serializers.serialize("json", CustomField.objects.all()),
|
||||
)
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", CommonSettings.objects.all()),
|
||||
)
|
||||
|
||||
manifest += json.loads(
|
||||
serializers.serialize("json", OcrSettings.objects.all()),
|
||||
)
|
||||
|
||||
# These are treated specially and included in the per-document manifest
|
||||
# if that setting is enabled. Otherwise, they are just exported to the bulk
|
||||
# manifest
|
||||
|
@ -9,7 +9,7 @@ def _create_singleton(apps, schema_editor):
|
||||
"""
|
||||
Creates the first and only instance of the settings models
|
||||
"""
|
||||
for model_name in ["CommonSettings", "OcrSettings", "TextSettings", "TikaSettings"]:
|
||||
for model_name in ["CommonSettings", "OcrSettings"]:
|
||||
settings_model = apps.get_model("paperless", model_name)
|
||||
settings_model.objects.create()
|
||||
|
||||
@ -189,60 +189,5 @@ class Migration(migrations.Migration):
|
||||
"verbose_name": "ocr settings",
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name="TextSettings",
|
||||
fields=[
|
||||
(
|
||||
"id",
|
||||
models.AutoField(
|
||||
auto_created=True,
|
||||
primary_key=True,
|
||||
serialize=False,
|
||||
verbose_name="ID",
|
||||
),
|
||||
),
|
||||
(
|
||||
"thumbnail_font_name",
|
||||
models.CharField(
|
||||
blank=True,
|
||||
max_length=64,
|
||||
null=True,
|
||||
verbose_name="Sets the output PDF type",
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
"abstract": False,
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name="TikaSettings",
|
||||
fields=[
|
||||
(
|
||||
"id",
|
||||
models.AutoField(
|
||||
auto_created=True,
|
||||
primary_key=True,
|
||||
serialize=False,
|
||||
verbose_name="ID",
|
||||
),
|
||||
),
|
||||
(
|
||||
"tika_url",
|
||||
models.URLField(blank=True, null=True, verbose_name="Tika URL"),
|
||||
),
|
||||
(
|
||||
"gotenberg_url",
|
||||
models.URLField(
|
||||
blank=True,
|
||||
null=True,
|
||||
verbose_name="Gotenberg URL",
|
||||
),
|
||||
),
|
||||
],
|
||||
options={
|
||||
"abstract": False,
|
||||
},
|
||||
),
|
||||
migrations.RunPython(_create_singleton, migrations.RunPython.noop),
|
||||
]
|
||||
|
@ -146,40 +146,3 @@ class OcrSettings(AbstractSingletonModel):
|
||||
|
||||
def __str__(self) -> str:
|
||||
return "OcrSettings"
|
||||
|
||||
|
||||
class TextSettings(AbstractSingletonModel):
    """
    Database-backed settings for the text parser
    """

    # NOTE(review): the verbose name below describes the output PDF type,
    # which does not match a field called thumbnail_font_name. It looks
    # copy-pasted from another field; confirm the intended label.
    thumbnail_font_name = models.CharField(
        max_length=64,
        blank=True,
        null=True,
        verbose_name=_("Sets the output PDF type"),
    )
|
||||
|
||||
|
||||
class TikaSettings(AbstractSingletonModel):
    """
    Database-backed settings for the Tika parser
    """

    # Both URLs are optional: they may be left blank in forms and are stored
    # as NULL when unset.
    tika_url = models.URLField(
        blank=True,
        null=True,
        verbose_name=_("Tika URL"),
    )

    gotenberg_url = models.URLField(
        blank=True,
        null=True,
        verbose_name=_("Gotenberg URL"),
    )
|
||||
|
||||
|
||||
class ConsumerSettings(AbstractSingletonModel):
    """
    Database-backed settings for the consumer
    """

    # Nullable boolean, so the column can hold NULL in addition to
    # True / False.
    delete_duplicates = models.BooleanField(
        null=True,
        verbose_name=_("Delete duplicate consumer files"),
    )
|
||||
|
@ -3,6 +3,7 @@ from django.contrib.auth.models import Permission
|
||||
from django.contrib.auth.models import User
|
||||
from rest_framework import serializers
|
||||
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings
|
||||
|
||||
|
||||
@ -117,6 +118,12 @@ class ProfileSerializer(serializers.ModelSerializer):
|
||||
)
|
||||
|
||||
|
||||
class CommonSettingsSerializer(serializers.ModelSerializer):
    """
    Serializer for the CommonSettings model which exposes every model field
    """

    class Meta:
        model = CommonSettings
        # The special string "__all__" is what asks the ModelSerializer to
        # include every model field. A list such as ["all"] is instead read
        # as a request for a single field literally named "all".
        fields = "__all__"
|
||||
|
||||
|
||||
class OcrSettingsSerializer(serializers.ModelSerializer):
|
||||
class Meta:
|
||||
model = OcrSettings
|
||||
|
@ -34,9 +34,11 @@ from documents.views import TasksViewSet
|
||||
from documents.views import UiSettingsView
|
||||
from documents.views import UnifiedSearchViewSet
|
||||
from paperless.consumers import StatusConsumer
|
||||
from paperless.views import CommonSettingsViewSet
|
||||
from paperless.views import FaviconView
|
||||
from paperless.views import GenerateAuthTokenView
|
||||
from paperless.views import GroupViewSet
|
||||
from paperless.views import OcrSettingsViewSet
|
||||
from paperless.views import ProfileView
|
||||
from paperless.views import UserViewSet
|
||||
from paperless_mail.views import MailAccountTestView
|
||||
@ -59,6 +61,8 @@ api_router.register(r"mail_rules", MailRuleViewSet)
|
||||
api_router.register(r"share_links", ShareLinkViewSet)
|
||||
api_router.register(r"consumption_templates", ConsumptionTemplateViewSet)
|
||||
api_router.register(r"custom_fields", CustomFieldViewSet)
|
||||
api_router.register(r"common_settings", CommonSettingsViewSet)
|
||||
api_router.register(r"ocr_settings", OcrSettingsViewSet)
|
||||
|
||||
|
||||
urlpatterns = [
|
||||
|
@ -18,7 +18,9 @@ from rest_framework.viewsets import ModelViewSet
|
||||
from documents.permissions import PaperlessObjectPermissions
|
||||
from paperless.filters import GroupFilterSet
|
||||
from paperless.filters import UserFilterSet
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings
|
||||
from paperless.serialisers import CommonSettingsSerializer
|
||||
from paperless.serialisers import GroupSerializer
|
||||
from paperless.serialisers import OcrSettingsSerializer
|
||||
from paperless.serialisers import ProfileSerializer
|
||||
@ -164,6 +166,15 @@ class GenerateAuthTokenView(GenericAPIView):
|
||||
)
|
||||
|
||||
|
||||
class CommonSettingsViewSet(ModelViewSet):
    """
    Model viewset for the CommonSettings model.

    Access is limited to authenticated users.
    """

    model = CommonSettings

    # Hand the view an actual QuerySet instead of the bare manager: a manager
    # is neither iterable nor sliceable, which the list/pagination code paths
    # expect of the view's queryset.
    queryset = CommonSettings.objects.all()

    serializer_class = CommonSettingsSerializer
    permission_classes = (IsAuthenticated,)
|
||||
|
||||
|
||||
class OcrSettingsViewSet(ModelViewSet):
|
||||
model = OcrSettings
|
||||
|
||||
|
@ -3,6 +3,7 @@ import re
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
@ -11,7 +12,7 @@ from PIL import Image
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from paperless_tesseract.models import OcrSettings as OcrSettingModel
|
||||
from paperless.models import OcrSettings as OcrSettingModel
|
||||
from paperless_tesseract.setting_schema import OcrSetting
|
||||
from paperless_tesseract.setting_schema import get_ocr_settings
|
||||
|
||||
@ -71,7 +72,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def is_image(self, mime_type):
|
||||
def is_image(self, mime_type) -> bool:
|
||||
return mime_type in [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
@ -81,7 +82,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"image/webp",
|
||||
]
|
||||
|
||||
def has_alpha(self, image):
|
||||
def has_alpha(self, image) -> bool:
|
||||
with Image.open(image) as im:
|
||||
return im.mode in ("RGBA", "LA")
|
||||
|
||||
@ -96,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
],
|
||||
)
|
||||
|
||||
def get_dpi(self, image):
|
||||
def get_dpi(self, image) -> Optional[int]:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
x, y = im.info["dpi"]
|
||||
@ -105,7 +106,7 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.warning(f"Error while getting DPI from image {image}: {e}")
|
||||
return None
|
||||
|
||||
def calculate_a4_dpi(self, image):
|
||||
def calculate_a4_dpi(self, image) -> Optional[int]:
|
||||
try:
|
||||
with Image.open(image) as im:
|
||||
width, height = im.size
|
||||
@ -118,7 +119,11 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
|
||||
return None
|
||||
|
||||
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
|
||||
def extract_text(
|
||||
self,
|
||||
sidecar_file: Optional[Path],
|
||||
pdf_file: Path,
|
||||
) -> Optional[str]:
|
||||
# When re-doing OCR, the sidecar contains ONLY the new text, not
|
||||
# the whole text, so do not utilize it in that case
|
||||
if (
|
||||
@ -179,7 +184,8 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
sidecar_file,
|
||||
safe_fallback=False,
|
||||
):
|
||||
assert isinstance(self.parser_settings, OcrSetting)
|
||||
if TYPE_CHECKING:
|
||||
assert isinstance(self.parser_settings, OcrSetting)
|
||||
ocrmypdf_args = {
|
||||
"input_file": input_file,
|
||||
"output_file": output_file,
|
||||
|
@ -4,7 +4,8 @@ from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.models import OcrSettings as OcrSettingModel
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings as OcrSettingModel
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
|
||||
@ -25,16 +26,20 @@ class OcrSetting:
|
||||
|
||||
|
||||
def get_ocr_settings() -> OcrSetting:
|
||||
db_settings = OcrSettingModel.objects.all().first()
|
||||
ocr_db_settings = OcrSettingModel.objects.all().first()
|
||||
# Workaround for a test where the migration hasn't run to create the single model
|
||||
if db_settings is None:
|
||||
if ocr_db_settings is None:
|
||||
OcrSettingModel.objects.create()
|
||||
db_settings = OcrSettingModel.objects.all().first()
|
||||
assert db_settings is not None
|
||||
ocr_db_settings = OcrSettingModel.objects.all().first()
|
||||
|
||||
cmn_db_settings = CommonSettings.objects.all().first()
|
||||
if cmn_db_settings is None:
|
||||
CommonSettings.objects.create()
|
||||
cmn_db_settings = CommonSettings.objects.all().first()
|
||||
|
||||
user_args = None
|
||||
if db_settings.user_args:
|
||||
user_args = db_settings.user_args
|
||||
if ocr_db_settings.user_args:
|
||||
user_args = ocr_db_settings.user_args
|
||||
elif settings.OCR_USER_ARGS is not None:
|
||||
try:
|
||||
user_args = json.loads(settings.OCR_USER_ARGS)
|
||||
@ -42,23 +47,25 @@ def get_ocr_settings() -> OcrSetting:
|
||||
user_args = {}
|
||||
|
||||
return OcrSetting(
|
||||
pages=db_settings.pages or settings.OCR_PAGES,
|
||||
language=db_settings.language or settings.OCR_LANGUAGE,
|
||||
output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
|
||||
mode=db_settings.mode or settings.OCR_MODE,
|
||||
pages=ocr_db_settings.pages or settings.OCR_PAGES,
|
||||
language=ocr_db_settings.language or settings.OCR_LANGUAGE,
|
||||
output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
|
||||
mode=ocr_db_settings.mode or settings.OCR_MODE,
|
||||
skip_archive_file=(
|
||||
db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||
ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
|
||||
),
|
||||
image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
|
||||
clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
|
||||
deskew=db_settings.deskew or settings.OCR_DESKEW,
|
||||
rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
|
||||
image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
|
||||
clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
|
||||
deskew=ocr_db_settings.deskew or settings.OCR_DESKEW,
|
||||
rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
|
||||
rotate_threshold=(
|
||||
db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD
|
||||
ocr_db_settings.rotate_pages_threshold
|
||||
or settings.OCR_ROTATE_PAGES_THRESHOLD
|
||||
),
|
||||
max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
|
||||
max_image_pixel=ocr_db_settings.max_image_pixels
|
||||
or settings.OCR_MAX_IMAGE_PIXELS,
|
||||
color_conversion_strategy=(
|
||||
db_settings.color_conversion_strategy
|
||||
ocr_db_settings.color_conversion_strategy
|
||||
or settings.OCR_COLOR_CONVERSION_STRATEGY
|
||||
),
|
||||
user_args=user_args,
|
||||
|
@ -2,7 +2,6 @@ import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import uuid
|
||||
from contextlib import AbstractContextManager
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
from paperless_tesseract.parsers import post_process_text
|
||||
|
||||
image_to_string_calls = []
|
||||
|
||||
|
||||
def fake_convert(input_file, output_file, **kwargs):
    """
    Fake conversion: writes each line of the input file to its own output file.

    The input is read as text in full first. Line number ``i`` (zero based)
    is then stripped of surrounding whitespace and written to the path
    produced by ``output_file % i``, so ``output_file`` must contain a
    %-style placeholder for the index. Any extra keyword arguments are
    accepted and ignored.
    """
    with open(input_file) as source:
        content = source.readlines()

    page_index = 0
    for raw_line in content:
        target_path = output_file % page_index
        with open(target_path, "w") as target:
            target.write(raw_line.strip())
        page_index += 1
|
||||
|
||||
|
||||
class FakeImageFile(AbstractContextManager):
    """
    Fake context manager wrapping a file name.

    Entering the context yields only the base name of the path given at
    construction. Leaving it does nothing and never suppresses an exception.
    """

    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        stored_path = self.fname
        return os.path.basename(stored_path)

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; a None result lets any exception propagate
        return None
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
120
src/paperless_tesseract/tests/test_parser_custom_settings.py
Normal file
120
src/paperless_tesseract/tests/test_parser_custom_settings.py
Normal file
@ -0,0 +1,120 @@
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless.models import CommonSettings
|
||||
from paperless.models import OcrSettings
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Verifies that values stored in the settings models take precedence over
    the matching Django settings when the OCRmyPDF arguments are constructed.

    Each test overrides the Django setting with one value, stores a different
    value in the database and then checks the constructed parameters reflect
    the database value.
    """

    @staticmethod
    def get_params():
        """
        Builds the OCRmyPDF arguments for a fixed set of dummy file names
        """
        return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
            input_file="input.pdf",
            output_file="output.pdf",
            sidecar_file="sidecar.txt",
            mime_type="application/pdf",
            safe_fallback=False,
        )

    def test_db_settings_ocr_pages(self):
        """
        The page count from the database is used over OCR_PAGES
        """
        with override_settings(OCR_PAGES=10):
            instance = OcrSettings.objects.all().first()
            instance.pages = 5
            instance.save()

            params = self.get_params()
        self.assertEqual(params["pages"], "1-5")

    def test_db_settings_ocr_language(self):
        """
        The language from the database is used over OCR_LANGUAGE
        """
        with override_settings(OCR_LANGUAGE="eng+deu"):
            instance = OcrSettings.objects.all().first()
            instance.language = "fra+ita"
            instance.save()

            params = self.get_params()
        self.assertEqual(params["language"], "fra+ita")

    def test_db_settings_ocr_output_type(self):
        """
        The output type from the database is used over OCR_OUTPUT_TYPE
        """
        # The output type is read from CommonSettings, not OcrSettings, and
        # the competing Django setting is OCR_OUTPUT_TYPE. Overriding
        # OCR_LANGUAGE and saving the value on the OCR settings row would
        # leave the precedence between the two sources untested.
        with override_settings(OCR_OUTPUT_TYPE="pdfa-3"):
            instance = CommonSettings.objects.all().first()
            instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
            instance.save()

            params = self.get_params()
        self.assertEqual(params["output_type"], "pdfa")

    def test_db_settings_ocr_mode(self):
        """
        The OCR mode from the database is used over OCR_MODE
        """
        with override_settings(OCR_MODE="redo"):
            instance = OcrSettings.objects.all().first()
            instance.mode = OcrSettings.ModeChoices.SKIP
            instance.save()

            params = self.get_params()
        self.assertTrue(params["skip_text"])
        self.assertNotIn("redo_ocr", params)
        self.assertNotIn("force_ocr", params)

    def test_db_settings_ocr_clean(self):
        """
        The unpaper clean choice from the database is used over OCR_CLEAN,
        for both the "clean" and the "clean-final" choices
        """
        with override_settings(OCR_CLEAN="clean-final"):
            instance = OcrSettings.objects.all().first()
            instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean"])
        self.assertNotIn("clean_final", params)

        with override_settings(OCR_CLEAN="clean-final"):
            instance = OcrSettings.objects.all().first()
            instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean_final"])
        self.assertNotIn("clean", params)

    def test_db_settings_ocr_deskew(self):
        """
        The deskew flag from the database is used over OCR_DESKEW
        """
        with override_settings(OCR_DESKEW=False):
            instance = OcrSettings.objects.all().first()
            instance.deskew = True
            instance.save()

            params = self.get_params()
        self.assertTrue(params["deskew"])

    def test_db_settings_ocr_rotate(self):
        """
        Page rotation and its threshold from the database are used over
        OCR_ROTATE_PAGES and OCR_ROTATE_PAGES_THRESHOLD
        """
        with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
            instance = OcrSettings.objects.all().first()
            instance.rotate_pages = True
            instance.rotate_pages_threshold = 15.0
            instance.save()

            params = self.get_params()
        self.assertTrue(params["rotate_pages"])
        self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)

    def test_db_settings_ocr_max_pixels(self):
        """
        The maximum image pixel count from the database is used over
        OCR_MAX_IMAGE_PIXELS
        """
        with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
            instance = OcrSettings.objects.all().first()
            instance.max_image_pixels = 1_000_000.0
            instance.save()

            params = self.get_params()
        self.assertAlmostEqual(params["max_image_mpixels"], 1.0)

    def test_db_settings_ocr_color_convert(self):
        """
        The color conversion strategy from the database is used over
        OCR_COLOR_CONVERSION_STRATEGY
        """
        with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
            instance = OcrSettings.objects.all().first()
            instance.color_conversion_strategy = (
                OcrSettings.ColorConvertChoices.INDEPENDENT
            )
            instance.save()

            params = self.get_params()
        self.assertEqual(
            params["color_conversion_strategy"],
            "UseDeviceIndependentColor",
        )
|
@ -18,6 +18,7 @@ omit =
|
||||
exclude_also =
|
||||
if settings.AUDIT_LOG_ENABLED:
|
||||
if AUDIT_LOG_ENABLED:
|
||||
if TYPE_CHECKING:
|
||||
|
||||
[mypy]
|
||||
plugins = mypy_django_plugin.main, mypy_drf_plugin.main, numpy.typing.mypy_plugin
|
||||
|
Loading…
x
Reference in New Issue
Block a user