Problems with migration testing still need to be figured out

This commit is contained in:
Trenton H 2023-12-11 17:12:56 -08:00
parent 30281bd593
commit 5266bd1590
11 changed files with 175 additions and 38 deletions

View File

@ -334,7 +334,9 @@ class DocumentParser(LoggingMixin):
self.progress_callback(current_progress, max_progress)
def get_settings(self):
    """
    Return this parser's additional settings.

    The base class has nothing to return: a parser must implement this.

    Raises:
        NotImplementedError: always, unless a subclass overrides the method.
    """
    raise NotImplementedError
def read_file_handle_unicode_errors(self, filepath: Path) -> str:

View File

@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase):
self.assertEqual(info.title, "anotherall")
class DummyParser(DocumentParser):
class _BaseTestParser(DocumentParser):
    """
    Common base for the parsers defined in this test module; it supplies
    the get_settings hook so the derived parsers need not repeat it.
    """

    def get_settings(self):
        """
        Return None: no additional settings are implemented by this parser yet
        """
        return None
class DummyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir, archive_path):
super().__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -185,7 +193,7 @@ class DummyParser(DocumentParser):
self.text = "The Text"
class CopyParser(DocumentParser):
class CopyParser(_BaseTestParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
@ -199,7 +207,7 @@ class CopyParser(DocumentParser):
shutil.copy(document_path, self.archive_path)
class FaultyParser(DocumentParser):
class FaultyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -211,7 +219,7 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.")
class FaultyGenericExceptionParser(DocumentParser):
class FaultyGenericExceptionParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)

View File

@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 172)
self.assertEqual(len(manifest), 177)
# dont include consumer or AnonymousUser users
self.assertEqual(
@ -694,8 +694,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.media_dir, "documents"),
)
self.assertEqual(ContentType.objects.count(), 31)
self.assertEqual(Permission.objects.count(), 124)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 128)
manifest = self._do_export()
@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 31)
self.assertEqual(Permission.objects.count(), 125)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 128)

View File

@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser):
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf
def get_settings(self):
    """
    Return None: no additional settings are implemented by this parser yet
    """
    return None

View File

@ -1,4 +1,4 @@
# Generated by Django 4.2.7 on 2023-12-07 22:52
# Generated by Django 4.2.7 on 2023-12-11 19:59
import django.core.validators
from django.db import migrations
@ -28,8 +28,23 @@ class Migration(migrations.Migration):
verbose_name="ID",
),
),
("pages", models.PositiveIntegerField(blank=True, null=True)),
("language", models.CharField(blank=True, max_length=32, null=True)),
(
"pages",
models.PositiveIntegerField(
blank=True,
null=True,
verbose_name="Do OCR from page 1 to this value",
),
),
(
"language",
models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Do OCR using these languages",
),
),
(
"output_type",
models.CharField(
@ -43,6 +58,7 @@ class Migration(migrations.Migration):
],
max_length=8,
null=True,
verbose_name="Sets the output PDF type",
),
),
(
@ -57,6 +73,7 @@ class Migration(migrations.Migration):
],
max_length=8,
null=True,
verbose_name="Sets the OCR mode",
),
),
(
@ -70,9 +87,16 @@ class Migration(migrations.Migration):
],
max_length=16,
null=True,
verbose_name="Controls the generation of an archive file",
),
),
(
"image_dpi",
models.PositiveIntegerField(
null=True,
verbose_name="Sets image DPI fallback value",
),
),
("image_dpi", models.PositiveIntegerField(null=True)),
(
"unpaper_clean",
models.CharField(
@ -84,15 +108,26 @@ class Migration(migrations.Migration):
],
max_length=16,
null=True,
verbose_name="Controls the unpaper cleaning",
),
),
(
"deskew",
models.BooleanField(null=True, verbose_name="Enables deskew"),
),
(
"rotate_pages",
models.BooleanField(
null=True,
verbose_name="Enables page rotation",
),
),
("deskew", models.BooleanField(null=True)),
("rotate_pages", models.BooleanField(null=True)),
(
"rotate_pages_threshold",
models.FloatField(
null=True,
validators=[django.core.validators.MinValueValidator(0.0)],
verbose_name="Sets the threshold for rotation of pages",
),
),
(
@ -102,6 +137,7 @@ class Migration(migrations.Migration):
validators=[
django.core.validators.MinValueValidator(1000000.0),
],
verbose_name="Sets the maximum image for decompression",
),
),
(
@ -117,9 +153,16 @@ class Migration(migrations.Migration):
],
max_length=32,
null=True,
verbose_name="Sets the Ghostscript color conversion strategy",
),
),
(
"user_args",
models.JSONField(
null=True,
verbose_name="Adds additional user arguments for OCRMyPDF",
),
),
("user_args", models.JSONField(blank=True, null=True)),
],
options={
"verbose_name": "ocr settings",

View File

@ -37,11 +37,21 @@ class OcrSettings(models.Model):
GRAY = ("Gray", _("Gray"))
CMYK = ("CMYK", _("CMYK"))
pages = models.PositiveIntegerField(null=True, blank=True)
pages = models.PositiveIntegerField(
verbose_name=_("Do OCR from page 1 to this value"),
null=True,
blank=True,
)
language = models.CharField(null=True, blank=True, max_length=32)
language = models.CharField(
verbose_name=_("Do OCR using these languages"),
null=True,
blank=True,
max_length=32,
)
output_type = models.CharField(
verbose_name=_("Sets the output PDF type"),
null=True,
blank=True,
max_length=8,
@ -49,6 +59,7 @@ class OcrSettings(models.Model):
)
mode = models.CharField(
verbose_name=_("Sets the OCR mode"),
null=True,
blank=True,
max_length=8,
@ -56,43 +67,58 @@ class OcrSettings(models.Model):
)
skip_archive_file = models.CharField(
verbose_name=_("Controls the generation of an archive file"),
null=True,
blank=True,
max_length=16,
choices=ArchiveFileChoices.choices,
)
image_dpi = models.PositiveIntegerField(null=True)
image_dpi = models.PositiveIntegerField(
verbose_name=_("Sets image DPI fallback value"),
null=True,
)
# Can't call it clean, that's a model method
unpaper_clean = models.CharField(
verbose_name=_("Controls the unpaper cleaning"),
null=True,
blank=True,
max_length=16,
choices=CleanChoices.choices,
)
deskew = models.BooleanField(null=True)
deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True)
rotate_pages = models.BooleanField(null=True)
rotate_pages = models.BooleanField(
verbose_name=_("Enables page rotation"),
null=True,
)
rotate_pages_threshold = models.FloatField(
verbose_name=_("Sets the threshold for rotation of pages"),
null=True,
validators=[MinValueValidator(0.0)],
)
max_image_pixels = models.FloatField(
verbose_name=_("Sets the maximum image for decompression"),
null=True,
validators=[MinValueValidator(1_000_000.0)],
)
color_conversion_strategy = models.CharField(
verbose_name=_("Sets the Ghostscript color conversion strategy"),
blank=True,
null=True,
max_length=32,
choices=ColorConvertChoices.choices,
)
user_args = models.JSONField(null=True)
user_args = models.JSONField(
verbose_name=_("Adds additional user arguments for OCRMyPDF"),
null=True,
)
class Meta:
verbose_name = _("ocr settings")
@ -105,7 +131,7 @@ class OcrSettings(models.Model):
# if you'll not check for self.pk
# then error will also be raised in the update of exists model
raise ValidationError(
"There is can be only one JuicerBaseSettings instance",
"There is can be only one OcrSettings instance",
)
return super().save(*args, **kwargs)

View File

@ -0,0 +1,9 @@
from rest_framework import serializers
from paperless_tesseract.models import OcrSettings
class OcrSettingsSerializer(serializers.ModelSerializer):
    """
    Serializer exposing every field of the OcrSettings model.
    """

    class Meta:
        model = OcrSettings
        # The sentinel for "all model fields" is the string "__all__".
        # A list such as ["all"] is read as a single field literally
        # named "all", which OcrSettings does not have.
        fields = "__all__"

View File

@ -26,33 +26,50 @@ class OcrSetting:
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR configuration.

    Every option is read from the first OcrSettingModel row when such a row
    exists and the option is actually set there; otherwise the matching
    ``settings.OCR_*`` value is used. "Not set" means ``None`` or an empty
    string (the text columns are declared ``blank=True``), so explicit
    ``False`` and ``0`` values stored in the database are honoured.

    Returns:
        OcrSetting: the merged configuration.
    """
    db_settings = OcrSettingModel.objects.all().first()

    def _pick(field_name, fallback):
        # getattr with a default also covers the "no row at all" case,
        # where db_settings is None and has none of these attributes.
        value = getattr(db_settings, field_name, None)
        if value is None or value == "":
            return fallback
        return value

    # User arguments: a non-empty database value wins, otherwise the JSON
    # string from the application settings is decoded (if one is given).
    user_args = None
    if db_settings is not None and db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        user_args = json.loads(settings.OCR_USER_ARGS)

    return OcrSetting(
        pages=_pick("pages", settings.OCR_PAGES),
        language=_pick("language", settings.OCR_LANGUAGE),
        output_type=_pick("output_type", settings.OCR_OUTPUT_TYPE),
        mode=_pick("mode", settings.OCR_MODE),
        skip_archive_file=_pick(
            "skip_archive_file",
            settings.OCR_SKIP_ARCHIVE_FILE,
        ),
        image_dpi=_pick("image_dpi", settings.OCR_IMAGE_DPI),
        clean=_pick("unpaper_clean", settings.OCR_CLEAN),
        deskew=_pick("deskew", settings.OCR_DESKEW),
        rotate=_pick("rotate_pages", settings.OCR_ROTATE_PAGES),
        rotate_threshold=_pick(
            "rotate_pages_threshold",
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=_pick(
            "max_image_pixels",
            settings.OCR_MAX_IMAGE_PIXELS,
        ),
        color_conversion_strategy=_pick(
            "color_conversion_strategy",
            settings.OCR_COLOR_CONVERSION_STRATEGY,
        ),
        user_args=user_args,
    )

View File

@ -0,0 +1,14 @@
from rest_framework.permissions import IsAuthenticated
from rest_framework.viewsets import ModelViewSet
from paperless_tesseract.models import OcrSettings
from paperless_tesseract.serialisers import OcrSettingsSerializer
class OcrSettingsViewSet(ModelViewSet):
    """
    Model viewset for OcrSettings, available to authenticated users only.
    """

    model = OcrSettings

    # Supply a real QuerySet rather than the bare manager: the generic
    # views only re-evaluate the attribute per request when it is a
    # QuerySet, and a manager cannot be sliced the way a QuerySet can.
    queryset = OcrSettings.objects.all()

    serializer_class = OcrSettingsSerializer
    permission_classes = (IsAuthenticated,)

View File

@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser):
def parse(self, document_path, mime_type, file_name=None):
self.text = self.read_file_handle_unicode_errors(document_path)
def get_settings(self):
    """
    Return None: no additional settings are implemented by this parser yet
    """
    return None

View File

@ -111,3 +111,9 @@ class TikaDocumentParser(DocumentParser):
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
def get_settings(self):
    """
    Return None: no additional settings are implemented by this parser yet
    """
    return None