Problems with migration testing still need to be figured out

2023-12-11 17:12:56 -08:00
parent 30281bd593
commit 5266bd1590
11 changed files with 175 additions and 38 deletions
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -334,7 +334,9 @@ class DocumentParser(LoggingMixin):
            self.progress_callback(current_progress, max_progress)
    def get_settings(self):
        """
        Return the parser-specific settings.

        The base class has no implementation; a parser must implement this.

        Raises:
            NotImplementedError: always, unless a subclass overrides the method.
        """
        # Name the concrete class so the offending parser is obvious in logs.
        raise NotImplementedError(
            f"{type(self).__name__} must implement get_settings()",
        )
    def read_file_handle_unicode_errors(self, filepath: Path) -> str:
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase):
            self.assertEqual(info.title, "anotherall")
-class DummyParser(DocumentParser):
+class _BaseTestParser(DocumentParser):
    def get_settings(self):
        """No additional settings are implemented for this parser yet; returns None."""
        return None
 class DummyParser(_BaseTestParser):
    def __init__(self, logging_group, scratch_dir, archive_path):
        super().__init__(logging_group, None)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@@ -185,7 +193,7 @@ class DummyParser(DocumentParser):
        self.text = "The Text"
-class CopyParser(DocumentParser):
+class CopyParser(_BaseTestParser):
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        return self.fake_thumb
@@ -199,7 +207,7 @@ class CopyParser(DocumentParser):
        shutil.copy(document_path, self.archive_path)
-class FaultyParser(DocumentParser):
+class FaultyParser(_BaseTestParser):
    def __init__(self, logging_group, scratch_dir):
        super().__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@@ -211,7 +219,7 @@ class FaultyParser(DocumentParser):
        raise ParseError("Does not compute.")
-class FaultyGenericExceptionParser(DocumentParser):
+class FaultyGenericExceptionParser(_BaseTestParser):
    def __init__(self, logging_group, scratch_dir):
        super().__init__(logging_group)
        _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        manifest = self._do_export(use_filename_format=use_filename_format)
-        self.assertEqual(len(manifest), 172)
+        self.assertEqual(len(manifest), 177)
        # don't include consumer or AnonymousUser users
        self.assertEqual(
@@ -694,8 +694,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
            os.path.join(self.dirs.media_dir, "documents"),
        )
-        self.assertEqual(ContentType.objects.count(), 31)
+        self.assertEqual(ContentType.objects.count(), 32)
-        self.assertEqual(Permission.objects.count(), 124)
+        self.assertEqual(Permission.objects.count(), 128)
        manifest = self._do_export()
@@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
            with self.assertRaises(IntegrityError):
                call_command("document_importer", "--no-progress-bar", self.target)
-            self.assertEqual(ContentType.objects.count(), 31)
+            self.assertEqual(ContentType.objects.count(), 32)
-            self.assertEqual(Permission.objects.count(), 125)
+            self.assertEqual(Permission.objects.count(), 128)
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser):
        html_pdf = tempdir / "html.pdf"
        html_pdf.write_bytes(response.content)
        return html_pdf
    def get_settings(self):
        """No additional settings are implemented for this parser yet; returns None."""
        return None
--- a/src/paperless_tesseract/migrations/0001_initial.py
+++ b/src/paperless_tesseract/migrations/0001_initial.py
@@ -1,4 +1,4 @@
-# Generated by Django 4.2.7 on 2023-12-07 22:52
+# Generated by Django 4.2.7 on 2023-12-11 19:59
 import django.core.validators
 from django.db import migrations
@@ -28,8 +28,23 @@ class Migration(migrations.Migration):
                        verbose_name="ID",
                    ),
                ),
-                ("pages", models.PositiveIntegerField(blank=True, null=True)),
+                (
-                ("language", models.CharField(blank=True, max_length=32, null=True)),
+                    "pages",
                    models.PositiveIntegerField(
                        blank=True,
                        null=True,
                        verbose_name="Do OCR from page 1 to this value",
                    ),
                ),
                (
                    "language",
                    models.CharField(
                        blank=True,
                        max_length=32,
                        null=True,
                        verbose_name="Do OCR using these languages",
                    ),
                ),
                (
                    "output_type",
                    models.CharField(
@@ -43,6 +58,7 @@ class Migration(migrations.Migration):
                        ],
                        max_length=8,
                        null=True,
                        verbose_name="Sets the output PDF type",
                    ),
                ),
                (
@@ -57,6 +73,7 @@ class Migration(migrations.Migration):
                        ],
                        max_length=8,
                        null=True,
                        verbose_name="Sets the OCR mode",
                    ),
                ),
                (
@@ -70,9 +87,16 @@ class Migration(migrations.Migration):
                        ],
                        max_length=16,
                        null=True,
                        verbose_name="Controls the generation of an archive file",
                    ),
                ),
                (
                    "image_dpi",
                    models.PositiveIntegerField(
                        null=True,
                        verbose_name="Sets image DPI fallback value",
                    ),
                ),
                ("image_dpi", models.PositiveIntegerField(null=True)),
                (
                    "unpaper_clean",
                    models.CharField(
@@ -84,15 +108,26 @@ class Migration(migrations.Migration):
                        ],
                        max_length=16,
                        null=True,
                        verbose_name="Controls the unpaper cleaning",
                    ),
                ),
                (
                    "deskew",
                    models.BooleanField(null=True, verbose_name="Enables deskew"),
                ),
                (
                    "rotate_pages",
                    models.BooleanField(
                        null=True,
                        verbose_name="Enables page rotation",
                    ),
                ),
                ("deskew", models.BooleanField(null=True)),
                ("rotate_pages", models.BooleanField(null=True)),
                (
                    "rotate_pages_threshold",
                    models.FloatField(
                        null=True,
                        validators=[django.core.validators.MinValueValidator(0.0)],
                        verbose_name="Sets the threshold for rotation of pages",
                    ),
                ),
                (
@@ -102,6 +137,7 @@ class Migration(migrations.Migration):
                        validators=[
                            django.core.validators.MinValueValidator(1000000.0),
                        ],
                        verbose_name="Sets the maximum image for decompression",
                    ),
                ),
                (
@@ -117,9 +153,16 @@ class Migration(migrations.Migration):
                        ],
                        max_length=32,
                        null=True,
                        verbose_name="Sets the Ghostscript color conversion strategy",
                    ),
                ),
                (
                    "user_args",
                    models.JSONField(
                        null=True,
                        verbose_name="Adds additional user arguments for OCRMyPDF",
                    ),
                ),
                ("user_args", models.JSONField(blank=True, null=True)),
            ],
            options={
                "verbose_name": "ocr settings",
--- a/src/paperless_tesseract/models.py
+++ b/src/paperless_tesseract/models.py
@@ -37,11 +37,21 @@ class OcrSettings(models.Model):
        GRAY = ("Gray", _("Gray"))
        CMYK = ("CMYK", _("CMYK"))
-    pages = models.PositiveIntegerField(null=True, blank=True)
+    pages = models.PositiveIntegerField(
        verbose_name=_("Do OCR from page 1 to this value"),
        null=True,
        blank=True,
    )
-    language = models.CharField(null=True, blank=True, max_length=32)
+    language = models.CharField(
        verbose_name=_("Do OCR using these languages"),
        null=True,
        blank=True,
        max_length=32,
    )
    output_type = models.CharField(
        verbose_name=_("Sets the output PDF type"),
        null=True,
        blank=True,
        max_length=8,
@@ -49,6 +59,7 @@ class OcrSettings(models.Model):
    )
    mode = models.CharField(
        verbose_name=_("Sets the OCR mode"),
        null=True,
        blank=True,
        max_length=8,
@@ -56,43 +67,58 @@ class OcrSettings(models.Model):
    )
    skip_archive_file = models.CharField(
        verbose_name=_("Controls the generation of an archive file"),
        null=True,
        blank=True,
        max_length=16,
        choices=ArchiveFileChoices.choices,
    )
-    image_dpi = models.PositiveIntegerField(null=True)
+    image_dpi = models.PositiveIntegerField(
        verbose_name=_("Sets image DPI fallback value"),
        null=True,
    )
    # Can't call it clean, that's a model method
    unpaper_clean = models.CharField(
        verbose_name=_("Controls the unpaper cleaning"),
        null=True,
        blank=True,
        max_length=16,
        choices=CleanChoices.choices,
    )
-    deskew = models.BooleanField(null=True)
+    deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True)
-    rotate_pages = models.BooleanField(null=True)
+    rotate_pages = models.BooleanField(
        verbose_name=_("Enables page rotation"),
        null=True,
    )
    rotate_pages_threshold = models.FloatField(
        verbose_name=_("Sets the threshold for rotation of pages"),
        null=True,
        validators=[MinValueValidator(0.0)],
    )
    max_image_pixels = models.FloatField(
        verbose_name=_("Sets the maximum image for decompression"),
        null=True,
        validators=[MinValueValidator(1_000_000.0)],
    )
    color_conversion_strategy = models.CharField(
        verbose_name=_("Sets the Ghostscript color conversion strategy"),
        blank=True,
        null=True,
        max_length=32,
        choices=ColorConvertChoices.choices,
    )
-    user_args = models.JSONField(null=True)
+    user_args = models.JSONField(
        verbose_name=_("Adds additional user arguments for OCRMyPDF"),
        null=True,
    )
    class Meta:
        verbose_name = _("ocr settings")
@@ -105,7 +131,7 @@ class OcrSettings(models.Model):
            # Without the self.pk check, this error would also be raised
            # when updating the existing instance
            raise ValidationError(
-                "There is can be only one JuicerBaseSettings instance",
+                "There is can be only one OcrSettings instance",
            )
        return super().save(*args, **kwargs)
--- a/src/paperless_tesseract/serialisers.py
+++ b/src/paperless_tesseract/serialisers.py
@@ -0,0 +1,9 @@
 from rest_framework import serializers
 from paperless_tesseract.models import OcrSettings
 class OcrSettingsSerializer(serializers.ModelSerializer):
    """
    Serializer exposing every field of the OcrSettings model.
    """

    class Meta:
        model = OcrSettings
        # "__all__" is the DRF sentinel for "every model field"; a list
        # such as ["all"] would be looked up as a field literally named "all".
        fields = "__all__"
--- a/src/paperless_tesseract/setting_schema.py
+++ b/src/paperless_tesseract/setting_schema.py
@@ -26,33 +26,50 @@ class OcrSetting:
 def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings.

    Each value is taken from the first OcrSettingModel row when that row
    exists and the column is not NULL; otherwise the matching value from
    the Django settings is used.

    User arguments come from the database row when it holds a truthy
    value, else from the JSON string in settings.OCR_USER_ARGS when that
    is set, else they are None.

    Returns:
        OcrSetting: the merged configuration.
    """
    db_settings = OcrSettingModel.objects.all().first()

    def _db_or_default(field_name, default):
        """Return the database value for field_name, or default when unset."""
        if db_settings is None:
            return default
        value = getattr(db_settings, field_name)
        # Compare against None (not truthiness) so that explicit values
        # such as False or 0 stored in the database are still honoured.
        return default if value is None else value

    user_args = None
    if db_settings is not None and db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        user_args = json.loads(settings.OCR_USER_ARGS)

    return OcrSetting(
        pages=_db_or_default("pages", settings.OCR_PAGES),
        language=_db_or_default("language", settings.OCR_LANGUAGE),
        output_type=_db_or_default("output_type", settings.OCR_OUTPUT_TYPE),
        mode=_db_or_default("mode", settings.OCR_MODE),
        skip_archive_file=_db_or_default(
            "skip_archive_file",
            settings.OCR_SKIP_ARCHIVE_FILE,
        ),
        image_dpi=_db_or_default("image_dpi", settings.OCR_IMAGE_DPI),
        clean=_db_or_default("unpaper_clean", settings.OCR_CLEAN),
        deskew=_db_or_default("deskew", settings.OCR_DESKEW),
        rotate=_db_or_default("rotate_pages", settings.OCR_ROTATE_PAGES),
        rotate_threshold=_db_or_default(
            "rotate_pages_threshold",
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=_db_or_default(
            "max_image_pixels",
            settings.OCR_MAX_IMAGE_PIXELS,
        ),
        color_conversion_strategy=_db_or_default(
            "color_conversion_strategy",
            settings.OCR_COLOR_CONVERSION_STRATEGY,
        ),
        user_args=user_args,
    )
--- a/src/paperless_tesseract/views.py
+++ b/src/paperless_tesseract/views.py
@@ -0,0 +1,14 @@
 from rest_framework.permissions import IsAuthenticated
 from rest_framework.viewsets import ModelViewSet
 from paperless_tesseract.models import OcrSettings
 from paperless_tesseract.serialisers import OcrSettingsSerializer
 class OcrSettingsViewSet(ModelViewSet):
    """
    API endpoint for the OcrSettings model, restricted to authenticated users.
    """

    model = OcrSettings
    # Hand DRF a QuerySet rather than the bare manager: get_queryset() only
    # re-evaluates QuerySet instances, and a manager cannot be sliced by a
    # paginator.
    queryset = OcrSettings.objects.all()
    serializer_class = OcrSettingsSerializer
    permission_classes = (IsAuthenticated,)
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser):
    def parse(self, document_path, mime_type, file_name=None):
        self.text = self.read_file_handle_unicode_errors(document_path)
    def get_settings(self):
        """No additional settings are implemented for this parser yet; returns None."""
        return None
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -111,3 +111,9 @@ class TikaDocumentParser(DocumentParser):
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err
    def get_settings(self):
        """No additional settings are implemented for this parser yet; returns None."""
        return None