diff --git a/src/documents/parsers.py b/src/documents/parsers.py index 89fafdb82..0989b0792 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -334,7 +334,9 @@ class DocumentParser(LoggingMixin): self.progress_callback(current_progress, max_progress) def get_settings(self): - # return None + """ + A parser must implement this + """ raise NotImplementedError def read_file_handle_unicode_errors(self, filepath: Path) -> str: diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index e2cd74016..1db90ee54 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase): self.assertEqual(info.title, "anotherall") -class DummyParser(DocumentParser): +class _BaseTestParser(DocumentParser): + def get_settings(self): + """ + This parser does not implement additional settings yet + """ + return None + + +class DummyParser(_BaseTestParser): def __init__(self, logging_group, scratch_dir, archive_path): super().__init__(logging_group, None) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) @@ -185,7 +193,7 @@ class DummyParser(DocumentParser): self.text = "The Text" -class CopyParser(DocumentParser): +class CopyParser(_BaseTestParser): def get_thumbnail(self, document_path, mime_type, file_name=None): return self.fake_thumb @@ -199,7 +207,7 @@ class CopyParser(DocumentParser): shutil.copy(document_path, self.archive_path) -class FaultyParser(DocumentParser): +class FaultyParser(_BaseTestParser): def __init__(self, logging_group, scratch_dir): super().__init__(logging_group) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) @@ -211,7 +219,7 @@ class FaultyParser(DocumentParser): raise ParseError("Does not compute.") -class FaultyGenericExceptionParser(DocumentParser): +class FaultyGenericExceptionParser(_BaseTestParser): def __init__(self, logging_group, scratch_dir): super().__init__(logging_group) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 54bb6f34c..9f07beeb4 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): manifest = self._do_export(use_filename_format=use_filename_format) - self.assertEqual(len(manifest), 172) + self.assertEqual(len(manifest), 177) # dont include consumer or AnonymousUser users self.assertEqual( @@ -694,8 +694,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): os.path.join(self.dirs.media_dir, "documents"), ) - self.assertEqual(ContentType.objects.count(), 31) - self.assertEqual(Permission.objects.count(), 124) + self.assertEqual(ContentType.objects.count(), 32) + self.assertEqual(Permission.objects.count(), 128) manifest = self._do_export() @@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): with self.assertRaises(IntegrityError): call_command("document_importer", "--no-progress-bar", self.target) - self.assertEqual(ContentType.objects.count(), 31) - self.assertEqual(Permission.objects.count(), 125) + self.assertEqual(ContentType.objects.count(), 32) + self.assertEqual(Permission.objects.count(), 128) diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 9bc7081d6..92fb90bb1 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser): html_pdf = tempdir / "html.pdf" html_pdf.write_bytes(response.content) return html_pdf + + def get_settings(self): + """ + This parser does not implement additional settings yet + """ + return None diff --git a/src/paperless_tesseract/migrations/0001_initial.py b/src/paperless_tesseract/migrations/0001_initial.py index 9cd546ea0..6f639ee5e 100644 --- a/src/paperless_tesseract/migrations/0001_initial.py +++ b/src/paperless_tesseract/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.7 on 2023-12-07 22:52 +# Generated by Django 4.2.7 on 2023-12-11 19:59 import django.core.validators from django.db import migrations @@ -28,8 +28,23 @@ class Migration(migrations.Migration): verbose_name="ID", ), ), - ("pages", models.PositiveIntegerField(blank=True, null=True)), - ("language", models.CharField(blank=True, max_length=32, null=True)), + ( + "pages", + models.PositiveIntegerField( + blank=True, + null=True, + verbose_name="Do OCR from page 1 to this value", + ), + ), + ( + "language", + models.CharField( + blank=True, + max_length=32, + null=True, + verbose_name="Do OCR using these languages", + ), + ), ( "output_type", models.CharField( @@ -43,6 +58,7 @@ class Migration(migrations.Migration): ], max_length=8, null=True, + verbose_name="Sets the output PDF type", ), ), ( @@ -57,6 +73,7 @@ class Migration(migrations.Migration): ], max_length=8, null=True, + verbose_name="Sets the OCR mode", ), ), ( @@ -70,9 +87,16 @@ class Migration(migrations.Migration): ], max_length=16, null=True, + verbose_name="Controls the generation of an archive file", + ), + ), + ( + "image_dpi", + models.PositiveIntegerField( + null=True, + verbose_name="Sets image DPI fallback value", ), ), - ("image_dpi", models.PositiveIntegerField(null=True)), ( "unpaper_clean", models.CharField( @@ -84,15 +108,26 @@ class Migration(migrations.Migration): ], max_length=16, null=True, + verbose_name="Controls the unpaper cleaning", + ), + ), + ( + "deskew", + models.BooleanField(null=True, verbose_name="Enables deskew"), + ), + ( + "rotate_pages", + models.BooleanField( + null=True, + verbose_name="Enables page rotation", ), ), - ("deskew", models.BooleanField(null=True)), - ("rotate_pages", models.BooleanField(null=True)), ( "rotate_pages_threshold", models.FloatField( null=True, validators=[django.core.validators.MinValueValidator(0.0)], + verbose_name="Sets the threshold for rotation of pages", ), ), ( @@ -102,6 +137,7 @@ class Migration(migrations.Migration): validators=[ django.core.validators.MinValueValidator(1000000.0), ], + verbose_name="Sets the maximum image for decompression", ), ), ( @@ -117,9 +153,16 @@ class Migration(migrations.Migration): ], max_length=32, null=True, + verbose_name="Sets the Ghostscript color conversion strategy", + ), + ), + ( + "user_args", + models.JSONField( + null=True, + verbose_name="Adds additional user arguments for OCRMyPDF", ), ), - ("user_args", models.JSONField(blank=True, null=True)), ], options={ "verbose_name": "ocr settings", diff --git a/src/paperless_tesseract/models.py b/src/paperless_tesseract/models.py index f43930635..eb11f2eb5 100644 --- a/src/paperless_tesseract/models.py +++ b/src/paperless_tesseract/models.py @@ -37,11 +37,21 @@ class OcrSettings(models.Model): GRAY = ("Gray", _("Gray")) CMYK = ("CMYK", _("CMYK")) - pages = models.PositiveIntegerField(null=True, blank=True) + pages = models.PositiveIntegerField( + verbose_name=_("Do OCR from page 1 to this value"), + null=True, + blank=True, + ) - language = models.CharField(null=True, blank=True, max_length=32) + language = models.CharField( + verbose_name=_("Do OCR using these languages"), + null=True, + blank=True, + max_length=32, + ) output_type = models.CharField( + verbose_name=_("Sets the output PDF type"), null=True, blank=True, max_length=8, @@ -49,6 +59,7 @@ class OcrSettings(models.Model): ) mode = models.CharField( + verbose_name=_("Sets the OCR mode"), null=True, blank=True, max_length=8, @@ -56,43 +67,58 @@ class OcrSettings(models.Model): ) skip_archive_file = models.CharField( + verbose_name=_("Controls the generation of an archive file"), null=True, blank=True, max_length=16, choices=ArchiveFileChoices.choices, ) - image_dpi = models.PositiveIntegerField(null=True) + image_dpi = models.PositiveIntegerField( + verbose_name=_("Sets image DPI fallback value"), + null=True, + ) + # Can't call it clean, that's a model method unpaper_clean = models.CharField( + verbose_name=_("Controls the unpaper cleaning"), null=True, blank=True, max_length=16, choices=CleanChoices.choices, ) - deskew = models.BooleanField(null=True) + deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True) - rotate_pages = models.BooleanField(null=True) + rotate_pages = models.BooleanField( + verbose_name=_("Enables page rotation"), + null=True, + ) rotate_pages_threshold = models.FloatField( + verbose_name=_("Sets the threshold for rotation of pages"), null=True, validators=[MinValueValidator(0.0)], ) max_image_pixels = models.FloatField( + verbose_name=_("Sets the maximum image for decompression"), null=True, validators=[MinValueValidator(1_000_000.0)], ) color_conversion_strategy = models.CharField( + verbose_name=_("Sets the Ghostscript color conversion strategy"), blank=True, null=True, max_length=32, choices=ColorConvertChoices.choices, ) - user_args = models.JSONField(null=True) + user_args = models.JSONField( + verbose_name=_("Adds additional user arguments for OCRMyPDF"), + null=True, + ) class Meta: verbose_name = _("ocr settings") @@ -105,7 +131,7 @@ class OcrSettings(models.Model): # if you'll not check for self.pk # then error will also be raised in the update of exists model raise ValidationError( - "There is can be only one JuicerBaseSettings instance", + "There is can be only one OcrSettings instance", ) return super().save(*args, **kwargs) diff --git a/src/paperless_tesseract/serialisers.py b/src/paperless_tesseract/serialisers.py new file mode 100644 index 000000000..d25e9eeab --- /dev/null +++ b/src/paperless_tesseract/serialisers.py @@ -0,0 +1,9 @@ +from rest_framework import serializers + +from paperless_tesseract.models import OcrSettings + + +class OcrSettingsSerializer(serializers.ModelSerializer): + class Meta: + model = OcrSettings + fields = ["all"] diff --git a/src/paperless_tesseract/setting_schema.py b/src/paperless_tesseract/setting_schema.py index 0008f8c41..23c88dd7f 100644 --- a/src/paperless_tesseract/setting_schema.py +++ b/src/paperless_tesseract/setting_schema.py @@ -26,33 +26,50 @@ class OcrSetting: def get_ocr_settings() -> OcrSetting: db_settings = OcrSettingModel.objects.all().first() - assert db_settings is not None + # assert db_settings is not None user_args = None - if db_settings.user_args: + if db_settings is not None and db_settings.user_args: user_args = db_settings.user_args elif settings.OCR_USER_ARGS is not None: user_args = json.loads(settings.OCR_USER_ARGS) return OcrSetting( - pages=db_settings.pages or settings.OCR_PAGES, - language=db_settings.language or settings.OCR_LANGUAGE, - output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE, - mode=db_settings.mode or settings.OCR_MODE, + pages=db_settings.pages if db_settings is not None else settings.OCR_PAGES, + language=db_settings.language + if db_settings is not None and db_settings.language is not None + else settings.OCR_LANGUAGE, + output_type=db_settings.output_type + if db_settings is not None + else settings.OCR_OUTPUT_TYPE, + mode=db_settings.mode if db_settings is not None else settings.OCR_MODE, skip_archive_file=( - db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE + db_settings.skip_archive_file + if db_settings is not None + else settings.OCR_SKIP_ARCHIVE_FILE ), - image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI, - clean=db_settings.unpaper_clean or settings.OCR_CLEAN, - deskew=db_settings.deskew or settings.OCR_DESKEW, - rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES, + image_dpi=db_settings.image_dpi + if db_settings is not None + else settings.OCR_IMAGE_DPI, + clean=db_settings.unpaper_clean + if db_settings is not None + else settings.OCR_CLEAN, + deskew=db_settings.deskew if db_settings is not None else settings.OCR_DESKEW, + rotate=db_settings.rotate_pages + if db_settings is not None + else settings.OCR_ROTATE_PAGES, rotate_threshold=( - db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD + db_settings.rotate_pages_threshold + if db_settings is not None + else settings.OCR_ROTATE_PAGES_THRESHOLD ), - max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS, + max_image_pixel=db_settings.max_image_pixels + if db_settings is not None + else settings.OCR_MAX_IMAGE_PIXELS, color_conversion_strategy=( db_settings.color_conversion_strategy - or settings.OCR_COLOR_CONVERSION_STRATEGY + if db_settings is not None + else settings.OCR_COLOR_CONVERSION_STRATEGY ), user_args=user_args, ) diff --git a/src/paperless_tesseract/views.py b/src/paperless_tesseract/views.py new file mode 100644 index 000000000..2883a117a --- /dev/null +++ b/src/paperless_tesseract/views.py @@ -0,0 +1,14 @@ +from rest_framework.permissions import IsAuthenticated +from rest_framework.viewsets import ModelViewSet + +from paperless_tesseract.models import OcrSettings +from paperless_tesseract.serialisers import OcrSettingsSerializer + + +class OcrSettingsViewSet(ModelViewSet): + model = OcrSettings + + queryset = OcrSettings.objects + + serializer_class = OcrSettingsSerializer + permission_classes = (IsAuthenticated,) diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index c017a3c0f..b6481adc9 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser): def parse(self, document_path, mime_type, file_name=None): self.text = self.read_file_handle_unicode_errors(document_path) + + def get_settings(self): + """ + This parser does not implement additional settings yet + """ + return None diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index c9056d90d..ed9996039 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -111,3 +111,9 @@ class TikaDocumentParser(DocumentParser): raise ParseError( f"Error while converting document to PDF: {err}", ) from err + + def get_settings(self): + """ + This parser does not implement additional settings yet + """ + return None