Problems with migration testing still need to be figured out

This commit is contained in:
Trenton H 2023-12-11 17:12:56 -08:00
parent 30281bd593
commit 5266bd1590
11 changed files with 175 additions and 38 deletions

View File

@ -334,7 +334,9 @@ class DocumentParser(LoggingMixin):
self.progress_callback(current_progress, max_progress) self.progress_callback(current_progress, max_progress)
def get_settings(self): def get_settings(self):
# return None """
A parser must implement this
"""
raise NotImplementedError raise NotImplementedError
def read_file_handle_unicode_errors(self, filepath: Path) -> str: def read_file_handle_unicode_errors(self, filepath: Path) -> str:

View File

@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase):
self.assertEqual(info.title, "anotherall") self.assertEqual(info.title, "anotherall")
class DummyParser(DocumentParser): class _BaseTestParser(DocumentParser):
def get_settings(self):
    """
    Return the settings for this test parser, which is always None.

    The test parsers do not implement additional settings yet; this
    override exists so they do not hit the base implementation.
    """
    return None
class DummyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir, archive_path): def __init__(self, logging_group, scratch_dir, archive_path):
super().__init__(logging_group, None) super().__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -185,7 +193,7 @@ class DummyParser(DocumentParser):
self.text = "The Text" self.text = "The Text"
class CopyParser(DocumentParser): class CopyParser(_BaseTestParser):
def get_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb return self.fake_thumb
@ -199,7 +207,7 @@ class CopyParser(DocumentParser):
shutil.copy(document_path, self.archive_path) shutil.copy(document_path, self.archive_path)
class FaultyParser(DocumentParser): class FaultyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir): def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group) super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -211,7 +219,7 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.") raise ParseError("Does not compute.")
class FaultyGenericExceptionParser(DocumentParser): class FaultyGenericExceptionParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir): def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group) super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir) _, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)

View File

@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format) manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 172) self.assertEqual(len(manifest), 177)
# dont include consumer or AnonymousUser users # dont include consumer or AnonymousUser users
self.assertEqual( self.assertEqual(
@ -694,8 +694,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.media_dir, "documents"), os.path.join(self.dirs.media_dir, "documents"),
) )
self.assertEqual(ContentType.objects.count(), 31) self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 124) self.assertEqual(Permission.objects.count(), 128)
manifest = self._do_export() manifest = self._do_export()
@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertRaises(IntegrityError): with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target) call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 31) self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 125) self.assertEqual(Permission.objects.count(), 128)

View File

@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser):
html_pdf = tempdir / "html.pdf" html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content) html_pdf.write_bytes(response.content)
return html_pdf return html_pdf
def get_settings(self):
    """
    Return the settings for this parser, which is always None.

    This parser does not implement additional settings yet.
    """
    return None

View File

@ -1,4 +1,4 @@
# Generated by Django 4.2.7 on 2023-12-07 22:52 # Generated by Django 4.2.7 on 2023-12-11 19:59
import django.core.validators import django.core.validators
from django.db import migrations from django.db import migrations
@ -28,8 +28,23 @@ class Migration(migrations.Migration):
verbose_name="ID", verbose_name="ID",
), ),
), ),
("pages", models.PositiveIntegerField(blank=True, null=True)), (
("language", models.CharField(blank=True, max_length=32, null=True)), "pages",
models.PositiveIntegerField(
blank=True,
null=True,
verbose_name="Do OCR from page 1 to this value",
),
),
(
"language",
models.CharField(
blank=True,
max_length=32,
null=True,
verbose_name="Do OCR using these languages",
),
),
( (
"output_type", "output_type",
models.CharField( models.CharField(
@ -43,6 +58,7 @@ class Migration(migrations.Migration):
], ],
max_length=8, max_length=8,
null=True, null=True,
verbose_name="Sets the output PDF type",
), ),
), ),
( (
@ -57,6 +73,7 @@ class Migration(migrations.Migration):
], ],
max_length=8, max_length=8,
null=True, null=True,
verbose_name="Sets the OCR mode",
), ),
), ),
( (
@ -70,9 +87,16 @@ class Migration(migrations.Migration):
], ],
max_length=16, max_length=16,
null=True, null=True,
verbose_name="Controls the generation of an archive file",
),
),
(
"image_dpi",
models.PositiveIntegerField(
null=True,
verbose_name="Sets image DPI fallback value",
), ),
), ),
("image_dpi", models.PositiveIntegerField(null=True)),
( (
"unpaper_clean", "unpaper_clean",
models.CharField( models.CharField(
@ -84,15 +108,26 @@ class Migration(migrations.Migration):
], ],
max_length=16, max_length=16,
null=True, null=True,
verbose_name="Controls the unpaper cleaning",
),
),
(
"deskew",
models.BooleanField(null=True, verbose_name="Enables deskew"),
),
(
"rotate_pages",
models.BooleanField(
null=True,
verbose_name="Enables page rotation",
), ),
), ),
("deskew", models.BooleanField(null=True)),
("rotate_pages", models.BooleanField(null=True)),
( (
"rotate_pages_threshold", "rotate_pages_threshold",
models.FloatField( models.FloatField(
null=True, null=True,
validators=[django.core.validators.MinValueValidator(0.0)], validators=[django.core.validators.MinValueValidator(0.0)],
verbose_name="Sets the threshold for rotation of pages",
), ),
), ),
( (
@ -102,6 +137,7 @@ class Migration(migrations.Migration):
validators=[ validators=[
django.core.validators.MinValueValidator(1000000.0), django.core.validators.MinValueValidator(1000000.0),
], ],
verbose_name="Sets the maximum image for decompression",
), ),
), ),
( (
@ -117,9 +153,16 @@ class Migration(migrations.Migration):
], ],
max_length=32, max_length=32,
null=True, null=True,
verbose_name="Sets the Ghostscript color conversion strategy",
),
),
(
"user_args",
models.JSONField(
null=True,
verbose_name="Adds additional user arguments for OCRMyPDF",
), ),
), ),
("user_args", models.JSONField(blank=True, null=True)),
], ],
options={ options={
"verbose_name": "ocr settings", "verbose_name": "ocr settings",

View File

@ -37,11 +37,21 @@ class OcrSettings(models.Model):
GRAY = ("Gray", _("Gray")) GRAY = ("Gray", _("Gray"))
CMYK = ("CMYK", _("CMYK")) CMYK = ("CMYK", _("CMYK"))
pages = models.PositiveIntegerField(null=True, blank=True) pages = models.PositiveIntegerField(
verbose_name=_("Do OCR from page 1 to this value"),
null=True,
blank=True,
)
language = models.CharField(null=True, blank=True, max_length=32) language = models.CharField(
verbose_name=_("Do OCR using these languages"),
null=True,
blank=True,
max_length=32,
)
output_type = models.CharField( output_type = models.CharField(
verbose_name=_("Sets the output PDF type"),
null=True, null=True,
blank=True, blank=True,
max_length=8, max_length=8,
@ -49,6 +59,7 @@ class OcrSettings(models.Model):
) )
mode = models.CharField( mode = models.CharField(
verbose_name=_("Sets the OCR mode"),
null=True, null=True,
blank=True, blank=True,
max_length=8, max_length=8,
@ -56,43 +67,58 @@ class OcrSettings(models.Model):
) )
skip_archive_file = models.CharField( skip_archive_file = models.CharField(
verbose_name=_("Controls the generation of an archive file"),
null=True, null=True,
blank=True, blank=True,
max_length=16, max_length=16,
choices=ArchiveFileChoices.choices, choices=ArchiveFileChoices.choices,
) )
image_dpi = models.PositiveIntegerField(null=True) image_dpi = models.PositiveIntegerField(
verbose_name=_("Sets image DPI fallback value"),
null=True,
)
# Can't call it clean, that's a model method
unpaper_clean = models.CharField( unpaper_clean = models.CharField(
verbose_name=_("Controls the unpaper cleaning"),
null=True, null=True,
blank=True, blank=True,
max_length=16, max_length=16,
choices=CleanChoices.choices, choices=CleanChoices.choices,
) )
deskew = models.BooleanField(null=True) deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True)
rotate_pages = models.BooleanField(null=True) rotate_pages = models.BooleanField(
verbose_name=_("Enables page rotation"),
null=True,
)
rotate_pages_threshold = models.FloatField( rotate_pages_threshold = models.FloatField(
verbose_name=_("Sets the threshold for rotation of pages"),
null=True, null=True,
validators=[MinValueValidator(0.0)], validators=[MinValueValidator(0.0)],
) )
max_image_pixels = models.FloatField( max_image_pixels = models.FloatField(
verbose_name=_("Sets the maximum image for decompression"),
null=True, null=True,
validators=[MinValueValidator(1_000_000.0)], validators=[MinValueValidator(1_000_000.0)],
) )
color_conversion_strategy = models.CharField( color_conversion_strategy = models.CharField(
verbose_name=_("Sets the Ghostscript color conversion strategy"),
blank=True, blank=True,
null=True, null=True,
max_length=32, max_length=32,
choices=ColorConvertChoices.choices, choices=ColorConvertChoices.choices,
) )
user_args = models.JSONField(null=True) user_args = models.JSONField(
verbose_name=_("Adds additional user arguments for OCRMyPDF"),
null=True,
)
class Meta: class Meta:
verbose_name = _("ocr settings") verbose_name = _("ocr settings")
@ -105,7 +131,7 @@ class OcrSettings(models.Model):
# if you'll not check for self.pk # if you'll not check for self.pk
# then error will also be raised in the update of exists model # then error will also be raised in the update of exists model
raise ValidationError( raise ValidationError(
"There is can be only one JuicerBaseSettings instance", "There is can be only one OcrSettings instance",
) )
return super().save(*args, **kwargs) return super().save(*args, **kwargs)

View File

@ -0,0 +1,9 @@
from rest_framework import serializers
from paperless_tesseract.models import OcrSettings
class OcrSettingsSerializer(serializers.ModelSerializer):
    """
    Model serializer exposing every field of the OcrSettings model.
    """

    class Meta:
        model = OcrSettings
        # "__all__" is the ModelSerializer sentinel for "include every model
        # field". A list such as ["all"] is instead read as one field that
        # is literally named "all".
        fields = "__all__"

View File

@ -26,33 +26,50 @@ class OcrSetting:
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings from the database and the Django settings.

    The first OcrSettingModel row, if one exists, takes precedence field by
    field. A field falls back to the matching ``settings.OCR_*`` value when
    there is no row at all, or when the row leaves that field unset (``None``
    or an empty string). Explicit falsy values stored in the row, such as
    ``False`` or ``0``, are kept.

    ``user_args`` comes from the row when it holds a non-empty value;
    otherwise ``settings.OCR_USER_ARGS`` is JSON-decoded when it is set.
    """
    db_settings = OcrSettingModel.objects.all().first()

    def _pick(field_name: str, fallback):
        """Return the row's value for field_name, or fallback when it is unset."""
        if db_settings is None:
            return fallback
        value = getattr(db_settings, field_name)
        # Only None / "" count as "not configured"; False and 0 are real
        # choices and must not be replaced by the fallback.
        if value is None or value == "":
            return fallback
        return value

    user_args = None
    if db_settings is not None and db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        user_args = json.loads(settings.OCR_USER_ARGS)

    return OcrSetting(
        pages=_pick("pages", settings.OCR_PAGES),
        language=_pick("language", settings.OCR_LANGUAGE),
        output_type=_pick("output_type", settings.OCR_OUTPUT_TYPE),
        mode=_pick("mode", settings.OCR_MODE),
        skip_archive_file=_pick(
            "skip_archive_file",
            settings.OCR_SKIP_ARCHIVE_FILE,
        ),
        image_dpi=_pick("image_dpi", settings.OCR_IMAGE_DPI),
        clean=_pick("unpaper_clean", settings.OCR_CLEAN),
        deskew=_pick("deskew", settings.OCR_DESKEW),
        rotate=_pick("rotate_pages", settings.OCR_ROTATE_PAGES),
        rotate_threshold=_pick(
            "rotate_pages_threshold",
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=_pick("max_image_pixels", settings.OCR_MAX_IMAGE_PIXELS),
        color_conversion_strategy=_pick(
            "color_conversion_strategy",
            settings.OCR_COLOR_CONVERSION_STRATEGY,
        ),
        user_args=user_args,
    )

View File

@ -0,0 +1,14 @@
from rest_framework.permissions import IsAuthenticated
from rest_framework.viewsets import ModelViewSet
from paperless_tesseract.models import OcrSettings
from paperless_tesseract.serialisers import OcrSettingsSerializer
class OcrSettingsViewSet(ModelViewSet):
    """
    Model viewset for OcrSettings, available to authenticated users only.
    """

    model = OcrSettings

    # Give DRF a real QuerySet rather than the bare Manager: the generic
    # views only re-evaluate QuerySet instances per request, and components
    # that slice the result expect QuerySet behaviour.
    queryset = OcrSettings.objects.all()

    serializer_class = OcrSettingsSerializer
    permission_classes = (IsAuthenticated,)

View File

@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser):
def parse(self, document_path, mime_type, file_name=None): def parse(self, document_path, mime_type, file_name=None):
self.text = self.read_file_handle_unicode_errors(document_path) self.text = self.read_file_handle_unicode_errors(document_path)
def get_settings(self):
    """
    Return the settings for this parser, which is always None.

    This parser does not implement additional settings yet.
    """
    return None

View File

@ -111,3 +111,9 @@ class TikaDocumentParser(DocumentParser):
raise ParseError( raise ParseError(
f"Error while converting document to PDF: {err}", f"Error while converting document to PDF: {err}",
) from err ) from err
def get_settings(self):
    """
    Return the settings for this parser, which is always None.

    This parser does not implement additional settings yet.
    """
    return None