Problems with migration testing still need to be figured out
This commit is contained in:
parent
30281bd593
commit
5266bd1590
@ -334,7 +334,9 @@ class DocumentParser(LoggingMixin):
|
|||||||
self.progress_callback(current_progress, max_progress)
|
self.progress_callback(current_progress, max_progress)
|
||||||
|
|
||||||
def get_settings(self):
|
def get_settings(self):
|
||||||
# return None
|
"""
|
||||||
|
A parser must implement this
|
||||||
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
|
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
|
||||||
|
@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase):
|
|||||||
self.assertEqual(info.title, "anotherall")
|
self.assertEqual(info.title, "anotherall")
|
||||||
|
|
||||||
|
|
||||||
class DummyParser(DocumentParser):
|
class _BaseTestParser(DocumentParser):
|
||||||
|
def get_settings(self):
|
||||||
|
"""
|
||||||
|
This parser does not implement additional settings yet
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class DummyParser(_BaseTestParser):
|
||||||
def __init__(self, logging_group, scratch_dir, archive_path):
|
def __init__(self, logging_group, scratch_dir, archive_path):
|
||||||
super().__init__(logging_group, None)
|
super().__init__(logging_group, None)
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||||
@ -185,7 +193,7 @@ class DummyParser(DocumentParser):
|
|||||||
self.text = "The Text"
|
self.text = "The Text"
|
||||||
|
|
||||||
|
|
||||||
class CopyParser(DocumentParser):
|
class CopyParser(_BaseTestParser):
|
||||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||||
return self.fake_thumb
|
return self.fake_thumb
|
||||||
|
|
||||||
@ -199,7 +207,7 @@ class CopyParser(DocumentParser):
|
|||||||
shutil.copy(document_path, self.archive_path)
|
shutil.copy(document_path, self.archive_path)
|
||||||
|
|
||||||
|
|
||||||
class FaultyParser(DocumentParser):
|
class FaultyParser(_BaseTestParser):
|
||||||
def __init__(self, logging_group, scratch_dir):
|
def __init__(self, logging_group, scratch_dir):
|
||||||
super().__init__(logging_group)
|
super().__init__(logging_group)
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||||
@ -211,7 +219,7 @@ class FaultyParser(DocumentParser):
|
|||||||
raise ParseError("Does not compute.")
|
raise ParseError("Does not compute.")
|
||||||
|
|
||||||
|
|
||||||
class FaultyGenericExceptionParser(DocumentParser):
|
class FaultyGenericExceptionParser(_BaseTestParser):
|
||||||
def __init__(self, logging_group, scratch_dir):
|
def __init__(self, logging_group, scratch_dir):
|
||||||
super().__init__(logging_group)
|
super().__init__(logging_group)
|
||||||
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
|
||||||
|
@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
manifest = self._do_export(use_filename_format=use_filename_format)
|
manifest = self._do_export(use_filename_format=use_filename_format)
|
||||||
|
|
||||||
self.assertEqual(len(manifest), 172)
|
self.assertEqual(len(manifest), 177)
|
||||||
|
|
||||||
# dont include consumer or AnonymousUser users
|
# dont include consumer or AnonymousUser users
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
@ -694,8 +694,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
os.path.join(self.dirs.media_dir, "documents"),
|
os.path.join(self.dirs.media_dir, "documents"),
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(ContentType.objects.count(), 31)
|
self.assertEqual(ContentType.objects.count(), 32)
|
||||||
self.assertEqual(Permission.objects.count(), 124)
|
self.assertEqual(Permission.objects.count(), 128)
|
||||||
|
|
||||||
manifest = self._do_export()
|
manifest = self._do_export()
|
||||||
|
|
||||||
@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
with self.assertRaises(IntegrityError):
|
with self.assertRaises(IntegrityError):
|
||||||
call_command("document_importer", "--no-progress-bar", self.target)
|
call_command("document_importer", "--no-progress-bar", self.target)
|
||||||
|
|
||||||
self.assertEqual(ContentType.objects.count(), 31)
|
self.assertEqual(ContentType.objects.count(), 32)
|
||||||
self.assertEqual(Permission.objects.count(), 125)
|
self.assertEqual(Permission.objects.count(), 128)
|
||||||
|
@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser):
|
|||||||
html_pdf = tempdir / "html.pdf"
|
html_pdf = tempdir / "html.pdf"
|
||||||
html_pdf.write_bytes(response.content)
|
html_pdf.write_bytes(response.content)
|
||||||
return html_pdf
|
return html_pdf
|
||||||
|
|
||||||
|
def get_settings(self):
|
||||||
|
"""
|
||||||
|
This parser does not implement additional settings yet
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# Generated by Django 4.2.7 on 2023-12-07 22:52
|
# Generated by Django 4.2.7 on 2023-12-11 19:59
|
||||||
|
|
||||||
import django.core.validators
|
import django.core.validators
|
||||||
from django.db import migrations
|
from django.db import migrations
|
||||||
@ -28,8 +28,23 @@ class Migration(migrations.Migration):
|
|||||||
verbose_name="ID",
|
verbose_name="ID",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
("pages", models.PositiveIntegerField(blank=True, null=True)),
|
(
|
||||||
("language", models.CharField(blank=True, max_length=32, null=True)),
|
"pages",
|
||||||
|
models.PositiveIntegerField(
|
||||||
|
blank=True,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Do OCR from page 1 to this value",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"language",
|
||||||
|
models.CharField(
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
null=True,
|
||||||
|
verbose_name="Do OCR using these languages",
|
||||||
|
),
|
||||||
|
),
|
||||||
(
|
(
|
||||||
"output_type",
|
"output_type",
|
||||||
models.CharField(
|
models.CharField(
|
||||||
@ -43,6 +58,7 @@ class Migration(migrations.Migration):
|
|||||||
],
|
],
|
||||||
max_length=8,
|
max_length=8,
|
||||||
null=True,
|
null=True,
|
||||||
|
verbose_name="Sets the output PDF type",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
@ -57,6 +73,7 @@ class Migration(migrations.Migration):
|
|||||||
],
|
],
|
||||||
max_length=8,
|
max_length=8,
|
||||||
null=True,
|
null=True,
|
||||||
|
verbose_name="Sets the OCR mode",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
@ -70,9 +87,16 @@ class Migration(migrations.Migration):
|
|||||||
],
|
],
|
||||||
max_length=16,
|
max_length=16,
|
||||||
null=True,
|
null=True,
|
||||||
|
verbose_name="Controls the generation of an archive file",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"image_dpi",
|
||||||
|
models.PositiveIntegerField(
|
||||||
|
null=True,
|
||||||
|
verbose_name="Sets image DPI fallback value",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
("image_dpi", models.PositiveIntegerField(null=True)),
|
|
||||||
(
|
(
|
||||||
"unpaper_clean",
|
"unpaper_clean",
|
||||||
models.CharField(
|
models.CharField(
|
||||||
@ -84,15 +108,26 @@ class Migration(migrations.Migration):
|
|||||||
],
|
],
|
||||||
max_length=16,
|
max_length=16,
|
||||||
null=True,
|
null=True,
|
||||||
|
verbose_name="Controls the unpaper cleaning",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"deskew",
|
||||||
|
models.BooleanField(null=True, verbose_name="Enables deskew"),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"rotate_pages",
|
||||||
|
models.BooleanField(
|
||||||
|
null=True,
|
||||||
|
verbose_name="Enables page rotation",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
("deskew", models.BooleanField(null=True)),
|
|
||||||
("rotate_pages", models.BooleanField(null=True)),
|
|
||||||
(
|
(
|
||||||
"rotate_pages_threshold",
|
"rotate_pages_threshold",
|
||||||
models.FloatField(
|
models.FloatField(
|
||||||
null=True,
|
null=True,
|
||||||
validators=[django.core.validators.MinValueValidator(0.0)],
|
validators=[django.core.validators.MinValueValidator(0.0)],
|
||||||
|
verbose_name="Sets the threshold for rotation of pages",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
@ -102,6 +137,7 @@ class Migration(migrations.Migration):
|
|||||||
validators=[
|
validators=[
|
||||||
django.core.validators.MinValueValidator(1000000.0),
|
django.core.validators.MinValueValidator(1000000.0),
|
||||||
],
|
],
|
||||||
|
verbose_name="Sets the maximum image for decompression",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
@ -117,9 +153,16 @@ class Migration(migrations.Migration):
|
|||||||
],
|
],
|
||||||
max_length=32,
|
max_length=32,
|
||||||
null=True,
|
null=True,
|
||||||
|
verbose_name="Sets the Ghostscript color conversion strategy",
|
||||||
|
),
|
||||||
|
),
|
||||||
|
(
|
||||||
|
"user_args",
|
||||||
|
models.JSONField(
|
||||||
|
null=True,
|
||||||
|
verbose_name="Adds additional user arguments for OCRMyPDF",
|
||||||
),
|
),
|
||||||
),
|
),
|
||||||
("user_args", models.JSONField(blank=True, null=True)),
|
|
||||||
],
|
],
|
||||||
options={
|
options={
|
||||||
"verbose_name": "ocr settings",
|
"verbose_name": "ocr settings",
|
||||||
|
@ -37,11 +37,21 @@ class OcrSettings(models.Model):
|
|||||||
GRAY = ("Gray", _("Gray"))
|
GRAY = ("Gray", _("Gray"))
|
||||||
CMYK = ("CMYK", _("CMYK"))
|
CMYK = ("CMYK", _("CMYK"))
|
||||||
|
|
||||||
pages = models.PositiveIntegerField(null=True, blank=True)
|
pages = models.PositiveIntegerField(
|
||||||
|
verbose_name=_("Do OCR from page 1 to this value"),
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
)
|
||||||
|
|
||||||
language = models.CharField(null=True, blank=True, max_length=32)
|
language = models.CharField(
|
||||||
|
verbose_name=_("Do OCR using these languages"),
|
||||||
|
null=True,
|
||||||
|
blank=True,
|
||||||
|
max_length=32,
|
||||||
|
)
|
||||||
|
|
||||||
output_type = models.CharField(
|
output_type = models.CharField(
|
||||||
|
verbose_name=_("Sets the output PDF type"),
|
||||||
null=True,
|
null=True,
|
||||||
blank=True,
|
blank=True,
|
||||||
max_length=8,
|
max_length=8,
|
||||||
@ -49,6 +59,7 @@ class OcrSettings(models.Model):
|
|||||||
)
|
)
|
||||||
|
|
||||||
mode = models.CharField(
|
mode = models.CharField(
|
||||||
|
verbose_name=_("Sets the OCR mode"),
|
||||||
null=True,
|
null=True,
|
||||||
blank=True,
|
blank=True,
|
||||||
max_length=8,
|
max_length=8,
|
||||||
@ -56,43 +67,58 @@ class OcrSettings(models.Model):
|
|||||||
)
|
)
|
||||||
|
|
||||||
skip_archive_file = models.CharField(
|
skip_archive_file = models.CharField(
|
||||||
|
verbose_name=_("Controls the generation of an archive file"),
|
||||||
null=True,
|
null=True,
|
||||||
blank=True,
|
blank=True,
|
||||||
max_length=16,
|
max_length=16,
|
||||||
choices=ArchiveFileChoices.choices,
|
choices=ArchiveFileChoices.choices,
|
||||||
)
|
)
|
||||||
|
|
||||||
image_dpi = models.PositiveIntegerField(null=True)
|
image_dpi = models.PositiveIntegerField(
|
||||||
|
verbose_name=_("Sets image DPI fallback value"),
|
||||||
|
null=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Can't call it clean, that's a model method
|
||||||
unpaper_clean = models.CharField(
|
unpaper_clean = models.CharField(
|
||||||
|
verbose_name=_("Controls the unpaper cleaning"),
|
||||||
null=True,
|
null=True,
|
||||||
blank=True,
|
blank=True,
|
||||||
max_length=16,
|
max_length=16,
|
||||||
choices=CleanChoices.choices,
|
choices=CleanChoices.choices,
|
||||||
)
|
)
|
||||||
|
|
||||||
deskew = models.BooleanField(null=True)
|
deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True)
|
||||||
|
|
||||||
rotate_pages = models.BooleanField(null=True)
|
rotate_pages = models.BooleanField(
|
||||||
|
verbose_name=_("Enables page rotation"),
|
||||||
|
null=True,
|
||||||
|
)
|
||||||
|
|
||||||
rotate_pages_threshold = models.FloatField(
|
rotate_pages_threshold = models.FloatField(
|
||||||
|
verbose_name=_("Sets the threshold for rotation of pages"),
|
||||||
null=True,
|
null=True,
|
||||||
validators=[MinValueValidator(0.0)],
|
validators=[MinValueValidator(0.0)],
|
||||||
)
|
)
|
||||||
|
|
||||||
max_image_pixels = models.FloatField(
|
max_image_pixels = models.FloatField(
|
||||||
|
verbose_name=_("Sets the maximum image for decompression"),
|
||||||
null=True,
|
null=True,
|
||||||
validators=[MinValueValidator(1_000_000.0)],
|
validators=[MinValueValidator(1_000_000.0)],
|
||||||
)
|
)
|
||||||
|
|
||||||
color_conversion_strategy = models.CharField(
|
color_conversion_strategy = models.CharField(
|
||||||
|
verbose_name=_("Sets the Ghostscript color conversion strategy"),
|
||||||
blank=True,
|
blank=True,
|
||||||
null=True,
|
null=True,
|
||||||
max_length=32,
|
max_length=32,
|
||||||
choices=ColorConvertChoices.choices,
|
choices=ColorConvertChoices.choices,
|
||||||
)
|
)
|
||||||
|
|
||||||
user_args = models.JSONField(null=True)
|
user_args = models.JSONField(
|
||||||
|
verbose_name=_("Adds additional user arguments for OCRMyPDF"),
|
||||||
|
null=True,
|
||||||
|
)
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
verbose_name = _("ocr settings")
|
verbose_name = _("ocr settings")
|
||||||
@ -105,7 +131,7 @@ class OcrSettings(models.Model):
|
|||||||
# if you'll not check for self.pk
|
# if you'll not check for self.pk
|
||||||
# then error will also be raised in the update of exists model
|
# then error will also be raised in the update of exists model
|
||||||
raise ValidationError(
|
raise ValidationError(
|
||||||
"There is can be only one JuicerBaseSettings instance",
|
"There is can be only one OcrSettings instance",
|
||||||
)
|
)
|
||||||
return super().save(*args, **kwargs)
|
return super().save(*args, **kwargs)
|
||||||
|
|
||||||
|
9
src/paperless_tesseract/serialisers.py
Normal file
9
src/paperless_tesseract/serialisers.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
from rest_framework import serializers
|
||||||
|
|
||||||
|
from paperless_tesseract.models import OcrSettings
|
||||||
|
|
||||||
|
|
||||||
|
class OcrSettingsSerializer(serializers.ModelSerializer):
    """Serialize all model fields of ``OcrSettings``."""

    class Meta:
        model = OcrSettings
        # The special string "__all__" tells ModelSerializer to include every
        # model field.  A list such as ["all"] is interpreted as a field that
        # is literally named "all", which the model does not define.
        fields = "__all__"
|
@ -26,33 +26,50 @@ class OcrSetting:
|
|||||||
|
|
||||||
def get_ocr_settings() -> OcrSetting:
    """
    Build the effective OCR settings.

    Each value is taken from the first ``OcrSettingModel`` row when such a
    row exists and the corresponding column is not ``None``; otherwise the
    matching ``OCR_*`` value from the Django settings is used.

    ``user_args`` is handled separately: a non-empty database value wins,
    otherwise ``settings.OCR_USER_ARGS`` is parsed as JSON when it is set.
    """
    db_settings = OcrSettingModel.objects.all().first()

    def resolve(field_name, fallback):
        # getattr() with a default also covers the "no row at all" case,
        # where db_settings itself is None.  Only None counts as "unset", so
        # explicit False / 0 values stored in the database are respected.
        db_value = getattr(db_settings, field_name, None)
        return fallback if db_value is None else db_value

    user_args = None
    if db_settings is not None and db_settings.user_args:
        user_args = db_settings.user_args
    elif settings.OCR_USER_ARGS is not None:
        user_args = json.loads(settings.OCR_USER_ARGS)

    return OcrSetting(
        pages=resolve("pages", settings.OCR_PAGES),
        language=resolve("language", settings.OCR_LANGUAGE),
        output_type=resolve("output_type", settings.OCR_OUTPUT_TYPE),
        mode=resolve("mode", settings.OCR_MODE),
        skip_archive_file=resolve(
            "skip_archive_file",
            settings.OCR_SKIP_ARCHIVE_FILE,
        ),
        image_dpi=resolve("image_dpi", settings.OCR_IMAGE_DPI),
        clean=resolve("unpaper_clean", settings.OCR_CLEAN),
        deskew=resolve("deskew", settings.OCR_DESKEW),
        rotate=resolve("rotate_pages", settings.OCR_ROTATE_PAGES),
        rotate_threshold=resolve(
            "rotate_pages_threshold",
            settings.OCR_ROTATE_PAGES_THRESHOLD,
        ),
        max_image_pixel=resolve(
            "max_image_pixels",
            settings.OCR_MAX_IMAGE_PIXELS,
        ),
        color_conversion_strategy=resolve(
            "color_conversion_strategy",
            settings.OCR_COLOR_CONVERSION_STRATEGY,
        ),
        user_args=user_args,
    )
|
||||||
|
14
src/paperless_tesseract/views.py
Normal file
14
src/paperless_tesseract/views.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from rest_framework.permissions import IsAuthenticated
|
||||||
|
from rest_framework.viewsets import ModelViewSet
|
||||||
|
|
||||||
|
from paperless_tesseract.models import OcrSettings
|
||||||
|
from paperless_tesseract.serialisers import OcrSettingsSerializer
|
||||||
|
|
||||||
|
|
||||||
|
class OcrSettingsViewSet(ModelViewSet):
    """Model viewset for ``OcrSettings``, restricted to authenticated users."""

    model = OcrSettings

    # Give DRF an actual QuerySet rather than the model Manager: the generic
    # views document ``queryset`` as a QuerySet, and a Manager does not
    # support the slicing that paginators perform on it.
    queryset = OcrSettings.objects.all()

    serializer_class = OcrSettingsSerializer
    permission_classes = (IsAuthenticated,)
|
@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
def parse(self, document_path, mime_type, file_name=None):
|
def parse(self, document_path, mime_type, file_name=None):
|
||||||
self.text = self.read_file_handle_unicode_errors(document_path)
|
self.text = self.read_file_handle_unicode_errors(document_path)
|
||||||
|
|
||||||
|
def get_settings(self):
|
||||||
|
"""
|
||||||
|
This parser does not implement additional settings yet
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
@ -111,3 +111,9 @@ class TikaDocumentParser(DocumentParser):
|
|||||||
raise ParseError(
|
raise ParseError(
|
||||||
f"Error while converting document to PDF: {err}",
|
f"Error while converting document to PDF: {err}",
|
||||||
) from err
|
) from err
|
||||||
|
|
||||||
|
def get_settings(self):
|
||||||
|
"""
|
||||||
|
This parser does not implement additional settings yet
|
||||||
|
"""
|
||||||
|
return None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user