paperless-ngx/src/paperless_tesseract/tests/test_parser_custom_settings.py
2023-12-19 09:19:47 -08:00

121 lines
4.4 KiB
Python

from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@staticmethod
def get_params():
return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
input_file="input.pdf",
output_file="output.pdf",
sidecar_file="sidecar.txt",
mime_type="application/pdf",
safe_fallback=False,
)
def test_db_settings_ocr_pages(self):
with override_settings(OCR_PAGES=10):
instance = OcrSettings.objects.all().first()
instance.pages = 5
instance.save()
params = self.get_params()
self.assertEqual(params["pages"], "1-5")
def test_db_settings_ocr_language(self):
with override_settings(OCR_LANGUAGE="eng+deu"):
instance = OcrSettings.objects.all().first()
instance.language = "fra+ita"
instance.save()
params = self.get_params()
self.assertEqual(params["language"], "fra+ita")
def test_db_settings_ocr_output_type(self):
with override_settings(OCR_LANGUAGE="pdfa-3"):
instance = OcrSettings.objects.all().first()
instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
instance.save()
params = self.get_params()
self.assertEqual(params["output_type"], "pdfa")
def test_db_settings_ocr_mode(self):
with override_settings(OCR_MODE="redo"):
instance = OcrSettings.objects.all().first()
instance.mode = OcrSettings.ModeChoices.SKIP
instance.save()
params = self.get_params()
self.assertTrue(params["skip_text"])
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)
def test_db_settings_ocr_clean(self):
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
instance.save()
params = self.get_params()
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final"):
instance = OcrSettings.objects.all().first()
instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
instance.save()
params = self.get_params()
self.assertTrue(params["clean_final"])
self.assertNotIn("clean", params)
def test_db_settings_ocr_deskew(self):
with override_settings(OCR_DESKEW=False):
instance = OcrSettings.objects.all().first()
instance.deskew = True
instance.save()
params = self.get_params()
self.assertTrue(params["deskew"])
def test_db_settings_ocr_rotate(self):
with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
instance = OcrSettings.objects.all().first()
instance.rotate_pages = True
instance.rotate_pages_threshold = 15.0
instance.save()
params = self.get_params()
self.assertTrue(params["rotate_pages"])
self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)
def test_db_settings_ocr_max_pixels(self):
with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
instance = OcrSettings.objects.all().first()
instance.max_image_pixels = 1_000_000.0
instance.save()
params = self.get_params()
self.assertAlmostEqual(params["max_image_mpixels"], 1.0)
def test_db_settings_ocr_color_convert(self):
with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
instance = OcrSettings.objects.all().first()
instance.color_conversion_strategy = (
OcrSettings.ColorConvertChoices.INDEPENDENT
)
instance.save()
params = self.get_params()
self.assertEqual(
params["color_conversion_strategy"],
"UseDeviceIndependentColor",
)