More fixes and work

This commit is contained in:
Trenton H
2023-12-14 14:41:27 -08:00
parent a7753b1f89
commit 25cc7ada6b
11 changed files with 193 additions and 142 deletions

View File

@@ -3,6 +3,7 @@ import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Optional
from django.conf import settings
@@ -11,7 +12,7 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless_tesseract.models import OcrSettings as OcrSettingModel
from paperless.models import OcrSettings as OcrSettingModel
from paperless_tesseract.setting_schema import OcrSetting
from paperless_tesseract.setting_schema import get_ocr_settings
@@ -71,7 +72,7 @@ class RasterisedDocumentParser(DocumentParser):
self.logging_group,
)
def is_image(self, mime_type):
def is_image(self, mime_type) -> bool:
return mime_type in [
"image/png",
"image/jpeg",
@@ -81,7 +82,7 @@ class RasterisedDocumentParser(DocumentParser):
"image/webp",
]
def has_alpha(self, image):
def has_alpha(self, image) -> bool:
with Image.open(image) as im:
return im.mode in ("RGBA", "LA")
@@ -96,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
],
)
def get_dpi(self, image):
def get_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
x, y = im.info["dpi"]
@@ -105,7 +106,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
def calculate_a4_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
width, height = im.size
@@ -118,7 +119,11 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
def extract_text(
self,
sidecar_file: Optional[Path],
pdf_file: Path,
) -> Optional[str]:
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
@@ -179,7 +184,8 @@ class RasterisedDocumentParser(DocumentParser):
sidecar_file,
safe_fallback=False,
):
assert isinstance(self.parser_settings, OcrSetting)
if TYPE_CHECKING:
assert isinstance(self.parser_settings, OcrSetting)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,

View File

@@ -4,7 +4,8 @@ from typing import Optional
from django.conf import settings
from paperless_tesseract.models import OcrSettings as OcrSettingModel
from paperless.models import CommonSettings
from paperless.models import OcrSettings as OcrSettingModel
@dataclasses.dataclass(frozen=True)
@@ -25,16 +26,20 @@ class OcrSetting:
def get_ocr_settings() -> OcrSetting:
db_settings = OcrSettingModel.objects.all().first()
ocr_db_settings = OcrSettingModel.objects.all().first()
# Workaround for a test where the migration hasn't run to create the single model
if db_settings is None:
if ocr_db_settings is None:
OcrSettingModel.objects.create()
db_settings = OcrSettingModel.objects.all().first()
assert db_settings is not None
ocr_db_settings = OcrSettingModel.objects.all().first()
cmn_db_settings = CommonSettings.objects.all().first()
if cmn_db_settings is None:
CommonSettings.objects.create()
cmn_db_settings = CommonSettings.objects.all().first()
user_args = None
if db_settings.user_args:
user_args = db_settings.user_args
if ocr_db_settings.user_args:
user_args = ocr_db_settings.user_args
elif settings.OCR_USER_ARGS is not None:
try:
user_args = json.loads(settings.OCR_USER_ARGS)
@@ -42,23 +47,25 @@ def get_ocr_settings() -> OcrSetting:
user_args = {}
return OcrSetting(
pages=db_settings.pages or settings.OCR_PAGES,
language=db_settings.language or settings.OCR_LANGUAGE,
output_type=db_settings.output_type or settings.OCR_OUTPUT_TYPE,
mode=db_settings.mode or settings.OCR_MODE,
pages=ocr_db_settings.pages or settings.OCR_PAGES,
language=ocr_db_settings.language or settings.OCR_LANGUAGE,
output_type=cmn_db_settings.output_type or settings.OCR_OUTPUT_TYPE,
mode=ocr_db_settings.mode or settings.OCR_MODE,
skip_archive_file=(
db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
ocr_db_settings.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
),
image_dpi=db_settings.image_dpi or settings.OCR_IMAGE_DPI,
clean=db_settings.unpaper_clean or settings.OCR_CLEAN,
deskew=db_settings.deskew or settings.OCR_DESKEW,
rotate=db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
image_dpi=ocr_db_settings.image_dpi or settings.OCR_IMAGE_DPI,
clean=ocr_db_settings.unpaper_clean or settings.OCR_CLEAN,
deskew=ocr_db_settings.deskew or settings.OCR_DESKEW,
rotate=ocr_db_settings.rotate_pages or settings.OCR_ROTATE_PAGES,
rotate_threshold=(
db_settings.rotate_pages_threshold or settings.OCR_ROTATE_PAGES_THRESHOLD
ocr_db_settings.rotate_pages_threshold
or settings.OCR_ROTATE_PAGES_THRESHOLD
),
max_image_pixel=db_settings.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS,
max_image_pixel=ocr_db_settings.max_image_pixels
or settings.OCR_MAX_IMAGE_PIXELS,
color_conversion_strategy=(
db_settings.color_conversion_strategy
ocr_db_settings.color_conversion_strategy
or settings.OCR_COLOR_CONVERSION_STRATEGY
),
user_args=user_args,

View File

@@ -2,7 +2,6 @@ import os
import shutil
import tempfile
import uuid
from contextlib import AbstractContextManager
from pathlib import Path
from unittest import mock
@@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text
image_to_string_calls = []
def fake_convert(input_file, output_file, **kwargs):
with open(input_file) as f:
lines = f.readlines()
for i, line in enumerate(lines):
with open(output_file % i, "w") as f2:
f2.write(line.strip())
class FakeImageFile(AbstractContextManager):
def __init__(self, fname):
self.fname = fname
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def __enter__(self):
return os.path.basename(self.fname)
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

View File

@@ -0,0 +1,120 @@
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import CommonSettings
from paperless.models import OcrSettings
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Check that values stored in the settings models are what end up in the
    ocrmypdf arguments, even when the matching Django setting holds a
    different value.

    Every test follows the same shape: override the Django setting with one
    value, store a *different* value on the database row, then build the
    parameters and assert the database value was used.
    """

    @staticmethod
    def get_params():
        """
        Build the ocrmypdf parameters for a fixed PDF input.

        A new parser is constructed on every call, so this must be invoked
        after the database row has been saved and while the settings
        override is still active.
        """
        return RasterisedDocumentParser(None).construct_ocrmypdf_parameters(
            input_file="input.pdf",
            output_file="output.pdf",
            sidecar_file="sidecar.txt",
            mime_type="application/pdf",
            safe_fallback=False,
        )

    @staticmethod
    def _ocr_settings_row():
        """
        Return the single OcrSettings row, creating it when it is missing so
        a test never fails with an AttributeError on None.
        """
        instance = OcrSettings.objects.all().first()
        if instance is None:
            instance = OcrSettings.objects.create()
        return instance

    @staticmethod
    def _common_settings_row():
        """
        Return the single CommonSettings row, creating it when it is missing.
        """
        instance = CommonSettings.objects.all().first()
        if instance is None:
            instance = CommonSettings.objects.create()
        return instance

    def test_db_settings_ocr_pages(self):
        """The page limit from the database becomes the "1-N" page range."""
        with override_settings(OCR_PAGES=10):
            instance = self._ocr_settings_row()
            instance.pages = 5
            instance.save()

            params = self.get_params()
        self.assertEqual(params["pages"], "1-5")

    def test_db_settings_ocr_language(self):
        """The language string from the database is passed through."""
        with override_settings(OCR_LANGUAGE="eng+deu"):
            instance = self._ocr_settings_row()
            instance.language = "fra+ita"
            instance.save()

            params = self.get_params()
        self.assertEqual(params["language"], "fra+ita")

    def test_db_settings_ocr_output_type(self):
        """The output type from the database is passed through."""
        # The output type is a common setting, so it has to be stored on the
        # CommonSettings row, and the setting to override is OCR_OUTPUT_TYPE.
        # Writing it to an OcrSettings instance (or overriding OCR_LANGUAGE)
        # would leave the database value unused and let the test pass on the
        # default alone.
        with override_settings(OCR_OUTPUT_TYPE="pdfa-3"):
            instance = self._common_settings_row()
            instance.output_type = CommonSettings.OutputTypeChoices.PDF_A
            instance.save()

            params = self.get_params()
        self.assertEqual(params["output_type"], "pdfa")

    def test_db_settings_ocr_mode(self):
        """The skip mode from the database selects skip_text only."""
        with override_settings(OCR_MODE="redo"):
            instance = self._ocr_settings_row()
            instance.mode = OcrSettings.ModeChoices.SKIP
            instance.save()

            params = self.get_params()
        self.assertTrue(params["skip_text"])
        self.assertNotIn("redo_ocr", params)
        self.assertNotIn("force_ocr", params)

    def test_db_settings_ocr_clean(self):
        """Both unpaper clean choices from the database are honoured."""
        with override_settings(OCR_CLEAN="clean-final"):
            instance = self._ocr_settings_row()
            instance.unpaper_clean = OcrSettings.CleanChoices.CLEAN
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean"])
        self.assertNotIn("clean_final", params)

        # Use the opposite Django setting here; with "clean-final" on both
        # sides the assertions could not tell which source was used.
        with override_settings(OCR_CLEAN="clean"):
            instance = self._ocr_settings_row()
            instance.unpaper_clean = OcrSettings.CleanChoices.FINAL
            instance.save()

            params = self.get_params()
        self.assertTrue(params["clean_final"])
        self.assertNotIn("clean", params)

    def test_db_settings_ocr_deskew(self):
        """Deskew enabled in the database wins over a disabled setting."""
        with override_settings(OCR_DESKEW=False):
            instance = self._ocr_settings_row()
            instance.deskew = True
            instance.save()

            params = self.get_params()
        self.assertTrue(params["deskew"])

    def test_db_settings_ocr_rotate(self):
        """Rotation and its threshold are both taken from the database."""
        with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
            instance = self._ocr_settings_row()
            instance.rotate_pages = True
            instance.rotate_pages_threshold = 15.0
            instance.save()

            params = self.get_params()
        self.assertTrue(params["rotate_pages"])
        self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)

    def test_db_settings_ocr_max_pixels(self):
        """The pixel limit from the database is reported in megapixels."""
        with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
            instance = self._ocr_settings_row()
            instance.max_image_pixels = 1_000_000.0
            instance.save()

            params = self.get_params()
        self.assertAlmostEqual(params["max_image_mpixels"], 1.0)

    def test_db_settings_ocr_color_convert(self):
        """The colour conversion strategy from the database is passed through."""
        with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
            instance = self._ocr_settings_row()
            instance.color_conversion_strategy = (
                OcrSettings.ColorConvertChoices.INDEPENDENT
            )
            instance.save()

            params = self.get_params()
        self.assertEqual(
            params["color_conversion_strategy"],
            "UseDeviceIndependentColor",
        )