diff --git a/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html b/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html
index c32eb8e80..b584c86d5 100644
--- a/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html
+++ b/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html
@@ -28,6 +28,10 @@
+
+ {{ file.fileEntry.name }}
+
+
{
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index 863eea8ad..790281ca8 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -296,6 +296,8 @@ class Consumer(LoggingMixin):
override_owner_id=None,
override_storage_path_id=None,
full_path=None,
+ is_large_file=None,
+ ocr_specific_pages=None
) -> Document:
"""
Return the document object if it was successfully created.
@@ -390,7 +392,11 @@ class Consumer(LoggingMixin):
try:
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self.log("debug", f"Parsing {self.filename}...")
- document_parser.parse(self.path, mime_type, self.filename)
+ custom_options = {
+ 'is_large_file': is_large_file,
+ 'ocr_specific_pages': ocr_specific_pages
+ }
+ document_parser.parse(self.path, mime_type, self.filename, custom_options)
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
diff --git a/src/documents/data_models.py b/src/documents/data_models.py
index 9ea720b00..251406db3 100644
--- a/src/documents/data_models.py
+++ b/src/documents/data_models.py
@@ -26,6 +26,8 @@ class DocumentMetadataOverrides:
owner_id: Optional[int] = None
storage_path_id: Optional[int] = None
full_path: Optional[str] = None
+ is_large_file: Optional[bool] = None
+ ocr_specific_pages: Optional[str] = None
class DocumentSource(enum.IntEnum):
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 1217410a0..9b063038a 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -726,6 +726,8 @@ class PostDocumentSerializer(serializers.Serializer):
min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN,
max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX,
)
+
+ # Custom fields
storage_path_id = serializers.IntegerField(
label="Storage path ID",
@@ -735,7 +737,21 @@ class PostDocumentSerializer(serializers.Serializer):
)
full_path = serializers.CharField(
- label="Full Path",
+ label="Full path",
+ allow_null=True,
+ write_only=True,
+ required=False,
+ )
+
+    # Parsed as a real boolean: with a CharField a submitted "false" would
+    # arrive as a non-empty (truthy) string, whereas
+    # DocumentMetadataOverrides.is_large_file is declared Optional[bool].
+    is_large_file = serializers.BooleanField(
+        label="Is large file",
+        allow_null=True,
+        write_only=True,
+        required=False,
+    )
+
+ ocr_specific_pages = serializers.CharField(
+ label="OCR specific pages",
allow_null=True,
write_only=True,
required=False,
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 33e93b703..81d627335 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -201,7 +201,9 @@ def consume_file(
override_asn=overrides.asn,
override_owner_id=overrides.owner_id,
override_storage_path_id=overrides.storage_path_id,
- full_path=overrides.full_path
+ full_path=overrides.full_path,
+ is_large_file=overrides.is_large_file,
+ ocr_specific_pages=overrides.ocr_specific_pages
)
if document:
diff --git a/src/documents/views.py b/src/documents/views.py
index 9abe84eb7..e1ae9c4cc 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -719,6 +719,8 @@ class PostDocumentView(GenericAPIView):
archive_serial_number = serializer.validated_data.get("archive_serial_number")
storage_path_id = serializer.validated_data.get("storage_path_id")
full_path = serializer.validated_data.get("full_path")
+ is_large_file = serializer.validated_data.get("is_large_file")
+ ocr_specific_pages = serializer.validated_data.get("ocr_specific_pages")
logger.debug(f"storage_path_id: {storage_path_id}")
@@ -750,6 +752,8 @@ class PostDocumentView(GenericAPIView):
# owner_id=request.user.id,
storage_path_id=storage_path_id,
full_path=full_path,
+ is_large_file=is_large_file,
+ ocr_specific_pages=ocr_specific_pages
)
async_task = consume_file.delay(
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index f3e8e21fd..6b63516f7 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -1,413 +1,450 @@
-import json
-import os
-import re
-import subprocess
-import tempfile
-from pathlib import Path
-from typing import Optional
-
-from django.conf import settings
-from documents.parsers import DocumentParser
-from documents.parsers import make_thumbnail_from_pdf
-from documents.parsers import ParseError
-from PIL import Image
-
-
-class NoTextFoundException(Exception):
- pass
-
-
-class RtlLanguageException(Exception):
- pass
-
-
-class RasterisedDocumentParser(DocumentParser):
- """
- This parser uses Tesseract to try and get some text out of a rasterised
- image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
- """
-
- logging_name = "paperless.parsing.tesseract"
-
- def extract_metadata(self, document_path, mime_type):
-
- result = []
- if mime_type == "application/pdf":
- import pikepdf
-
- namespace_pattern = re.compile(r"\{(.*)\}(.*)")
-
- pdf = pikepdf.open(document_path)
- meta = pdf.open_metadata()
- for key, value in meta.items():
- if isinstance(value, list):
- value = " ".join([str(e) for e in value])
- value = str(value)
- try:
- m = namespace_pattern.match(key)
- result.append(
- {
- "namespace": m.group(1),
- "prefix": meta.REVERSE_NS[m.group(1)],
- "key": m.group(2),
- "value": value,
- },
- )
- except Exception as e:
- self.log(
- "warning",
- f"Error while reading metadata {key}: {value}. Error: {e}",
- )
- return result
-
- def get_thumbnail(self, document_path, mime_type, file_name=None):
- return make_thumbnail_from_pdf(
- self.archive_path or document_path,
- self.tempdir,
- self.logging_group,
- )
-
- def is_image(self, mime_type):
- return mime_type in [
- "image/png",
- "image/jpeg",
- "image/tiff",
- "image/bmp",
- "image/gif",
- "image/webp",
- ]
-
- def has_alpha(self, image):
- with Image.open(image) as im:
- return im.mode in ("RGBA", "LA")
-
- def remove_alpha(self, image_path: str):
- subprocess.run(
- [
- settings.CONVERT_BINARY,
- "-alpha",
- "off",
- image_path,
- image_path,
- ],
- )
-
- def get_dpi(self, image):
- try:
- with Image.open(image) as im:
- x, y = im.info["dpi"]
- return round(x)
- except Exception as e:
- self.log("warning", f"Error while getting DPI from image {image}: {e}")
- return None
-
- def calculate_a4_dpi(self, image):
- try:
- with Image.open(image) as im:
- width, height = im.size
- # divide image width by A4 width (210mm) in inches.
- dpi = int(width / (21 / 2.54))
- self.log("debug", f"Estimated DPI {dpi} based on image width {width}")
- return dpi
-
- except Exception as e:
- self.log("warning", f"Error while calculating DPI for image {image}: {e}")
- return None
-
- def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
- # When re-doing OCR, the sidecar contains ONLY the new text, not
- # the whole text, so do not utilize it in that case
- if (
- sidecar_file is not None
- and os.path.isfile(sidecar_file)
- and settings.OCR_MODE != "redo"
- ):
- with open(sidecar_file) as f:
- text = f.read()
-
- if "[OCR skipped on page" not in text:
- # This happens when there's already text in the input file.
- # The sidecar file will only contain text for OCR'ed pages.
- self.log("debug", "Using text from sidecar file")
- return post_process_text(text)
- else:
- self.log("debug", "Incomplete sidecar file: discarding.")
-
- # no success with the sidecar file, try PDF
-
- if not os.path.isfile(pdf_file):
- return None
-
- try:
- text = None
- with tempfile.NamedTemporaryFile(
- mode="w+",
- dir=self.tempdir,
- ) as tmp:
- subprocess.run(
- [
- "pdftotext",
- "-q",
- "-layout",
- "-enc",
- "UTF-8",
- pdf_file,
- tmp.name,
- ],
- )
- text = tmp.read()
-
- return post_process_text(text)
-
- except Exception:
- # If pdftotext fails, fall back to OCR.
- self.log(
- "warning",
- "Error while getting text from PDF document with pdftotext",
- exc_info=True,
- )
- # probably not a PDF file.
- return None
-
- def construct_ocrmypdf_parameters(
- self,
- input_file,
- mime_type,
- output_file,
- sidecar_file,
- safe_fallback=False,
- ):
- ocrmypdf_args = {
- "input_file": input_file,
- "output_file": output_file,
- # need to use threads, since this will be run in daemonized
- # processes via the task library.
- "use_threads": True,
- "jobs": settings.THREADS_PER_WORKER,
- "language": settings.OCR_LANGUAGE,
- "output_type": settings.OCR_OUTPUT_TYPE,
- "progress_bar": False,
- }
-
- if settings.OCR_MODE == "force" or safe_fallback:
- ocrmypdf_args["force_ocr"] = True
- elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
- ocrmypdf_args["skip_text"] = True
- elif settings.OCR_MODE == "redo":
- ocrmypdf_args["redo_ocr"] = True
- else:
- raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
-
- if settings.OCR_CLEAN == "clean":
- ocrmypdf_args["clean"] = True
- elif settings.OCR_CLEAN == "clean-final":
- if settings.OCR_MODE == "redo":
- ocrmypdf_args["clean"] = True
- else:
- # --clean-final is not compatible with --redo-ocr
- ocrmypdf_args["clean_final"] = True
-
- if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
- # --deskew is not compatible with --redo-ocr
- ocrmypdf_args["deskew"] = True
-
- if settings.OCR_ROTATE_PAGES:
- ocrmypdf_args["rotate_pages"] = True
- ocrmypdf_args[
- "rotate_pages_threshold"
- ] = settings.OCR_ROTATE_PAGES_THRESHOLD
-
- if settings.OCR_PAGES > 0:
- ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
- else:
- # sidecar is incompatible with pages
- ocrmypdf_args["sidecar"] = sidecar_file
-
- if self.is_image(mime_type):
- dpi = self.get_dpi(input_file)
- a4_dpi = self.calculate_a4_dpi(input_file)
-
- if self.has_alpha(input_file):
- self.log(
- "info",
- f"Removing alpha layer from {input_file} "
- "for compatibility with img2pdf",
- )
- self.remove_alpha(input_file)
-
- if dpi:
- self.log("debug", f"Detected DPI for image {input_file}: {dpi}")
- ocrmypdf_args["image_dpi"] = dpi
- elif settings.OCR_IMAGE_DPI:
- ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
- elif a4_dpi:
- ocrmypdf_args["image_dpi"] = a4_dpi
- else:
- raise ParseError(
- f"Cannot produce archive PDF for image {input_file}, "
- f"no DPI information is present in this image and "
- f"OCR_IMAGE_DPI is not set.",
- )
-
- if settings.OCR_USER_ARGS and not safe_fallback:
- try:
- user_args = json.loads(settings.OCR_USER_ARGS)
- ocrmypdf_args = {**ocrmypdf_args, **user_args}
- except Exception as e:
- self.log(
- "warning",
- f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
- f"they will not be used. Error: {e}",
- )
-
- if settings.OCR_MAX_IMAGE_PIXELS is not None:
- # Convert pixels to mega-pixels and provide to ocrmypdf
- max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
- if max_pixels_mpixels > 0:
-
- self.log(
- "debug",
- f"Calculated {max_pixels_mpixels} megapixels for OCR",
- )
-
- ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
- else:
- self.log(
- "warning",
- "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
- "this value must be at least 1 megapixel if set",
- )
-
- return ocrmypdf_args
-
- def parse(self, document_path: Path, mime_type, file_name=None):
- # This forces tesseract to use one core per page.
- os.environ["OMP_THREAD_LIMIT"] = "1"
- VALID_TEXT_LENGTH = 50
-
- if mime_type == "application/pdf":
- text_original = self.extract_text(None, document_path)
- original_has_text = (
- text_original is not None and len(text_original) > VALID_TEXT_LENGTH
- )
- else:
- text_original = None
- original_has_text = False
-
- # If the original has text, and the user doesn't want an archive,
- # we're done here
- skip_archive_for_text = (
- settings.OCR_MODE == "skip_noarchive"
- or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
- )
- if skip_archive_for_text and original_has_text:
- self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
- self.text = text_original
- return
-
- # Either no text was in the original or there should be an archive
- # file created, so OCR the file and create an archive with any
- # text located via OCR
-
- import ocrmypdf
- from ocrmypdf import InputFileError, EncryptedPdfError
-
- archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
- sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
-
- args = self.construct_ocrmypdf_parameters(
- document_path,
- mime_type,
- archive_path,
- sidecar_file,
- )
-
- try:
- self.log("debug", f"Calling OCRmyPDF with args: {args}")
- ocrmypdf.ocr(**args)
-
- if settings.OCR_SKIP_ARCHIVE_FILE != "always":
- self.archive_path = archive_path
-
- self.text = self.extract_text(sidecar_file, archive_path)
-
- if not self.text:
- raise NoTextFoundException("No text was found in the original document")
- except EncryptedPdfError:
- self.log(
- "warning",
- "This file is encrypted, OCR is impossible. Using "
- "any text present in the original file.",
- )
- if original_has_text:
- self.text = text_original
- except (NoTextFoundException, InputFileError) as e:
- self.log(
- "warning",
- f"Encountered an error while running OCR: {str(e)}. "
- f"Attempting force OCR to get the text.",
- )
-
- archive_path_fallback = Path(
- os.path.join(self.tempdir, "archive-fallback.pdf"),
- )
- sidecar_file_fallback = Path(
- os.path.join(self.tempdir, "sidecar-fallback.txt"),
- )
-
- # Attempt to run OCR with safe settings.
-
- args = self.construct_ocrmypdf_parameters(
- document_path,
- mime_type,
- archive_path_fallback,
- sidecar_file_fallback,
- safe_fallback=True,
- )
-
- try:
- self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}")
- ocrmypdf.ocr(**args)
-
- # Don't return the archived file here, since this file
- # is bigger and blurry due to --force-ocr.
-
- self.text = self.extract_text(
- sidecar_file_fallback,
- archive_path_fallback,
- )
-
- except Exception as e:
- # If this fails, we have a serious issue at hand.
- raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
-
- except Exception as e:
- # Anything else is probably serious.
- raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
-
- # As a last resort, if we still don't have any text for any reason,
- # try to extract the text from the original document.
- if not self.text:
- if original_has_text:
- self.text = text_original
- else:
- self.log(
- "warning",
- f"No text was found in {document_path}, the content will "
- f"be empty.",
- )
- self.text = ""
-
-
-def post_process_text(text):
- if not text:
- return None
-
- collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
- no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
- no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
-
- # TODO: this needs a rework
- # replace \0 prevents issues with saving to postgres.
- # text may contain \0 when this character is present in PDF files.
- return no_trailing_whitespace.strip().replace("\0", " ")
+import json
+import os
+import re
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import Optional
+
+from django.conf import settings
+from documents.parsers import DocumentParser
+from documents.parsers import make_thumbnail_from_pdf
+from documents.parsers import ParseError
+from PIL import Image
+
+
+class NoTextFoundException(Exception):
+ pass
+
+
+class RtlLanguageException(Exception):
+ pass
+
+
+class RasterisedDocumentParser(DocumentParser):
+ """
+ This parser uses Tesseract to try and get some text out of a rasterised
+ image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
+ """
+
+ logging_name = "paperless.parsing.tesseract"
+
+ def extract_metadata(self, document_path, mime_type):
+
+ result = []
+ if mime_type == "application/pdf":
+ import pikepdf
+
+ namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+ pdf = pikepdf.open(document_path)
+ meta = pdf.open_metadata()
+ for key, value in meta.items():
+ if isinstance(value, list):
+ value = " ".join([str(e) for e in value])
+ value = str(value)
+ try:
+ m = namespace_pattern.match(key)
+ result.append(
+ {
+ "namespace": m.group(1),
+ "prefix": meta.REVERSE_NS[m.group(1)],
+ "key": m.group(2),
+ "value": value,
+ },
+ )
+ except Exception as e:
+ self.log(
+ "warning",
+ f"Error while reading metadata {key}: {value}. Error: {e}",
+ )
+ return result
+
+ def get_thumbnail(self, document_path, mime_type, file_name=None):
+ return make_thumbnail_from_pdf(
+ self.archive_path or document_path,
+ self.tempdir,
+ self.logging_group,
+ )
+
+ def is_image(self, mime_type):
+ return mime_type in [
+ "image/png",
+ "image/jpeg",
+ "image/tiff",
+ "image/bmp",
+ "image/gif",
+ "image/webp",
+ ]
+
+ def has_alpha(self, image):
+ with Image.open(image) as im:
+ return im.mode in ("RGBA", "LA")
+
+ def remove_alpha(self, image_path: str):
+ subprocess.run(
+ [
+ settings.CONVERT_BINARY,
+ "-alpha",
+ "off",
+ image_path,
+ image_path,
+ ],
+ )
+
+ def get_dpi(self, image):
+ try:
+ with Image.open(image) as im:
+ x, y = im.info["dpi"]
+ return round(x)
+ except Exception as e:
+ self.log("warning", f"Error while getting DPI from image {image}: {e}")
+ return None
+
+ def calculate_a4_dpi(self, image):
+ try:
+ with Image.open(image) as im:
+ width, height = im.size
+ # divide image width by A4 width (210mm) in inches.
+ dpi = int(width / (21 / 2.54))
+ self.log("debug", f"Estimated DPI {dpi} based on image width {width}")
+ return dpi
+
+ except Exception as e:
+ self.log("warning", f"Error while calculating DPI for image {image}: {e}")
+ return None
+
+    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path, custom_options=None):
+        """
+        Return the post-processed text of a document, or None if there is none.
+
+        The OCRmyPDF sidecar file is used when it exists and the OCR mode is
+        not "redo"; otherwise the text is read from ``pdf_file`` via pdftotext.
+
+        :param sidecar_file: sidecar text file written by OCRmyPDF, or None
+        :param pdf_file: PDF handed to pdftotext when the sidecar is unusable
+        :param custom_options: optional dict of per-document options. A truthy
+            "is_large_file" entry means pages may have been skipped on
+            purpose, so an incomplete sidecar is accepted as-is.
+        :return: the cleaned text, or None if no text could be obtained
+        """
+        # The consumer always sends the "is_large_file" key (usually set to
+        # None), so the value has to be tested, not the key's presence.
+        accept_partial_sidecar = False
+        if custom_options is not None:
+            large_flag = custom_options.get("is_large_file")
+            if isinstance(large_flag, str):
+                # The upload serializer may deliver the flag as text.
+                large_flag = large_flag.strip().lower() in ("1", "true", "yes", "on")
+            accept_partial_sidecar = bool(large_flag)
+
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
+        # the whole text, so do not utilize it in that case
+        if (
+            sidecar_file is not None
+            and os.path.isfile(sidecar_file)
+            and settings.OCR_MODE != "redo"
+        ):
+            with open(sidecar_file) as f:
+                text = f.read()
+
+            if "[OCR skipped on page" not in text:
+                # This happens when there's already text in the input file.
+                # The sidecar file will only contain text for OCR'ed pages.
+                self.log("debug", "Using text from sidecar file")
+                return post_process_text(text)
+            elif accept_partial_sidecar:
+                self.log("debug", "File is large so some pages may have been skipped intentionally. Using text from incomplete sidecar file")
+                return post_process_text(text)
+            else:
+                self.log("debug", "Incomplete sidecar file: discarding.")
+
+        # no success with the sidecar file, try PDF
+
+        if not os.path.isfile(pdf_file):
+            return None
+
+        try:
+            text = None
+            with tempfile.NamedTemporaryFile(
+                mode="w+",
+                dir=self.tempdir,
+            ) as tmp:
+                subprocess.run(
+                    [
+                        "pdftotext",
+                        "-q",
+                        "-layout",
+                        "-enc",
+                        "UTF-8",
+                        pdf_file,
+                        tmp.name,
+                    ],
+                )
+                text = tmp.read()
+
+            return post_process_text(text)
+
+        except Exception:
+            # If pdftotext fails, fall back to OCR.
+            self.log(
+                "warning",
+                "Error while getting text from PDF document with pdftotext",
+                exc_info=True,
+            )
+            # probably not a PDF file.
+            return None
+
+    def construct_ocrmypdf_parameters(
+        self,
+        input_file,
+        mime_type,
+        output_file,
+        sidecar_file,
+        safe_fallback=False,
+        # used for large files, to only do OCR on specific pages
+        is_large_file=False,
+        specific_pages=None
+    ):
+        """
+        Build the keyword arguments passed to ``ocrmypdf.ocr``.
+
+        :param input_file: document to run OCR on
+        :param mime_type: mime type of ``input_file``; image types get a DPI
+        :param output_file: where the archive PDF is written
+        :param sidecar_file: where the sidecar text is written (only
+            requested when settings.OCR_PAGES is not positive)
+        :param safe_fallback: force OCR and ignore settings.OCR_USER_ARGS
+        :param is_large_file: restrict OCR to ``specific_pages`` (or disable
+            it when no pages are given) and switch off the expensive
+            pre-/post-processing steps
+        :param specific_pages: page selection string used as the "pages"
+            argument for large files, e.g. "1,3-5"
+        :return: dict of arguments for ``ocrmypdf.ocr``
+        :raises ParseError: on an invalid OCR mode, or when no DPI can be
+            determined for an image
+        """
+        ocrmypdf_args = {
+            "input_file": input_file,
+            "output_file": output_file,
+            # need to use threads, since this will be run in daemonized
+            # processes via the task library.
+            "use_threads": True,
+            "jobs": settings.THREADS_PER_WORKER,
+            "language": settings.OCR_LANGUAGE,
+            "output_type": settings.OCR_OUTPUT_TYPE,
+            "progress_bar": False,
+        }
+
+        if settings.OCR_MODE == "force" or safe_fallback:
+            ocrmypdf_args["force_ocr"] = True
+        elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
+            ocrmypdf_args["skip_text"] = True
+        elif settings.OCR_MODE == "redo":
+            ocrmypdf_args["redo_ocr"] = True
+        else:
+            raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
+
+        if settings.OCR_CLEAN == "clean":
+            ocrmypdf_args["clean"] = True
+        elif settings.OCR_CLEAN == "clean-final":
+            if settings.OCR_MODE == "redo":
+                ocrmypdf_args["clean"] = True
+            else:
+                # --clean-final is not compatible with --redo-ocr
+                ocrmypdf_args["clean_final"] = True
+
+        if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
+            # --deskew is not compatible with --redo-ocr
+            ocrmypdf_args["deskew"] = True
+
+        if settings.OCR_ROTATE_PAGES:
+            ocrmypdf_args["rotate_pages"] = True
+            ocrmypdf_args[
+                "rotate_pages_threshold"
+            ] = settings.OCR_ROTATE_PAGES_THRESHOLD
+
+        if settings.OCR_PAGES > 0:
+            ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
+        else:
+            # sidecar is incompatible with pages
+            ocrmypdf_args["sidecar"] = sidecar_file
+
+        if isinstance(specific_pages, str):
+            # Tolerate stray whitespace and leading/trailing commas; an empty
+            # selection is treated the same as "no pages given".
+            specific_pages = specific_pages.strip(", \t\r\n") or None
+
+        if is_large_file and specific_pages is not None:
+            ocrmypdf_args["pages"] = specific_pages
+        elif is_large_file:
+            self.log("debug", "Large file but did not specify pages, so disabling OCR")
+            ocrmypdf_args["tesseract_timeout"] = 0
+
+        # Regardless of other options, disable postprocessing if large file
+        # Source: https://ocrmypdf.readthedocs.io/en/latest/performance.html?highlight=Postprocessing#speed
+        if is_large_file:
+            self.log("debug", "Since large file, disabling postprocessing")
+            # Use the underscore keyword names so these values replace the
+            # defaults set above (e.g. "output_type") instead of being added
+            # next to them under a second, hyphenated key.
+            ocrmypdf_args["optimize"] = 0
+            ocrmypdf_args["output_type"] = 'pdf'
+            ocrmypdf_args["fast_web_view"] = 0
+            ocrmypdf_args["skip_big"] = 200
+            ocrmypdf_args["deskew"] = False
+            ocrmypdf_args["rotate_pages"] = False
+            ocrmypdf_args["clean"] = False
+            # "clean-final" may have been enabled above; switch it off too.
+            ocrmypdf_args["clean_final"] = False
+
+        if self.is_image(mime_type):
+            dpi = self.get_dpi(input_file)
+            a4_dpi = self.calculate_a4_dpi(input_file)
+
+            if self.has_alpha(input_file):
+                self.log(
+                    "info",
+                    f"Removing alpha layer from {input_file} "
+                    "for compatibility with img2pdf",
+                )
+                self.remove_alpha(input_file)
+
+            if dpi:
+                self.log("debug", f"Detected DPI for image {input_file}: {dpi}")
+                ocrmypdf_args["image_dpi"] = dpi
+            elif settings.OCR_IMAGE_DPI:
+                ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
+            elif a4_dpi:
+                ocrmypdf_args["image_dpi"] = a4_dpi
+            else:
+                raise ParseError(
+                    f"Cannot produce archive PDF for image {input_file}, "
+                    f"no DPI information is present in this image and "
+                    f"OCR_IMAGE_DPI is not set.",
+                )
+
+        if settings.OCR_USER_ARGS and not safe_fallback:
+            try:
+                user_args = json.loads(settings.OCR_USER_ARGS)
+                ocrmypdf_args = {**ocrmypdf_args, **user_args}
+            except Exception as e:
+                self.log(
+                    "warning",
+                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+                    f"they will not be used. Error: {e}",
+                )
+
+        if settings.OCR_MAX_IMAGE_PIXELS is not None:
+            # Convert pixels to mega-pixels and provide to ocrmypdf
+            max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
+            if max_pixels_mpixels > 0:
+
+                self.log(
+                    "debug",
+                    f"Calculated {max_pixels_mpixels} megapixels for OCR",
+                )
+
+                ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+            else:
+                self.log(
+                    "warning",
+                    "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
+                    "this value must be at least 1 megapixel if set",
+                )
+
+        return ocrmypdf_args
+
+    def parse(self, document_path: Path, mime_type, file_name=None, custom_options=None):
+        """
+        Extract the document text into ``self.text``, running OCRmyPDF if needed.
+
+        When OCR is run and settings.OCR_SKIP_ARCHIVE_FILE is not "always",
+        ``self.archive_path`` is set to the generated archive PDF.
+
+        :param document_path: path of the document to parse
+        :param mime_type: mime type of the document
+        :param file_name: unused by this parser
+        :param custom_options: optional dict of per-document options;
+            "is_large_file" enables the large-file OCR settings and
+            "ocr_specific_pages" selects the pages to OCR
+        :raises ParseError: if OCR fails for anything other than an encrypted
+            PDF, or if the force-OCR fallback fails as well
+        """
+        # This forces tesseract to use one core per page.
+        os.environ["OMP_THREAD_LIMIT"] = "1"
+        VALID_TEXT_LENGTH = 50
+
+        if mime_type == "application/pdf":
+            text_original = self.extract_text(None, document_path)
+            original_has_text = (
+                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+            )
+        else:
+            text_original = None
+            original_has_text = False
+
+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
+        skip_archive_for_text = (
+            settings.OCR_MODE == "skip_noarchive"
+            or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
+        )
+        if skip_archive_for_text and original_has_text:
+            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
+            self.text = text_original
+            return
+
+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
+        import ocrmypdf
+        from ocrmypdf import InputFileError, EncryptedPdfError
+
+        archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
+        sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
+
+        specific_pages = None
+        is_large_file = False
+        if custom_options is not None:
+            specific_pages = custom_options.get('ocr_specific_pages')
+            is_large_file = custom_options.get('is_large_file')
+            if isinstance(is_large_file, str):
+                # The flag may arrive as text, where "false" would be truthy.
+                is_large_file = is_large_file.strip().lower() in (
+                    "1",
+                    "true",
+                    "yes",
+                    "on",
+                )
+            # The key is usually present but None; collapse to a real bool.
+            is_large_file = bool(is_large_file)
+
+        # Only hand options to extract_text for genuine large files, so an
+        # incomplete sidecar is never accepted for ordinary documents.
+        sidecar_options = (
+            {**custom_options, 'is_large_file': True} if is_large_file else None
+        )
+
+        args = self.construct_ocrmypdf_parameters(
+            document_path,
+            mime_type,
+            archive_path,
+            sidecar_file,
+            is_large_file=is_large_file,
+            specific_pages=specific_pages
+        )
+
+        try:
+            self.log("debug", f"Calling OCRmyPDF with args: {args}")
+            ocrmypdf.ocr(**args)
+
+            if settings.OCR_SKIP_ARCHIVE_FILE != "always":
+                self.archive_path = archive_path
+
+            self.text = self.extract_text(
+                sidecar_file,
+                archive_path,
+                custom_options=sidecar_options,
+            )
+
+            if not self.text:
+                raise NoTextFoundException("No text was found in the original document")
+        except EncryptedPdfError:
+            self.log(
+                "warning",
+                "This file is encrypted, OCR is impossible. Using "
+                "any text present in the original file.",
+            )
+            if original_has_text:
+                self.text = text_original
+        except (NoTextFoundException, InputFileError) as e:
+            self.log(
+                "warning",
+                f"Encountered an error while running OCR: {str(e)}. "
+                f"Attempting force OCR to get the text.",
+            )
+
+            archive_path_fallback = Path(
+                os.path.join(self.tempdir, "archive-fallback.pdf"),
+            )
+            sidecar_file_fallback = Path(
+                os.path.join(self.tempdir, "sidecar-fallback.txt"),
+            )
+
+            # Attempt to run OCR with safe settings.
+
+            args = self.construct_ocrmypdf_parameters(
+                document_path,
+                mime_type,
+                archive_path_fallback,
+                sidecar_file_fallback,
+                safe_fallback=True,
+                is_large_file=is_large_file,
+                specific_pages=specific_pages
+            )
+
+            try:
+                self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}")
+                ocrmypdf.ocr(**args)
+
+                # Don't return the archived file here, since this file
+                # is bigger and blurry due to --force-ocr.
+
+                # Same options as the first attempt: the fallback also limits
+                # OCR to the selected pages for large files.
+                self.text = self.extract_text(
+                    sidecar_file_fallback,
+                    archive_path_fallback,
+                    custom_options=sidecar_options,
+                )
+
+            except Exception as e:
+                # If this fails, we have a serious issue at hand.
+                raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
+
+        except Exception as e:
+            # Anything else is probably serious.
+            raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
+
+        # As a last resort, if we still don't have any text for any reason,
+        # try to extract the text from the original document.
+        if not self.text:
+            if original_has_text:
+                self.text = text_original
+            else:
+                self.log(
+                    "warning",
+                    f"No text was found in {document_path}, the content will "
+                    f"be empty.",
+                )
+                self.text = ""
+
+
+def post_process_text(text):
+ if not text:
+ return None
+
+ collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
+ no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
+ no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
+
+ # TODO: this needs a rework
+ # replace \0 prevents issues with saving to postgres.
+ # text may contain \0 when this character is present in PDF files.
+ return no_trailing_whitespace.strip().replace("\0", " ")