From dec3b582700bb8c25bd5f99e2304b3d388b1898c Mon Sep 17 00:00:00 2001 From: Martin Tan Date: Sun, 3 Sep 2023 18:15:03 +0800 Subject: [PATCH] Add basic large file upload feature --- .../upload-large-file.component.html | 4 + .../upload-large-file.component.ts | 9 +- src/documents/consumer.py | 8 +- src/documents/data_models.py | 2 + src/documents/serialisers.py | 18 +- src/documents/tasks.py | 4 +- src/documents/views.py | 4 + src/paperless_tesseract/parsers.py | 863 +++++++++--------- 8 files changed, 493 insertions(+), 419 deletions(-) diff --git a/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html b/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html index c32eb8e80..b584c86d5 100644 --- a/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html +++ b/src-ui/src/app/components/common/create-dialog/upload-large-file/upload-large-file.component.html @@ -28,6 +28,10 @@ +
+ {{ file.fileEntry.name }} +
+
{ diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 863eea8ad..790281ca8 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -296,6 +296,8 @@ class Consumer(LoggingMixin): override_owner_id=None, override_storage_path_id=None, full_path=None, + is_large_file=None, + ocr_specific_pages=None ) -> Document: """ Return the document object if it was successfully created. @@ -390,7 +392,11 @@ class Consumer(LoggingMixin): try: self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT) self.log("debug", f"Parsing {self.filename}...") - document_parser.parse(self.path, mime_type, self.filename) + custom_options = { + 'is_large_file': is_large_file, + 'ocr_specific_pages': ocr_specific_pages + } + document_parser.parse(self.path, mime_type, self.filename, custom_options) self.log("debug", f"Generating thumbnail for {self.filename}...") self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) diff --git a/src/documents/data_models.py b/src/documents/data_models.py index 9ea720b00..251406db3 100644 --- a/src/documents/data_models.py +++ b/src/documents/data_models.py @@ -26,6 +26,8 @@ class DocumentMetadataOverrides: owner_id: Optional[int] = None storage_path_id: Optional[int] = None full_path: Optional[str] = None + is_large_file: Optional[bool] = None + ocr_specific_pages: Optional[str] = None class DocumentSource(enum.IntEnum): diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 1217410a0..9b063038a 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -726,6 +726,8 @@ class PostDocumentSerializer(serializers.Serializer): min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN, max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX, ) + + # Custom fields storage_path_id = serializers.IntegerField( label="Storage path ID", @@ -735,7 +737,21 @@ class PostDocumentSerializer(serializers.Serializer): ) full_path = serializers.CharField( - label="Full Path", + label="Full path", + allow_null=True, + write_only=True, + required=False, + ) + + is_large_file = serializers.CharField( + label="Is large file", + allow_null=True, + write_only=True, + required=False, + ) + + ocr_specific_pages = serializers.CharField( + label="OCR specific pages", allow_null=True, write_only=True, required=False, diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 33e93b703..81d627335 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -201,7 +201,9 @@ def consume_file( override_asn=overrides.asn, override_owner_id=overrides.owner_id, override_storage_path_id=overrides.storage_path_id, - full_path=overrides.full_path + full_path=overrides.full_path, + is_large_file=overrides.is_large_file, + ocr_specific_pages=overrides.ocr_specific_pages ) if document: diff --git a/src/documents/views.py b/src/documents/views.py index 9abe84eb7..e1ae9c4cc 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -719,6 +719,8 @@ class PostDocumentView(GenericAPIView): archive_serial_number = serializer.validated_data.get("archive_serial_number") storage_path_id = serializer.validated_data.get("storage_path_id") full_path = serializer.validated_data.get("full_path") + is_large_file = serializer.validated_data.get("is_large_file") + ocr_specific_pages = serializer.validated_data.get("ocr_specific_pages") logger.debug(f"storage_path_id: {storage_path_id}") @@ -750,6 +752,8 @@ class PostDocumentView(GenericAPIView): # owner_id=request.user.id, storage_path_id=storage_path_id, full_path=full_path, + is_large_file=is_large_file, + ocr_specific_pages=ocr_specific_pages ) async_task = consume_file.delay( diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index f3e8e21fd..6b63516f7 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -1,413 +1,450 @@ -import json -import os -import re -import subprocess -import tempfile -from pathlib import Path -from typing import Optional - -from django.conf import settings -from documents.parsers import DocumentParser -from documents.parsers import make_thumbnail_from_pdf -from documents.parsers import ParseError -from PIL import Image - - -class NoTextFoundException(Exception): - pass - - -class RtlLanguageException(Exception): - pass - - -class RasterisedDocumentParser(DocumentParser): - """ - This parser uses Tesseract to try and get some text out of a rasterised - image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) - """ - - logging_name = "paperless.parsing.tesseract" - - def extract_metadata(self, document_path, mime_type): - - result = [] - if mime_type == "application/pdf": - import pikepdf - - namespace_pattern = re.compile(r"\{(.*)\}(.*)") - - pdf = pikepdf.open(document_path) - meta = pdf.open_metadata() - for key, value in meta.items(): - if isinstance(value, list): - value = " ".join([str(e) for e in value]) - value = str(value) - try: - m = namespace_pattern.match(key) - result.append( - { - "namespace": m.group(1), - "prefix": meta.REVERSE_NS[m.group(1)], - "key": m.group(2), - "value": value, - }, - ) - except Exception as e: - self.log( - "warning", - f"Error while reading metadata {key}: {value}. Error: {e}", - ) - return result - - def get_thumbnail(self, document_path, mime_type, file_name=None): - return make_thumbnail_from_pdf( - self.archive_path or document_path, - self.tempdir, - self.logging_group, - ) - - def is_image(self, mime_type): - return mime_type in [ - "image/png", - "image/jpeg", - "image/tiff", - "image/bmp", - "image/gif", - "image/webp", - ] - - def has_alpha(self, image): - with Image.open(image) as im: - return im.mode in ("RGBA", "LA") - - def remove_alpha(self, image_path: str): - subprocess.run( - [ - settings.CONVERT_BINARY, - "-alpha", - "off", - image_path, - image_path, - ], - ) - - def get_dpi(self, image): - try: - with Image.open(image) as im: - x, y = im.info["dpi"] - return round(x) - except Exception as e: - self.log("warning", f"Error while getting DPI from image {image}: {e}") - return None - - def calculate_a4_dpi(self, image): - try: - with Image.open(image) as im: - width, height = im.size - # divide image width by A4 width (210mm) in inches. - dpi = int(width / (21 / 2.54)) - self.log("debug", f"Estimated DPI {dpi} based on image width {width}") - return dpi - - except Exception as e: - self.log("warning", f"Error while calculating DPI for image {image}: {e}") - return None - - def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path): - # When re-doing OCR, the sidecar contains ONLY the new text, not - # the whole text, so do not utilize it in that case - if ( - sidecar_file is not None - and os.path.isfile(sidecar_file) - and settings.OCR_MODE != "redo" - ): - with open(sidecar_file) as f: - text = f.read() - - if "[OCR skipped on page" not in text: - # This happens when there's already text in the input file. - # The sidecar file will only contain text for OCR'ed pages. - self.log("debug", "Using text from sidecar file") - return post_process_text(text) - else: - self.log("debug", "Incomplete sidecar file: discarding.") - - # no success with the sidecar file, try PDF - - if not os.path.isfile(pdf_file): - return None - - try: - text = None - with tempfile.NamedTemporaryFile( - mode="w+", - dir=self.tempdir, - ) as tmp: - subprocess.run( - [ - "pdftotext", - "-q", - "-layout", - "-enc", - "UTF-8", - pdf_file, - tmp.name, - ], - ) - text = tmp.read() - - return post_process_text(text) - - except Exception: - # If pdftotext fails, fall back to OCR. - self.log( - "warning", - "Error while getting text from PDF document with pdftotext", - exc_info=True, - ) - # probably not a PDF file. - return None - - def construct_ocrmypdf_parameters( - self, - input_file, - mime_type, - output_file, - sidecar_file, - safe_fallback=False, - ): - ocrmypdf_args = { - "input_file": input_file, - "output_file": output_file, - # need to use threads, since this will be run in daemonized - # processes via the task library. - "use_threads": True, - "jobs": settings.THREADS_PER_WORKER, - "language": settings.OCR_LANGUAGE, - "output_type": settings.OCR_OUTPUT_TYPE, - "progress_bar": False, - } - - if settings.OCR_MODE == "force" or safe_fallback: - ocrmypdf_args["force_ocr"] = True - elif settings.OCR_MODE in ["skip", "skip_noarchive"]: - ocrmypdf_args["skip_text"] = True - elif settings.OCR_MODE == "redo": - ocrmypdf_args["redo_ocr"] = True - else: - raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}") - - if settings.OCR_CLEAN == "clean": - ocrmypdf_args["clean"] = True - elif settings.OCR_CLEAN == "clean-final": - if settings.OCR_MODE == "redo": - ocrmypdf_args["clean"] = True - else: - # --clean-final is not compatible with --redo-ocr - ocrmypdf_args["clean_final"] = True - - if settings.OCR_DESKEW and settings.OCR_MODE != "redo": - # --deskew is not compatible with --redo-ocr - ocrmypdf_args["deskew"] = True - - if settings.OCR_ROTATE_PAGES: - ocrmypdf_args["rotate_pages"] = True - ocrmypdf_args[ - "rotate_pages_threshold" - ] = settings.OCR_ROTATE_PAGES_THRESHOLD - - if settings.OCR_PAGES > 0: - ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}" - else: - # sidecar is incompatible with pages - ocrmypdf_args["sidecar"] = sidecar_file - - if self.is_image(mime_type): - dpi = self.get_dpi(input_file) - a4_dpi = self.calculate_a4_dpi(input_file) - - if self.has_alpha(input_file): - self.log( - "info", - f"Removing alpha layer from {input_file} " - "for compatibility with img2pdf", - ) - self.remove_alpha(input_file) - - if dpi: - self.log("debug", f"Detected DPI for image {input_file}: {dpi}") - ocrmypdf_args["image_dpi"] = dpi - elif settings.OCR_IMAGE_DPI: - ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI - elif a4_dpi: - ocrmypdf_args["image_dpi"] = a4_dpi - else: - raise ParseError( - f"Cannot produce archive PDF for image {input_file}, " - f"no DPI information is present in this image and " - f"OCR_IMAGE_DPI is not set.", - ) - - if settings.OCR_USER_ARGS and not safe_fallback: - try: - user_args = json.loads(settings.OCR_USER_ARGS) - ocrmypdf_args = {**ocrmypdf_args, **user_args} - except Exception as e: - self.log( - "warning", - f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " - f"they will not be used. Error: {e}", - ) - - if settings.OCR_MAX_IMAGE_PIXELS is not None: - # Convert pixels to mega-pixels and provide to ocrmypdf - max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 - if max_pixels_mpixels > 0: - - self.log( - "debug", - f"Calculated {max_pixels_mpixels} megapixels for OCR", - ) - - ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels - else: - self.log( - "warning", - "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " - "this value must be at least 1 megapixel if set", - ) - - return ocrmypdf_args - - def parse(self, document_path: Path, mime_type, file_name=None): - # This forces tesseract to use one core per page. - os.environ["OMP_THREAD_LIMIT"] = "1" - VALID_TEXT_LENGTH = 50 - - if mime_type == "application/pdf": - text_original = self.extract_text(None, document_path) - original_has_text = ( - text_original is not None and len(text_original) > VALID_TEXT_LENGTH - ) - else: - text_original = None - original_has_text = False - - # If the original has text, and the user doesn't want an archive, - # we're done here - skip_archive_for_text = ( - settings.OCR_MODE == "skip_noarchive" - or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] - ) - if skip_archive_for_text and original_has_text: - self.log("debug", "Document has text, skipping OCRmyPDF entirely.") - self.text = text_original - return - - # Either no text was in the original or there should be an archive - # file created, so OCR the file and create an archive with any - # text located via OCR - - import ocrmypdf - from ocrmypdf import InputFileError, EncryptedPdfError - - archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) - sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) - - args = self.construct_ocrmypdf_parameters( - document_path, - mime_type, - archive_path, - sidecar_file, - ) - - try: - self.log("debug", f"Calling OCRmyPDF with args: {args}") - ocrmypdf.ocr(**args) - - if settings.OCR_SKIP_ARCHIVE_FILE != "always": - self.archive_path = archive_path - - self.text = self.extract_text(sidecar_file, archive_path) - - if not self.text: - raise NoTextFoundException("No text was found in the original document") - except EncryptedPdfError: - self.log( - "warning", - "This file is encrypted, OCR is impossible. Using " - "any text present in the original file.", - ) - if original_has_text: - self.text = text_original - except (NoTextFoundException, InputFileError) as e: - self.log( - "warning", - f"Encountered an error while running OCR: {str(e)}. " - f"Attempting force OCR to get the text.", - ) - - archive_path_fallback = Path( - os.path.join(self.tempdir, "archive-fallback.pdf"), - ) - sidecar_file_fallback = Path( - os.path.join(self.tempdir, "sidecar-fallback.txt"), - ) - - # Attempt to run OCR with safe settings. - - args = self.construct_ocrmypdf_parameters( - document_path, - mime_type, - archive_path_fallback, - sidecar_file_fallback, - safe_fallback=True, - ) - - try: - self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}") - ocrmypdf.ocr(**args) - - # Don't return the archived file here, since this file - # is bigger and blurry due to --force-ocr. - - self.text = self.extract_text( - sidecar_file_fallback, - archive_path_fallback, - ) - - except Exception as e: - # If this fails, we have a serious issue at hand. - raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e - - except Exception as e: - # Anything else is probably serious. - raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e - - # As a last resort, if we still don't have any text for any reason, - # try to extract the text from the original document. - if not self.text: - if original_has_text: - self.text = text_original - else: - self.log( - "warning", - f"No text was found in {document_path}, the content will " - f"be empty.", - ) - self.text = "" - - -def post_process_text(text): - if not text: - return None - - collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) - no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces) - no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace) - - # TODO: this needs a rework - # replace \0 prevents issues with saving to postgres. - # text may contain \0 when this character is present in PDF files. - return no_trailing_whitespace.strip().replace("\0", " ") +import json +import os +import re +import subprocess +import tempfile +from pathlib import Path +from typing import Optional + +from django.conf import settings +from documents.parsers import DocumentParser +from documents.parsers import make_thumbnail_from_pdf +from documents.parsers import ParseError +from PIL import Image + + +class NoTextFoundException(Exception): + pass + + +class RtlLanguageException(Exception): + pass + + +class RasterisedDocumentParser(DocumentParser): + """ + This parser uses Tesseract to try and get some text out of a rasterised + image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) + """ + + logging_name = "paperless.parsing.tesseract" + + def extract_metadata(self, document_path, mime_type): + + result = [] + if mime_type == "application/pdf": + import pikepdf + + namespace_pattern = re.compile(r"\{(.*)\}(.*)") + + pdf = pikepdf.open(document_path) + meta = pdf.open_metadata() + for key, value in meta.items(): + if isinstance(value, list): + value = " ".join([str(e) for e in value]) + value = str(value) + try: + m = namespace_pattern.match(key) + result.append( + { + "namespace": m.group(1), + "prefix": meta.REVERSE_NS[m.group(1)], + "key": m.group(2), + "value": value, + }, + ) + except Exception as e: + self.log( + "warning", + f"Error while reading metadata {key}: {value}. Error: {e}", + ) + return result + + def get_thumbnail(self, document_path, mime_type, file_name=None): + return make_thumbnail_from_pdf( + self.archive_path or document_path, + self.tempdir, + self.logging_group, + ) + + def is_image(self, mime_type): + return mime_type in [ + "image/png", + "image/jpeg", + "image/tiff", + "image/bmp", + "image/gif", + "image/webp", + ] + + def has_alpha(self, image): + with Image.open(image) as im: + return im.mode in ("RGBA", "LA") + + def remove_alpha(self, image_path: str): + subprocess.run( + [ + settings.CONVERT_BINARY, + "-alpha", + "off", + image_path, + image_path, + ], + ) + + def get_dpi(self, image): + try: + with Image.open(image) as im: + x, y = im.info["dpi"] + return round(x) + except Exception as e: + self.log("warning", f"Error while getting DPI from image {image}: {e}") + return None + + def calculate_a4_dpi(self, image): + try: + with Image.open(image) as im: + width, height = im.size + # divide image width by A4 width (210mm) in inches. + dpi = int(width / (21 / 2.54)) + self.log("debug", f"Estimated DPI {dpi} based on image width {width}") + return dpi + + except Exception as e: + self.log("warning", f"Error while calculating DPI for image {image}: {e}") + return None + + def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path, custom_options=None): + # When re-doing OCR, the sidecar contains ONLY the new text, not + # the whole text, so do not utilize it in that case + if ( + sidecar_file is not None + and os.path.isfile(sidecar_file) + and settings.OCR_MODE != "redo" + ): + with open(sidecar_file) as f: + text = f.read() + + if "[OCR skipped on page" not in text: + # This happens when there's already text in the input file. + # The sidecar file will only contain text for OCR'ed pages. + self.log("debug", "Using text from sidecar file") + return post_process_text(text) + elif custom_options is not None and 'is_large_file' in custom_options: + self.log("debug", "File is large so some pages may have been skipped intentionally. Using text from incomplete sidecar file") + return post_process_text(text) + else: + self.log("debug", "Incomplete sidecar file: discarding.") + + # no success with the sidecar file, try PDF + + if not os.path.isfile(pdf_file): + return None + + try: + text = None + with tempfile.NamedTemporaryFile( + mode="w+", + dir=self.tempdir, + ) as tmp: + subprocess.run( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + pdf_file, + tmp.name, + ], + ) + text = tmp.read() + + return post_process_text(text) + + except Exception: + # If pdftotext fails, fall back to OCR. + self.log( + "warning", + "Error while getting text from PDF document with pdftotext", + exc_info=True, + ) + # probably not a PDF file. + return None + + def construct_ocrmypdf_parameters( + self, + input_file, + mime_type, + output_file, + sidecar_file, + safe_fallback=False, + # used for large files, to only do OCR on specific pages + is_large_file=False, + specific_pages=None + ): + ocrmypdf_args = { + "input_file": input_file, + "output_file": output_file, + # need to use threads, since this will be run in daemonized + # processes via the task library. + "use_threads": True, + "jobs": settings.THREADS_PER_WORKER, + "language": settings.OCR_LANGUAGE, + "output_type": settings.OCR_OUTPUT_TYPE, + "progress_bar": False, + } + + if settings.OCR_MODE == "force" or safe_fallback: + ocrmypdf_args["force_ocr"] = True + elif settings.OCR_MODE in ["skip", "skip_noarchive"]: + ocrmypdf_args["skip_text"] = True + elif settings.OCR_MODE == "redo": + ocrmypdf_args["redo_ocr"] = True + else: + raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}") + + if settings.OCR_CLEAN == "clean": + ocrmypdf_args["clean"] = True + elif settings.OCR_CLEAN == "clean-final": + if settings.OCR_MODE == "redo": + ocrmypdf_args["clean"] = True + else: + # --clean-final is not compatible with --redo-ocr + ocrmypdf_args["clean_final"] = True + + if settings.OCR_DESKEW and settings.OCR_MODE != "redo": + # --deskew is not compatible with --redo-ocr + ocrmypdf_args["deskew"] = True + + if settings.OCR_ROTATE_PAGES: + ocrmypdf_args["rotate_pages"] = True + ocrmypdf_args[ + "rotate_pages_threshold" + ] = settings.OCR_ROTATE_PAGES_THRESHOLD + + if settings.OCR_PAGES > 0: + ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}" + else: + # sidecar is incompatible with pages + ocrmypdf_args["sidecar"] = sidecar_file + + if is_large_file and specific_pages is not None: + specific_pages = specific_pages.strip(',') + ocrmypdf_args["pages"] = specific_pages + elif is_large_file: + self.log("debug", "Large file but did not specify pages, so disabling OCR") + ocrmypdf_args["tesseract-timeout"] = 0 + + # Regardless of other options, disable postprocessing if large file + # Source: https://ocrmypdf.readthedocs.io/en/latest/performance.html?highlight=Postprocessing#speed + if is_large_file: + self.log("debug", "Since large file, disabling postprocessing") + ocrmypdf_args["optimize"] = 0 + ocrmypdf_args["output-type"] = 'pdf' + ocrmypdf_args["fast-web-view"] = 0 + ocrmypdf_args["skip-big"] = 200 + ocrmypdf_args["deskew"] = False + ocrmypdf_args["rotate_pages"] = False + ocrmypdf_args["clean"] = False + + if self.is_image(mime_type): + dpi = self.get_dpi(input_file) + a4_dpi = self.calculate_a4_dpi(input_file) + + if self.has_alpha(input_file): + self.log( + "info", + f"Removing alpha layer from {input_file} " + "for compatibility with img2pdf", + ) + self.remove_alpha(input_file) + + if dpi: + self.log("debug", f"Detected DPI for image {input_file}: {dpi}") + ocrmypdf_args["image_dpi"] = dpi + elif settings.OCR_IMAGE_DPI: + ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI + elif a4_dpi: + ocrmypdf_args["image_dpi"] = a4_dpi + else: + raise ParseError( + f"Cannot produce archive PDF for image {input_file}, " + f"no DPI information is present in this image and " + f"OCR_IMAGE_DPI is not set.", + ) + + if settings.OCR_USER_ARGS and not safe_fallback: + try: + user_args = json.loads(settings.OCR_USER_ARGS) + ocrmypdf_args = {**ocrmypdf_args, **user_args} + except Exception as e: + self.log( + "warning", + f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " + f"they will not be used. Error: {e}", + ) + + if settings.OCR_MAX_IMAGE_PIXELS is not None: + # Convert pixels to mega-pixels and provide to ocrmypdf + max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 + if max_pixels_mpixels > 0: + + self.log( + "debug", + f"Calculated {max_pixels_mpixels} megapixels for OCR", + ) + + ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels + else: + self.log( + "warning", + "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " + "this value must be at least 1 megapixel if set", + ) + + return ocrmypdf_args + + def parse(self, document_path: Path, mime_type, file_name=None, custom_options=None): + # This forces tesseract to use one core per page. + os.environ["OMP_THREAD_LIMIT"] = "1" + VALID_TEXT_LENGTH = 50 + + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = ( + text_original is not None and len(text_original) > VALID_TEXT_LENGTH + ) + else: + text_original = None + original_has_text = False + + # If the original has text, and the user doesn't want an archive, + # we're done here + skip_archive_for_text = ( + settings.OCR_MODE == "skip_noarchive" + or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] + ) + if skip_archive_for_text and original_has_text: + self.log("debug", "Document has text, skipping OCRmyPDF entirely.") + self.text = text_original + return + + # Either no text was in the original or there should be an archive + # file created, so OCR the file and create an archive with any + # text located via OCR + + import ocrmypdf + from ocrmypdf import InputFileError, EncryptedPdfError + + archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) + sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) + + specific_pages = None + is_large_file = False + if custom_options is not None: + if 'ocr_specific_pages' in custom_options: + specific_pages = custom_options['ocr_specific_pages'] + if 'is_large_file' in custom_options: + is_large_file = custom_options['is_large_file'] + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path, + sidecar_file, + is_large_file=is_large_file, + specific_pages=specific_pages + ) + + try: + self.log("debug", f"Calling OCRmyPDF with args: {args}") + ocrmypdf.ocr(**args) + + if settings.OCR_SKIP_ARCHIVE_FILE != "always": + self.archive_path = archive_path + + self.text = self.extract_text(sidecar_file, archive_path, custom_options=custom_options) + + if not self.text: + raise NoTextFoundException("No text was found in the original document") + except EncryptedPdfError: + self.log( + "warning", + "This file is encrypted, OCR is impossible. Using " + "any text present in the original file.", + ) + if original_has_text: + self.text = text_original + except (NoTextFoundException, InputFileError) as e: + self.log( + "warning", + f"Encountered an error while running OCR: {str(e)}. " + f"Attempting force OCR to get the text.", + ) + + archive_path_fallback = Path( + os.path.join(self.tempdir, "archive-fallback.pdf"), + ) + sidecar_file_fallback = Path( + os.path.join(self.tempdir, "sidecar-fallback.txt"), + ) + + # Attempt to run OCR with safe settings. + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path_fallback, + sidecar_file_fallback, + safe_fallback=True, + is_large_file=is_large_file, + specific_pages=specific_pages + ) + + try: + self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}") + ocrmypdf.ocr(**args) + + # Don't return the archived file here, since this file + # is bigger and blurry due to --force-ocr. + + self.text = self.extract_text( + sidecar_file_fallback, + archive_path_fallback, + ) + + except Exception as e: + # If this fails, we have a serious issue at hand. + raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e + + except Exception as e: + # Anything else is probably serious. + raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e + + # As a last resort, if we still don't have any text for any reason, + # try to extract the text from the original document. + if not self.text: + if original_has_text: + self.text = text_original + else: + self.log( + "warning", + f"No text was found in {document_path}, the content will " + f"be empty.", + ) + self.text = "" + + +def post_process_text(text): + if not text: + return None + + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) + no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces) + no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace) + + # TODO: this needs a rework + # replace \0 prevents issues with saving to postgres. + # text may contain \0 when this character is present in PDF files. + return no_trailing_whitespace.strip().replace("\0", " ")