Add basic large file upload feature

Martin Tan 2023-09-03 18:15:03 +08:00
parent a29453eaa3
commit dec3b58270
8 changed files with 493 additions and 419 deletions

View File

@@ -28,6 +28,10 @@
   </ng-template>
 </ngx-file-drop>
+<div class="w-full mb-2" *ngFor="let file of files">
+  <span>{{ file.fileEntry.name }}</span>
+</div>
 <div class="w-full mb-2">
   <app-input-text
     i18n-title

View File

@@ -26,7 +26,7 @@ export class UploadLargeFileComponent
   private fileLeaveTimeoutID: any
   fileIsOver: boolean = false
   hideFileDrop: boolean = true
-  private files: NgxFileDropEntry[];
+  files: NgxFileDropEntry[];

   constructor(
     private route: ActivatedRoute,
@@ -57,8 +57,11 @@ export class UploadLargeFileComponent
     let storagePathId = parseInt(this.route.snapshot.queryParams['spid'])
     storagePathId = !isNaN(storagePathId) ? storagePathId : undefined
     this.toastService.showInfo($localize`Initiating large file upload...`, 3000)
-    this.uploadDocumentsService.uploadFiles(this.files, { storagePathId })
+    this.uploadDocumentsService.uploadFiles(this.files, {
+      storagePathId,
+      isLargeFile: true,
+      ocrSpecificPages: this.objectForm.get('ocr_pages').value
+    })
   }

   getForm(): FormGroup<any> {

View File

@@ -296,6 +296,8 @@ class Consumer(LoggingMixin):
         override_owner_id=None,
         override_storage_path_id=None,
         full_path=None,
+        is_large_file=None,
+        ocr_specific_pages=None
     ) -> Document:
         """
         Return the document object if it was successfully created.
@@ -390,7 +392,11 @@ class Consumer(LoggingMixin):
         try:
             self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
             self.log("debug", f"Parsing {self.filename}...")
-            document_parser.parse(self.path, mime_type, self.filename)
+            custom_options = {
+                'is_large_file': is_large_file,
+                'ocr_specific_pages': ocr_specific_pages
+            }
+            document_parser.parse(self.path, mime_type, self.filename, custom_options)

             self.log("debug", f"Generating thumbnail for {self.filename}...")
             self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)

View File

@@ -26,6 +26,8 @@ class DocumentMetadataOverrides:
     owner_id: Optional[int] = None
     storage_path_id: Optional[int] = None
     full_path: Optional[str] = None
+    is_large_file: Optional[bool] = None
+    ocr_specific_pages: Optional[str] = None


 class DocumentSource(enum.IntEnum):

View File

@@ -726,6 +726,8 @@ class PostDocumentSerializer(serializers.Serializer):
         min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN,
         max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX,
     )
+
+    # Custom fields
     storage_path_id = serializers.IntegerField(
         label="Storage path ID",
@@ -735,7 +737,21 @@ class PostDocumentSerializer(serializers.Serializer):
     )
     full_path = serializers.CharField(
-        label="Full Path",
+        label="Full path",
+        allow_null=True,
+        write_only=True,
+        required=False,
+    )
+    is_large_file = serializers.CharField(
+        label="Is large file",
+        allow_null=True,
+        write_only=True,
+        required=False,
+    )
+    ocr_specific_pages = serializers.CharField(
+        label="OCR specific pages",
         allow_null=True,
         write_only=True,
         required=False,

View File

@@ -201,7 +201,9 @@ def consume_file(
             override_asn=overrides.asn,
             override_owner_id=overrides.owner_id,
             override_storage_path_id=overrides.storage_path_id,
-            full_path=overrides.full_path
+            full_path=overrides.full_path,
+            is_large_file=overrides.is_large_file,
+            ocr_specific_pages=overrides.ocr_specific_pages
         )

         if document:

View File

@@ -719,6 +719,8 @@ class PostDocumentView(GenericAPIView):
         archive_serial_number = serializer.validated_data.get("archive_serial_number")
         storage_path_id = serializer.validated_data.get("storage_path_id")
         full_path = serializer.validated_data.get("full_path")
+        is_large_file = serializer.validated_data.get("is_large_file")
+        ocr_specific_pages = serializer.validated_data.get("ocr_specific_pages")

         logger.debug(f"storage_path_id: {storage_path_id}")
@@ -750,6 +752,8 @@ class PostDocumentView(GenericAPIView):
             # owner_id=request.user.id,
             storage_path_id=storage_path_id,
             full_path=full_path,
+            is_large_file=is_large_file,
+            ocr_specific_pages=ocr_specific_pages
         )

         async_task = consume_file.delay(

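For reference, a minimal client-side sketch (not part of this commit) of how the two new upload fields can be supplied. It assumes the stock paperless-ngx upload endpoint and token authentication; the URL, token, file name, and page range are placeholders.

import requests

API_URL = "http://localhost:8000/api/documents/post_document/"  # assumed stock endpoint
TOKEN = "..."  # placeholder API token

with open("big-scan.pdf", "rb") as f:
    response = requests.post(
        API_URL,
        headers={"Authorization": f"Token {TOKEN}"},
        files={"document": f},
        data={
            "is_large_file": "true",         # new field added by this commit
            "ocr_specific_pages": "1-3,10",  # forwarded to ocrmypdf's page selection
        },
    )
response.raise_for_status()
print(response.text)  # id of the queued consume_file task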
View File

@@ -1,413 +1,450 @@
 import json
 import os
 import re
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Optional
 
 from django.conf import settings
 from documents.parsers import DocumentParser
 from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
 
 
 class NoTextFoundException(Exception):
     pass
 
 
 class RtlLanguageException(Exception):
     pass
 
 
 class RasterisedDocumentParser(DocumentParser):
     """
     This parser uses Tesseract to try and get some text out of a rasterised
     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
     """
 
     logging_name = "paperless.parsing.tesseract"
 
     def extract_metadata(self, document_path, mime_type):
         result = []
         if mime_type == "application/pdf":
             import pikepdf
 
             namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 
             pdf = pikepdf.open(document_path)
             meta = pdf.open_metadata()
             for key, value in meta.items():
                 if isinstance(value, list):
                     value = " ".join([str(e) for e in value])
                 value = str(value)
                 try:
                     m = namespace_pattern.match(key)
                     result.append(
                         {
                             "namespace": m.group(1),
                             "prefix": meta.REVERSE_NS[m.group(1)],
                             "key": m.group(2),
                             "value": value,
                         },
                     )
                 except Exception as e:
                     self.log(
                         "warning",
                         f"Error while reading metadata {key}: {value}. Error: {e}",
                     )
         return result
 
     def get_thumbnail(self, document_path, mime_type, file_name=None):
         return make_thumbnail_from_pdf(
             self.archive_path or document_path,
             self.tempdir,
             self.logging_group,
         )
 
     def is_image(self, mime_type):
         return mime_type in [
             "image/png",
             "image/jpeg",
             "image/tiff",
             "image/bmp",
             "image/gif",
             "image/webp",
         ]
 
     def has_alpha(self, image):
         with Image.open(image) as im:
             return im.mode in ("RGBA", "LA")
 
     def remove_alpha(self, image_path: str):
         subprocess.run(
             [
                 settings.CONVERT_BINARY,
                 "-alpha",
                 "off",
                 image_path,
                 image_path,
             ],
         )
 
     def get_dpi(self, image):
         try:
             with Image.open(image) as im:
                 x, y = im.info["dpi"]
                 return round(x)
         except Exception as e:
             self.log("warning", f"Error while getting DPI from image {image}: {e}")
             return None
 
     def calculate_a4_dpi(self, image):
         try:
             with Image.open(image) as im:
                 width, height = im.size
                 # divide image width by A4 width (210mm) in inches.
                 dpi = int(width / (21 / 2.54))
                 self.log("debug", f"Estimated DPI {dpi} based on image width {width}")
                 return dpi
 
         except Exception as e:
             self.log("warning", f"Error while calculating DPI for image {image}: {e}")
             return None
 
-    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
+    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path, custom_options=None):
         # When re-doing OCR, the sidecar contains ONLY the new text, not
         # the whole text, so do not utilize it in that case
         if (
             sidecar_file is not None
             and os.path.isfile(sidecar_file)
             and settings.OCR_MODE != "redo"
         ):
             with open(sidecar_file) as f:
                 text = f.read()
 
             if "[OCR skipped on page" not in text:
                 # This happens when there's already text in the input file.
                 # The sidecar file will only contain text for OCR'ed pages.
                 self.log("debug", "Using text from sidecar file")
                 return post_process_text(text)
+            elif custom_options is not None and 'is_large_file' in custom_options:
+                self.log("debug", "File is large so some pages may have been skipped intentionally. Using text from incomplete sidecar file")
+                return post_process_text(text)
             else:
                 self.log("debug", "Incomplete sidecar file: discarding.")
 
         # no success with the sidecar file, try PDF
 
         if not os.path.isfile(pdf_file):
             return None
 
         try:
             text = None
             with tempfile.NamedTemporaryFile(
                 mode="w+",
                 dir=self.tempdir,
             ) as tmp:
                 subprocess.run(
                     [
                         "pdftotext",
                         "-q",
                         "-layout",
                         "-enc",
                         "UTF-8",
                         pdf_file,
                         tmp.name,
                     ],
                 )
                 text = tmp.read()
 
             return post_process_text(text)
 
         except Exception:
             # If pdftotext fails, fall back to OCR.
             self.log(
                 "warning",
                 "Error while getting text from PDF document with pdftotext",
                 exc_info=True,
             )
             # probably not a PDF file.
             return None
 
     def construct_ocrmypdf_parameters(
         self,
         input_file,
         mime_type,
         output_file,
         sidecar_file,
         safe_fallback=False,
+        # used for large files, to only do OCR on specific pages
+        is_large_file=False,
+        specific_pages=None
     ):
         ocrmypdf_args = {
             "input_file": input_file,
             "output_file": output_file,
             # need to use threads, since this will be run in daemonized
             # processes via the task library.
             "use_threads": True,
             "jobs": settings.THREADS_PER_WORKER,
             "language": settings.OCR_LANGUAGE,
             "output_type": settings.OCR_OUTPUT_TYPE,
             "progress_bar": False,
         }
 
         if settings.OCR_MODE == "force" or safe_fallback:
             ocrmypdf_args["force_ocr"] = True
         elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
             ocrmypdf_args["skip_text"] = True
         elif settings.OCR_MODE == "redo":
             ocrmypdf_args["redo_ocr"] = True
         else:
             raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
 
         if settings.OCR_CLEAN == "clean":
             ocrmypdf_args["clean"] = True
         elif settings.OCR_CLEAN == "clean-final":
             if settings.OCR_MODE == "redo":
                 ocrmypdf_args["clean"] = True
             else:
                 # --clean-final is not compatible with --redo-ocr
                 ocrmypdf_args["clean_final"] = True
 
         if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
             # --deskew is not compatible with --redo-ocr
             ocrmypdf_args["deskew"] = True
 
         if settings.OCR_ROTATE_PAGES:
             ocrmypdf_args["rotate_pages"] = True
             ocrmypdf_args[
                 "rotate_pages_threshold"
             ] = settings.OCR_ROTATE_PAGES_THRESHOLD
 
         if settings.OCR_PAGES > 0:
             ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
         else:
             # sidecar is incompatible with pages
             ocrmypdf_args["sidecar"] = sidecar_file
 
+        if is_large_file and specific_pages is not None:
+            specific_pages = specific_pages.strip(',')
+            ocrmypdf_args["pages"] = specific_pages
+        elif is_large_file:
+            self.log("debug", "Large file but did not specify pages, so disabling OCR")
+            ocrmypdf_args["tesseract-timeout"] = 0
+
+        # Regardless of other options, disable postprocessing if large file
+        # Source: https://ocrmypdf.readthedocs.io/en/latest/performance.html?highlight=Postprocessing#speed
+        if is_large_file:
+            self.log("debug", "Since large file, disabling postprocessing")
+            ocrmypdf_args["optimize"] = 0
+            ocrmypdf_args["output-type"] = 'pdf'
+            ocrmypdf_args["fast-web-view"] = 0
+            ocrmypdf_args["skip-big"] = 200
+            ocrmypdf_args["deskew"] = False
+            ocrmypdf_args["rotate_pages"] = False
+            ocrmypdf_args["clean"] = False
+
         if self.is_image(mime_type):
             dpi = self.get_dpi(input_file)
             a4_dpi = self.calculate_a4_dpi(input_file)
 
             if self.has_alpha(input_file):
                 self.log(
                     "info",
                     f"Removing alpha layer from {input_file} "
                     "for compatibility with img2pdf",
                 )
                 self.remove_alpha(input_file)
 
             if dpi:
                 self.log("debug", f"Detected DPI for image {input_file}: {dpi}")
                 ocrmypdf_args["image_dpi"] = dpi
             elif settings.OCR_IMAGE_DPI:
                 ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
             elif a4_dpi:
                 ocrmypdf_args["image_dpi"] = a4_dpi
             else:
                 raise ParseError(
                     f"Cannot produce archive PDF for image {input_file}, "
                     f"no DPI information is present in this image and "
                     f"OCR_IMAGE_DPI is not set.",
                 )
 
         if settings.OCR_USER_ARGS and not safe_fallback:
             try:
                 user_args = json.loads(settings.OCR_USER_ARGS)
                 ocrmypdf_args = {**ocrmypdf_args, **user_args}
             except Exception as e:
                 self.log(
                     "warning",
                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                     f"they will not be used. Error: {e}",
                 )
 
         if settings.OCR_MAX_IMAGE_PIXELS is not None:
             # Convert pixels to mega-pixels and provide to ocrmypdf
             max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
             if max_pixels_mpixels > 0:
 
                 self.log(
                     "debug",
                     f"Calculated {max_pixels_mpixels} megapixels for OCR",
                 )
 
                 ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
             else:
                 self.log(
                     "warning",
                     "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
                     "this value must be at least 1 megapixel if set",
                 )
 
         return ocrmypdf_args
 
-    def parse(self, document_path: Path, mime_type, file_name=None):
+    def parse(self, document_path: Path, mime_type, file_name=None, custom_options=None):
         # This forces tesseract to use one core per page.
         os.environ["OMP_THREAD_LIMIT"] = "1"
         VALID_TEXT_LENGTH = 50
 
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
             original_has_text = (
                 text_original is not None and len(text_original) > VALID_TEXT_LENGTH
             )
         else:
             text_original = None
             original_has_text = False
 
         # If the original has text, and the user doesn't want an archive,
         # we're done here
         skip_archive_for_text = (
             settings.OCR_MODE == "skip_noarchive"
             or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
         )
         if skip_archive_for_text and original_has_text:
             self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
             self.text = text_original
             return
 
         # Either no text was in the original or there should be an archive
         # file created, so OCR the file and create an archive with any
         # text located via OCR
 
         import ocrmypdf
         from ocrmypdf import InputFileError, EncryptedPdfError
 
         archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
         sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
 
+        specific_pages = None
+        is_large_file = False
+        if custom_options is not None:
+            if 'ocr_specific_pages' in custom_options:
+                specific_pages = custom_options['ocr_specific_pages']
+            if 'is_large_file' in custom_options:
+                is_large_file = custom_options['is_large_file']
+
         args = self.construct_ocrmypdf_parameters(
             document_path,
             mime_type,
             archive_path,
             sidecar_file,
+            is_large_file=is_large_file,
+            specific_pages=specific_pages
         )
 
         try:
             self.log("debug", f"Calling OCRmyPDF with args: {args}")
             ocrmypdf.ocr(**args)
 
             if settings.OCR_SKIP_ARCHIVE_FILE != "always":
                 self.archive_path = archive_path
 
-            self.text = self.extract_text(sidecar_file, archive_path)
+            self.text = self.extract_text(sidecar_file, archive_path, custom_options=custom_options)
 
             if not self.text:
                 raise NoTextFoundException("No text was found in the original document")
         except EncryptedPdfError:
             self.log(
                 "warning",
                 "This file is encrypted, OCR is impossible. Using "
                 "any text present in the original file.",
             )
             if original_has_text:
                 self.text = text_original
         except (NoTextFoundException, InputFileError) as e:
             self.log(
                 "warning",
                 f"Encountered an error while running OCR: {str(e)}. "
                 f"Attempting force OCR to get the text.",
             )
 
             archive_path_fallback = Path(
                 os.path.join(self.tempdir, "archive-fallback.pdf"),
             )
             sidecar_file_fallback = Path(
                 os.path.join(self.tempdir, "sidecar-fallback.txt"),
             )
 
             # Attempt to run OCR with safe settings.
 
             args = self.construct_ocrmypdf_parameters(
                 document_path,
                 mime_type,
                 archive_path_fallback,
                 sidecar_file_fallback,
                 safe_fallback=True,
+                is_large_file=is_large_file,
+                specific_pages=specific_pages
             )
 
             try:
                 self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}")
                 ocrmypdf.ocr(**args)
 
                 # Don't return the archived file here, since this file
                 # is bigger and blurry due to --force-ocr.
 
                 self.text = self.extract_text(
                     sidecar_file_fallback,
                     archive_path_fallback,
                 )
 
             except Exception as e:
                 # If this fails, we have a serious issue at hand.
                 raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
 
         except Exception as e:
             # Anything else is probably serious.
             raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
 
         # As a last resort, if we still don't have any text for any reason,
         # try to extract the text from the original document.
         if not self.text:
             if original_has_text:
                 self.text = text_original
             else:
                 self.log(
                     "warning",
                     f"No text was found in {document_path}, the content will "
                     f"be empty.",
                 )
                 self.text = ""
 
 
 def post_process_text(text):
     if not text:
         return None
 
     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
     no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
     no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
 
     # TODO: this needs a rework
     # replace \0 prevents issues with saving to postgres.
     # text may contain \0 when this character is present in PDF files.
     return no_trailing_whitespace.strip().replace("\0", " ")
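
For reference, the is_large_file handling above boils down to: OCR only the requested pages, skip Tesseract entirely when no pages are given, and turn off post-processing. A minimal stand-alone sketch of that behaviour against ocrmypdf's Python API, which takes the underscore form of its CLI flags; the language value and file paths are placeholders, not taken from this commit.

import ocrmypdf

def ocr_large_file(input_pdf, output_pdf, specific_pages=None):
    # Mirrors the intent of the is_large_file branch: OCR only the selected
    # pages, skip Tesseract when no pages are given, keep post-processing off.
    kwargs = {
        "language": "eng",     # placeholder; paperless passes settings.OCR_LANGUAGE
        "output_type": "pdf",  # plain PDF output, no PDF/A conversion
        "optimize": 0,         # disable output optimisation
        "progress_bar": False,
    }
    if specific_pages:
        kwargs["pages"] = specific_pages.strip(",")  # e.g. "1-3,10"
    else:
        kwargs["tesseract_timeout"] = 0              # zero timeout: no OCR is attempted
    ocrmypdf.ocr(input_pdf, output_pdf, **kwargs)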