Add basic large file upload feature

This commit is contained in:
Martin Tan 2023-09-03 18:15:03 +08:00
parent a29453eaa3
commit dec3b58270
8 changed files with 493 additions and 419 deletions

View File

@@ -28,6 +28,10 @@
</ng-template>
</ngx-file-drop>
<div class="w-full mb-2" *ngFor="let file of files">
<span>{{ file.fileEntry.name }}</span>
</div>
<div class="w-full mb-2">
<app-input-text
i18n-title

View File

@@ -26,7 +26,7 @@ export class UploadLargeFileComponent
private fileLeaveTimeoutID: any
fileIsOver: boolean = false
hideFileDrop: boolean = true
private files: NgxFileDropEntry[];
files: NgxFileDropEntry[];
constructor(
private route: ActivatedRoute,
@@ -57,8 +57,11 @@ export class UploadLargeFileComponent
let storagePathId = parseInt(this.route.snapshot.queryParams['spid'])
storagePathId = !isNaN(storagePathId) ? storagePathId : undefined
this.toastService.showInfo($localize`Initiating large file upload...`, 3000)
this.uploadDocumentsService.uploadFiles(this.files, { storagePathId })
this.uploadDocumentsService.uploadFiles(this.files, {
storagePathId,
isLargeFile: true,
ocrSpecificPages: this.objectForm.get('ocr_pages').value
})
}
getForm(): FormGroup<any> {

View File

@@ -296,6 +296,8 @@ class Consumer(LoggingMixin):
override_owner_id=None,
override_storage_path_id=None,
full_path=None,
is_large_file=None,
ocr_specific_pages=None
) -> Document:
"""
Return the document object if it was successfully created.
@@ -390,7 +392,11 @@ class Consumer(LoggingMixin):
try:
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self.log("debug", f"Parsing {self.filename}...")
document_parser.parse(self.path, mime_type, self.filename)
custom_options = {
'is_large_file': is_large_file,
'ocr_specific_pages': ocr_specific_pages
}
document_parser.parse(self.path, mime_type, self.filename, custom_options)
self.log("debug", f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)

View File

@@ -26,6 +26,8 @@ class DocumentMetadataOverrides:
owner_id: Optional[int] = None
storage_path_id: Optional[int] = None
full_path: Optional[str] = None
is_large_file: Optional[bool] = None
ocr_specific_pages: Optional[str] = None
class DocumentSource(enum.IntEnum):

View File

@@ -727,6 +727,8 @@ class PostDocumentSerializer(serializers.Serializer):
max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX,
)
# Custom fields
storage_path_id = serializers.IntegerField(
label="Storage path ID",
allow_null=True,
@@ -735,7 +737,21 @@
)
full_path = serializers.CharField(
label="Full Path",
label="Full path",
allow_null=True,
write_only=True,
required=False,
)
is_large_file = serializers.CharField(
label="Is large file",
allow_null=True,
write_only=True,
required=False,
)
ocr_specific_pages = serializers.CharField(
label="OCR specific pages",
allow_null=True,
write_only=True,
required=False,

View File

@@ -201,7 +201,9 @@ def consume_file(
override_asn=overrides.asn,
override_owner_id=overrides.owner_id,
override_storage_path_id=overrides.storage_path_id,
full_path=overrides.full_path
full_path=overrides.full_path,
is_large_file=overrides.is_large_file,
ocr_specific_pages=overrides.ocr_specific_pages
)
if document:

View File

@@ -719,6 +719,8 @@ class PostDocumentView(GenericAPIView):
archive_serial_number = serializer.validated_data.get("archive_serial_number")
storage_path_id = serializer.validated_data.get("storage_path_id")
full_path = serializer.validated_data.get("full_path")
is_large_file = serializer.validated_data.get("is_large_file")
ocr_specific_pages = serializer.validated_data.get("ocr_specific_pages")
logger.debug(f"storage_path_id: {storage_path_id}")
@@ -750,6 +752,8 @@
# owner_id=request.user.id,
storage_path_id=storage_path_id,
full_path=full_path,
is_large_file=is_large_file,
ocr_specific_pages=ocr_specific_pages
)
async_task = consume_file.delay(

View File

@@ -114,7 +114,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log("warning", f"Error while calculating DPI for image {image}: {e}")
return None
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path, custom_options=None):
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
@@ -130,6 +130,9 @@ class RasterisedDocumentParser(DocumentParser):
# The sidecar file will only contain text for OCR'ed pages.
self.log("debug", "Using text from sidecar file")
return post_process_text(text)
elif custom_options is not None and 'is_large_file' in custom_options:
self.log("debug", "File is large so some pages may have been skipped intentionally. Using text from incomplete sidecar file")
return post_process_text(text)
else:
self.log("debug", "Incomplete sidecar file: discarding.")
@@ -176,6 +179,9 @@ class RasterisedDocumentParser(DocumentParser):
output_file,
sidecar_file,
safe_fallback=False,
# used for large files, to only do OCR on specific pages
is_large_file=False,
specific_pages=None
):
ocrmypdf_args = {
"input_file": input_file,
@@ -223,6 +229,25 @@
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
if is_large_file and specific_pages is not None:
specific_pages = specific_pages.strip(',')
ocrmypdf_args["pages"] = specific_pages
elif is_large_file:
self.log("debug", "Large file but did not specify pages, so disabling OCR")
ocrmypdf_args["tesseract-timeout"] = 0
# Regardless of other options, disable postprocessing if large file
# Source: https://ocrmypdf.readthedocs.io/en/latest/performance.html?highlight=Postprocessing#speed
if is_large_file:
self.log("debug", "Since large file, disabling postprocessing")
ocrmypdf_args["optimize"] = 0
ocrmypdf_args["output-type"] = 'pdf'
ocrmypdf_args["fast-web-view"] = 0
ocrmypdf_args["skip-big"] = 200
ocrmypdf_args["deskew"] = False
ocrmypdf_args["rotate_pages"] = False
ocrmypdf_args["clean"] = False
if self.is_image(mime_type):
dpi = self.get_dpi(input_file)
a4_dpi = self.calculate_a4_dpi(input_file)
@@ -280,7 +305,7 @@
return ocrmypdf_args
def parse(self, document_path: Path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type, file_name=None, custom_options=None):
# This forces tesseract to use one core per page.
os.environ["OMP_THREAD_LIMIT"] = "1"
VALID_TEXT_LENGTH = 50
@@ -315,11 +340,21 @@
archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
specific_pages = None
is_large_file = False
if custom_options is not None:
if 'ocr_specific_pages' in custom_options:
specific_pages = custom_options['ocr_specific_pages']
if 'is_large_file' in custom_options:
is_large_file = custom_options['is_large_file']
args = self.construct_ocrmypdf_parameters(
document_path,
mime_type,
archive_path,
sidecar_file,
is_large_file=is_large_file,
specific_pages=specific_pages
)
try:
@@ -329,7 +364,7 @@
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
self.text = self.extract_text(sidecar_file, archive_path, custom_options=custom_options)
if not self.text:
raise NoTextFoundException("No text was found in the original document")
@@ -363,6 +398,8 @@
archive_path_fallback,
sidecar_file_fallback,
safe_fallback=True,
is_large_file=is_large_file,
specific_pages=specific_pages
)
try: