Add basic large file upload feature

Martin Tan 2023-09-03 18:15:03 +08:00
parent a29453eaa3
commit dec3b58270
8 changed files with 493 additions and 419 deletions

View File

@@ -28,6 +28,10 @@
   </ng-template>
 </ngx-file-drop>
+<div class="w-full mb-2" *ngFor="let file of files">
+  <span>{{ file.fileEntry.name }}</span>
+</div>
 <div class="w-full mb-2">
   <app-input-text
     i18n-title

View File

@@ -26,7 +26,7 @@ export class UploadLargeFileComponent
   private fileLeaveTimeoutID: any
   fileIsOver: boolean = false
   hideFileDrop: boolean = true
-  private files: NgxFileDropEntry[];
+  files: NgxFileDropEntry[];

   constructor(
     private route: ActivatedRoute,
@@ -57,8 +57,11 @@ export class UploadLargeFileComponent
     let storagePathId = parseInt(this.route.snapshot.queryParams['spid'])
     storagePathId = !isNaN(storagePathId) ? storagePathId : undefined
     this.toastService.showInfo($localize`Initiating large file upload...`, 3000)
-    this.uploadDocumentsService.uploadFiles(this.files, { storagePathId })
+    this.uploadDocumentsService.uploadFiles(this.files, {
+      storagePathId,
+      isLargeFile: true,
+      ocrSpecificPages: this.objectForm.get('ocr_pages').value
+    })
   }

   getForm(): FormGroup<any> {

View File

@@ -296,6 +296,8 @@ class Consumer(LoggingMixin):
         override_owner_id=None,
         override_storage_path_id=None,
         full_path=None,
+        is_large_file=None,
+        ocr_specific_pages=None
     ) -> Document:
         """
         Return the document object if it was successfully created.
@@ -390,7 +392,11 @@ class Consumer(LoggingMixin):
         try:
             self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
             self.log("debug", f"Parsing {self.filename}...")
-            document_parser.parse(self.path, mime_type, self.filename)
+            custom_options = {
+                'is_large_file': is_large_file,
+                'ocr_specific_pages': ocr_specific_pages
+            }
+            document_parser.parse(self.path, mime_type, self.filename, custom_options)

             self.log("debug", f"Generating thumbnail for {self.filename}...")
             self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)

View File

@@ -26,6 +26,8 @@ class DocumentMetadataOverrides:
     owner_id: Optional[int] = None
     storage_path_id: Optional[int] = None
     full_path: Optional[str] = None
+    is_large_file: Optional[bool] = None
+    ocr_specific_pages: Optional[str] = None


 class DocumentSource(enum.IntEnum):

View File

@@ -726,6 +726,8 @@ class PostDocumentSerializer(serializers.Serializer):
         min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN,
         max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX,
     )
+
+    # Custom fields
     storage_path_id = serializers.IntegerField(
         label="Storage path ID",
@@ -735,7 +737,21 @@ class PostDocumentSerializer(serializers.Serializer):
     )
     full_path = serializers.CharField(
-        label="Full Path",
+        label="Full path",
+        allow_null=True,
+        write_only=True,
+        required=False,
+    )
+    is_large_file = serializers.CharField(
+        label="Is large file",
+        allow_null=True,
+        write_only=True,
+        required=False,
+    )
+    ocr_specific_pages = serializers.CharField(
+        label="OCR specific pages",
         allow_null=True,
         write_only=True,
         required=False,

View File

@@ -201,7 +201,9 @@ def consume_file(
             override_asn=overrides.asn,
             override_owner_id=overrides.owner_id,
             override_storage_path_id=overrides.storage_path_id,
-            full_path=overrides.full_path
+            full_path=overrides.full_path,
+            is_large_file=overrides.is_large_file,
+            ocr_specific_pages=overrides.ocr_specific_pages
         )

         if document:

View File

@@ -719,6 +719,8 @@ class PostDocumentView(GenericAPIView):
         archive_serial_number = serializer.validated_data.get("archive_serial_number")
         storage_path_id = serializer.validated_data.get("storage_path_id")
         full_path = serializer.validated_data.get("full_path")
+        is_large_file = serializer.validated_data.get("is_large_file")
+        ocr_specific_pages = serializer.validated_data.get("ocr_specific_pages")

         logger.debug(f"storage_path_id: {storage_path_id}")
@@ -750,6 +752,8 @@ class PostDocumentView(GenericAPIView):
             # owner_id=request.user.id,
             storage_path_id=storage_path_id,
             full_path=full_path,
+            is_large_file=is_large_file,
+            ocr_specific_pages=ocr_specific_pages
         )

         async_task = consume_file.delay(

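For reference, a minimal client-side sketch (not part of this commit) of how the two new upload fields can be supplied. It assumes the stock paperless-ngx upload endpoint and token authentication; the URL, token, file name, and page range are placeholders.

import requests

API_URL = "http://localhost:8000/api/documents/post_document/"  # assumed stock endpoint
TOKEN = "..."  # placeholder API token

with open("big-scan.pdf", "rb") as f:
    response = requests.post(
        API_URL,
        headers={"Authorization": f"Token {TOKEN}"},
        files={"document": f},
        data={
            "is_large_file": "true",         # new field added by this commit
            "ocr_specific_pages": "1-3,10",  # forwarded to ocrmypdf's page selection
        },
    )
response.raise_for_status()
print(response.text)  # id of the queued consume_file task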
View File

@@ -1,413 +1,450 @@
 import json
 import os
 import re
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Optional
 
 from django.conf import settings
 from documents.parsers import DocumentParser
 from documents.parsers import make_thumbnail_from_pdf
 from documents.parsers import ParseError
 from PIL import Image
 
 
 class NoTextFoundException(Exception):
     pass
 
 
 class RtlLanguageException(Exception):
     pass
 
 
 class RasterisedDocumentParser(DocumentParser):
     """
     This parser uses Tesseract to try and get some text out of a rasterised
     image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
     """
 
     logging_name = "paperless.parsing.tesseract"
 
     def extract_metadata(self, document_path, mime_type):
         result = []
         if mime_type == "application/pdf":
             import pikepdf
 
             namespace_pattern = re.compile(r"\{(.*)\}(.*)")
 
             pdf = pikepdf.open(document_path)
             meta = pdf.open_metadata()
             for key, value in meta.items():
                 if isinstance(value, list):
                     value = " ".join([str(e) for e in value])
                 value = str(value)
                 try:
                     m = namespace_pattern.match(key)
                     result.append(
                         {
                             "namespace": m.group(1),
                             "prefix": meta.REVERSE_NS[m.group(1)],
                             "key": m.group(2),
                             "value": value,
                         },
                     )
                 except Exception as e:
                     self.log(
                         "warning",
                         f"Error while reading metadata {key}: {value}. Error: {e}",
                     )
         return result
 
     def get_thumbnail(self, document_path, mime_type, file_name=None):
         return make_thumbnail_from_pdf(
             self.archive_path or document_path,
             self.tempdir,
             self.logging_group,
         )
 
     def is_image(self, mime_type):
         return mime_type in [
             "image/png",
             "image/jpeg",
             "image/tiff",
             "image/bmp",
             "image/gif",
             "image/webp",
         ]
 
     def has_alpha(self, image):
         with Image.open(image) as im:
             return im.mode in ("RGBA", "LA")
 
     def remove_alpha(self, image_path: str):
         subprocess.run(
             [
                 settings.CONVERT_BINARY,
                 "-alpha",
                 "off",
                 image_path,
                 image_path,
             ],
         )
 
     def get_dpi(self, image):
         try:
             with Image.open(image) as im:
                 x, y = im.info["dpi"]
                 return round(x)
         except Exception as e:
             self.log("warning", f"Error while getting DPI from image {image}: {e}")
             return None
 
     def calculate_a4_dpi(self, image):
         try:
             with Image.open(image) as im:
                 width, height = im.size
                 # divide image width by A4 width (210mm) in inches.
                 dpi = int(width / (21 / 2.54))
                 self.log("debug", f"Estimated DPI {dpi} based on image width {width}")
                 return dpi
 
         except Exception as e:
             self.log("warning", f"Error while calculating DPI for image {image}: {e}")
             return None
 
-    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
+    def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path, custom_options=None):
         # When re-doing OCR, the sidecar contains ONLY the new text, not
         # the whole text, so do not utilize it in that case
         if (
             sidecar_file is not None
             and os.path.isfile(sidecar_file)
             and settings.OCR_MODE != "redo"
         ):
             with open(sidecar_file) as f:
                 text = f.read()
 
             if "[OCR skipped on page" not in text:
                 # This happens when there's already text in the input file.
                 # The sidecar file will only contain text for OCR'ed pages.
                 self.log("debug", "Using text from sidecar file")
                 return post_process_text(text)
+            elif custom_options is not None and 'is_large_file' in custom_options:
+                self.log("debug", "File is large so some pages may have been skipped intentionally. Using text from incomplete sidecar file")
+                return post_process_text(text)
             else:
                 self.log("debug", "Incomplete sidecar file: discarding.")
 
         # no success with the sidecar file, try PDF
 
         if not os.path.isfile(pdf_file):
             return None
 
         try:
             text = None
             with tempfile.NamedTemporaryFile(
                 mode="w+",
                 dir=self.tempdir,
             ) as tmp:
                 subprocess.run(
                     [
                         "pdftotext",
                         "-q",
                         "-layout",
                         "-enc",
                         "UTF-8",
                         pdf_file,
                         tmp.name,
                     ],
                 )
                 text = tmp.read()
 
             return post_process_text(text)
 
         except Exception:
             # If pdftotext fails, fall back to OCR.
             self.log(
                 "warning",
                 "Error while getting text from PDF document with pdftotext",
                 exc_info=True,
             )
             # probably not a PDF file.
             return None
 
     def construct_ocrmypdf_parameters(
         self,
         input_file,
         mime_type,
         output_file,
         sidecar_file,
         safe_fallback=False,
+        # used for large files, to only do OCR on specific pages
+        is_large_file=False,
+        specific_pages=None
     ):
         ocrmypdf_args = {
             "input_file": input_file,
             "output_file": output_file,
             # need to use threads, since this will be run in daemonized
             # processes via the task library.
             "use_threads": True,
             "jobs": settings.THREADS_PER_WORKER,
             "language": settings.OCR_LANGUAGE,
             "output_type": settings.OCR_OUTPUT_TYPE,
             "progress_bar": False,
         }
 
         if settings.OCR_MODE == "force" or safe_fallback:
             ocrmypdf_args["force_ocr"] = True
         elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
             ocrmypdf_args["skip_text"] = True
         elif settings.OCR_MODE == "redo":
             ocrmypdf_args["redo_ocr"] = True
         else:
             raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
 
         if settings.OCR_CLEAN == "clean":
             ocrmypdf_args["clean"] = True
         elif settings.OCR_CLEAN == "clean-final":
             if settings.OCR_MODE == "redo":
                 ocrmypdf_args["clean"] = True
             else:
                 # --clean-final is not compatible with --redo-ocr
                 ocrmypdf_args["clean_final"] = True
 
         if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
             # --deskew is not compatible with --redo-ocr
             ocrmypdf_args["deskew"] = True
 
         if settings.OCR_ROTATE_PAGES:
             ocrmypdf_args["rotate_pages"] = True
             ocrmypdf_args[
                 "rotate_pages_threshold"
             ] = settings.OCR_ROTATE_PAGES_THRESHOLD
 
         if settings.OCR_PAGES > 0:
             ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
         else:
             # sidecar is incompatible with pages
             ocrmypdf_args["sidecar"] = sidecar_file
 
+        if is_large_file and specific_pages is not None:
+            specific_pages = specific_pages.strip(',')
+            ocrmypdf_args["pages"] = specific_pages
+        elif is_large_file:
+            self.log("debug", "Large file but did not specify pages, so disabling OCR")
+            ocrmypdf_args["tesseract-timeout"] = 0
+
+        # Regardless of other options, disable postprocessing if large file
+        # Source: https://ocrmypdf.readthedocs.io/en/latest/performance.html?highlight=Postprocessing#speed
+        if is_large_file:
+            self.log("debug", "Since large file, disabling postprocessing")
+            ocrmypdf_args["optimize"] = 0
+            ocrmypdf_args["output-type"] = 'pdf'
+            ocrmypdf_args["fast-web-view"] = 0
+            ocrmypdf_args["skip-big"] = 200
+            ocrmypdf_args["deskew"] = False
+            ocrmypdf_args["rotate_pages"] = False
+            ocrmypdf_args["clean"] = False
+
         if self.is_image(mime_type):
             dpi = self.get_dpi(input_file)
             a4_dpi = self.calculate_a4_dpi(input_file)
 
             if self.has_alpha(input_file):
                 self.log(
                     "info",
                     f"Removing alpha layer from {input_file} "
                     "for compatibility with img2pdf",
                 )
                 self.remove_alpha(input_file)
 
             if dpi:
                 self.log("debug", f"Detected DPI for image {input_file}: {dpi}")
                 ocrmypdf_args["image_dpi"] = dpi
             elif settings.OCR_IMAGE_DPI:
                 ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
             elif a4_dpi:
                 ocrmypdf_args["image_dpi"] = a4_dpi
             else:
                 raise ParseError(
                     f"Cannot produce archive PDF for image {input_file}, "
                     f"no DPI information is present in this image and "
                     f"OCR_IMAGE_DPI is not set.",
                 )
 
         if settings.OCR_USER_ARGS and not safe_fallback:
             try:
                 user_args = json.loads(settings.OCR_USER_ARGS)
                 ocrmypdf_args = {**ocrmypdf_args, **user_args}
             except Exception as e:
                 self.log(
                     "warning",
                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                     f"they will not be used. Error: {e}",
                 )
 
         if settings.OCR_MAX_IMAGE_PIXELS is not None:
             # Convert pixels to mega-pixels and provide to ocrmypdf
             max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
             if max_pixels_mpixels > 0:
 
                 self.log(
                     "debug",
                     f"Calculated {max_pixels_mpixels} megapixels for OCR",
                 )
 
                 ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
             else:
                 self.log(
                     "warning",
                     "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
                     "this value must be at least 1 megapixel if set",
                 )
 
         return ocrmypdf_args
 
-    def parse(self, document_path: Path, mime_type, file_name=None):
+    def parse(self, document_path: Path, mime_type, file_name=None, custom_options=None):
         # This forces tesseract to use one core per page.
         os.environ["OMP_THREAD_LIMIT"] = "1"
         VALID_TEXT_LENGTH = 50
 
         if mime_type == "application/pdf":
             text_original = self.extract_text(None, document_path)
             original_has_text = (
                 text_original is not None and len(text_original) > VALID_TEXT_LENGTH
             )
         else:
             text_original = None
             original_has_text = False
 
         # If the original has text, and the user doesn't want an archive,
         # we're done here
         skip_archive_for_text = (
             settings.OCR_MODE == "skip_noarchive"
             or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
         )
         if skip_archive_for_text and original_has_text:
             self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
             self.text = text_original
             return
 
         # Either no text was in the original or there should be an archive
         # file created, so OCR the file and create an archive with any
         # text located via OCR
 
         import ocrmypdf
         from ocrmypdf import InputFileError, EncryptedPdfError
 
         archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
         sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
 
+        specific_pages = None
+        is_large_file = False
+        if custom_options is not None:
+            if 'ocr_specific_pages' in custom_options:
+                specific_pages = custom_options['ocr_specific_pages']
+            if 'is_large_file' in custom_options:
+                is_large_file = custom_options['is_large_file']
+
         args = self.construct_ocrmypdf_parameters(
             document_path,
             mime_type,
             archive_path,
             sidecar_file,
+            is_large_file=is_large_file,
+            specific_pages=specific_pages
         )
 
         try:
             self.log("debug", f"Calling OCRmyPDF with args: {args}")
             ocrmypdf.ocr(**args)
 
             if settings.OCR_SKIP_ARCHIVE_FILE != "always":
                 self.archive_path = archive_path
 
-            self.text = self.extract_text(sidecar_file, archive_path)
+            self.text = self.extract_text(sidecar_file, archive_path, custom_options=custom_options)
 
             if not self.text:
                 raise NoTextFoundException("No text was found in the original document")
         except EncryptedPdfError:
             self.log(
                 "warning",
                 "This file is encrypted, OCR is impossible. Using "
                 "any text present in the original file.",
             )
             if original_has_text:
                 self.text = text_original
         except (NoTextFoundException, InputFileError) as e:
             self.log(
                 "warning",
                 f"Encountered an error while running OCR: {str(e)}. "
                 f"Attempting force OCR to get the text.",
             )
 
             archive_path_fallback = Path(
                 os.path.join(self.tempdir, "archive-fallback.pdf"),
             )
             sidecar_file_fallback = Path(
                 os.path.join(self.tempdir, "sidecar-fallback.txt"),
             )
 
             # Attempt to run OCR with safe settings.
 
             args = self.construct_ocrmypdf_parameters(
                 document_path,
                 mime_type,
                 archive_path_fallback,
                 sidecar_file_fallback,
                 safe_fallback=True,
+                is_large_file=is_large_file,
+                specific_pages=specific_pages
             )
 
             try:
                 self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}")
                 ocrmypdf.ocr(**args)
 
                 # Don't return the archived file here, since this file
                 # is bigger and blurry due to --force-ocr.
 
                 self.text = self.extract_text(
                     sidecar_file_fallback,
                     archive_path_fallback,
                 )
 
             except Exception as e:
                 # If this fails, we have a serious issue at hand.
                 raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
 
         except Exception as e:
             # Anything else is probably serious.
             raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
 
         # As a last resort, if we still don't have any text for any reason,
         # try to extract the text from the original document.
         if not self.text:
             if original_has_text:
                 self.text = text_original
             else:
                 self.log(
                     "warning",
                     f"No text was found in {document_path}, the content will "
                     f"be empty.",
                 )
                 self.text = ""
 
 
 def post_process_text(text):
     if not text:
         return None
 
     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text)
     no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces)
     no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace)
 
     # TODO: this needs a rework
     # replace \0 prevents issues with saving to postgres.
     # text may contain \0 when this character is present in PDF files.
     return no_trailing_whitespace.strip().replace("\0", " ")
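
For reference, the is_large_file handling above boils down to: OCR only the requested pages, skip Tesseract entirely when no pages are given, and turn off post-processing. A minimal stand-alone sketch of that behaviour against ocrmypdf's Python API, which takes the underscore form of its CLI flags; the language value and file paths are placeholders, not taken from this commit.

import ocrmypdf

def ocr_large_file(input_pdf, output_pdf, specific_pages=None):
    # Mirrors the intent of the is_large_file branch: OCR only the selected
    # pages, skip Tesseract when no pages are given, keep post-processing off.
    kwargs = {
        "language": "eng",     # placeholder; paperless passes settings.OCR_LANGUAGE
        "output_type": "pdf",  # plain PDF output, no PDF/A conversion
        "optimize": 0,         # disable output optimisation
        "progress_bar": False,
    }
    if specific_pages:
        kwargs["pages"] = specific_pages.strip(",")  # e.g. "1-3,10"
    else:
        kwargs["tesseract_timeout"] = 0              # zero timeout: no OCR is attempted
    ocrmypdf.ocr(input_pdf, output_pdf, **kwargs)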