# PAPERLESS_OCR_CUSTOM
# Account and endpoints of the external OCR HTTP service used by the
# paperless_ocr_custom parser. Every value is read from the environment
# variable of the same name and falls back to the literal given here.
# NOTE(review): the "test"/"test" fallback credentials and the unprefixed
# variable names (URL_LOGIN, URL_UPLOAD_FILE, ...) look like development
# defaults -- confirm they are intended before deploying.
_ocr_custom_account = {
    "OCR_CUSTOM_USERNAME": os.getenv("OCR_CUSTOM_USERNAME", "test"),
    "OCR_CUSTOM_PASSWORD": os.getenv("OCR_CUSTOM_PASSWORD", "test"),
}
_ocr_custom_urls = {
    "URL_LOGIN": os.getenv(
        "URL_LOGIN",
        "https://ocr-core-api.tcgroup.vn/token",
    ),
    "URL_UPLOAD_FILE": os.getenv(
        "URL_UPLOAD_FILE",
        "https://ocr-core-api.tcgroup.vn/api/v1/file/upload",
    ),
    "URL_OCR_BY_FILEID": os.getenv(
        "URL_OCR_BY_FILEID",
        "https://ocr-core-api.tcgroup.vn/api/v1/ocr/general",
    ),
}
TCGROUP_OCR_CUSTOM = {
    "ACCOUNT": _ocr_custom_account,
    "URL": _ocr_custom_urls,
}
def get_tesseract_langs():
    """Return the language codes listed by ``tesseract --list-langs``.

    Returns:
        A list of stripped language names (the first output line, a header,
        is dropped). An empty list is returned when no ``tesseract``
        executable is found on ``PATH``.
    """
    tesseract_binary = shutil.which("tesseract")
    if tesseract_binary is None:
        # shutil.which() yields None for a missing binary; handing that to
        # subprocess.run() would raise TypeError inside the system check.
        return []

    proc = subprocess.run(
        [tesseract_binary, "--list-langs"],
        capture_output=True,
    )

    # Decode bytes to string, split on newlines, trim out the header
    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]

    return [x.strip() for x in proc_lines]
" + "This means that tesseract will fallback to english.", + ), + ] + + specified_langs = settings.OCR_LANGUAGE.split("+") + + for lang in specified_langs: + if lang not in installed_langs: + return [ + Error( + f"The selected ocr language {lang} is " + f"not installed. Paperless cannot OCR your documents " + f"without it. Please fix PAPERLESS_OCR_LANGUAGE.", + ), + ] + + return [] diff --git a/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf new file mode 100644 index 000000000..a1e68a366 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf new file mode 100644 index 000000000..614f0af2c Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf new file mode 100644 index 000000000..5799f4149 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF new file mode 100644 index 000000000..62572a4a8 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF new file mode 100644 index 000000000..61c51a0cb Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF new file mode 100644 index 000000000..5ebda789c Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF differ diff --git 
a/src/paperless_ocr_custom/fonts/arial-font/arial.ttf b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf new file mode 100644 index 000000000..ad7d8eab8 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf differ diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py new file mode 100644 index 000000000..ec65b8451 --- /dev/null +++ b/src/paperless_ocr_custom/parsers.py @@ -0,0 +1,685 @@ +import io +import logging +import os +import re +import shutil +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Optional + +import PyPDF2 +from django.conf import settings +import requests +from PyPDF2 import PdfFileWriter, PdfFileReader, PdfReader, PdfWriter +from reportlab.pdfgen import canvas +from reportlab.lib.pagesizes import letter +from PIL import Image,ImageDraw,ImageFont +from reportlab.pdfgen.canvas import Canvas +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase import pdfmetrics + +from documents.parsers import DocumentParser +from documents.parsers import ParseError +from documents.parsers import make_thumbnail_from_pdf +from documents.utils import maybe_override_pixel_limit +from documents.utils import run_subprocess +from paperless.config import OcrConfig +from paperless.models import ArchiveFileChoices +from paperless.models import CleanChoices +from paperless.models import ModeChoices + + +class NoTextFoundException(Exception): + pass + + +class RtlLanguageException(Exception): + pass + + +class RasterisedDocumentParser(DocumentParser): + """ + This parser uses Tesseract to try and get some text out of a rasterised + image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) 
+ """ + + logging_name = "paperless.parsing.tesseract" + + def get_settings(self) -> OcrConfig: + """ + This parser uses the OCR configuration settings to parse documents + """ + return OcrConfig() + + def extract_metadata(self, document_path, mime_type): + result = [] + if mime_type == "application/pdf": + import pikepdf + + namespace_pattern = re.compile(r"\{(.*)\}(.*)") + + pdf = pikepdf.open(document_path) + meta = pdf.open_metadata() + for key, value in meta.items(): + if isinstance(value, list): + value = " ".join([str(e) for e in value]) + value = str(value) + try: + m = namespace_pattern.match(key) + if m is None: # pragma: no cover + continue + namespace = m.group(1) + key_value = m.group(2) + try: + namespace.encode("utf-8") + key_value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping metadata key {key}: {e}") + continue + result.append( + { + "namespace": namespace, + "prefix": meta.REVERSE_NS[namespace], + "key": key_value, + "value": value, + }, + ) + except Exception as e: + self.log.warning( + f"Error while reading metadata {key}: {value}. 
def is_image(self, mime_type) -> bool:
    """Tell whether ``mime_type`` is one of the raster image types handled here."""
    image_mime_types = (
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/bmp",
        "image/gif",
        "image/webp",
    )
    for known_type in image_mime_types:
        if mime_type == known_type:
            return True
    return False
def ocr_file(self, path_file):
    """Send an image or PDF to the remote OCR service and return its result.

    Three requests are made against the endpoints configured in
    ``settings.TCGROUP_OCR_CUSTOM``: log in to obtain a bearer token, upload
    the file to obtain a ``file_id``, then request OCR for that id.

    Args:
        path_file: Path of the file to upload.

    Returns:
        The decoded JSON body of the OCR response, or ``None`` as soon as one
        of the three requests does not answer with HTTP 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    ocr_settings = settings.TCGROUP_OCR_CUSTOM
    account = ocr_settings["ACCOUNT"]
    urls = ocr_settings["URL"]
    # (connect, read) timeout in seconds. Without one, a stalled OCR server
    # would block the consumer worker indefinitely. Optional "TIMEOUT" key
    # lets deployments override the default.
    timeout = ocr_settings.get("TIMEOUT", (10, 300))

    # Step 1: log in.
    response_login = requests.post(
        urls["URL_LOGIN"],
        data={
            "username": account["OCR_CUSTOM_USERNAME"],
            "password": account["OCR_CUSTOM_PASSWORD"],
        },
        timeout=timeout,
    )
    if response_login.status_code != 200:
        self.log.error("login: HTTP %s", response_login.status_code)
        # Nothing below can succeed without a token.
        return None
    access_token = response_login.json().get("access_token", "")
    headers = {"Authorization": f"Bearer {access_token}"}

    # Step 2: upload the file.
    with open(path_file, "rb") as file:
        response_upload = requests.post(
            urls["URL_UPLOAD_FILE"],
            files={"file": (Path(path_file).name, file)},
            headers=headers,
            timeout=timeout,
        )
    if response_upload.status_code != 200:
        self.log.error("upload file: HTTP %s", response_upload.status_code)
        return None
    file_id = response_upload.json().get("file_id", "")

    # Step 3: run OCR on the uploaded file.
    response_ocr = requests.post(
        urls["URL_OCR_BY_FILEID"],
        headers=headers,
        params={"file_id": file_id},
        timeout=timeout,
    )
    if response_ocr.status_code != 200:
        self.log.error("ocr: %s", response_ocr.text)
        return None

    return response_ocr.json()
def _draw_ocr_words(self, can, blocks, page_height, font_name):
    """Draw every word of the OCR ``blocks`` onto the reportlab canvas ``can``.

    Each word is centred on the middle of its bounding box. The y coordinate
    is flipped (``page_height - y``), presumably because the OCR service
    measures from the top edge while PDF measures from the bottom -- confirm
    against the service documentation.
    """
    for block in blocks:
        for line in block.get("lines", []):
            for word in line.get("words", []):
                x1 = word["bbox"][0][0]
                y1 = word["bbox"][0][1]
                x2 = word["bbox"][1][0]
                y2 = word["bbox"][1][1]
                value = word["value"]
                # Box height scaled by 72/96 (looks like 96-dpi pixels to
                # PDF points -- TODO confirm).
                font_size = (y2 - y1) * 72 / 96
                x_center = x2 - (x2 - x1) / 2
                y_center = y2 - (y2 - y1) / 2
                text_width = can.stringWidth(value, font_name, font_size)
                can.setFont(font_name, font_size)
                can.drawString(
                    x_center - text_width / 2,
                    page_height - y_center - (font_size / 2),
                    value,
                )


def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path):
    """Create ``output_path`` as a PDF carrying the remote OCR text.

    The OCR result's ``content`` is written to ``sidecar``. For images a
    one-page PDF is built with the words drawn first and the image painted
    over them. For PDFs the words of each page are drawn on an overlay that
    is merged into the matching page.
    NOTE(review): in the PDF branch the merged text is drawn with the normal
    (visible) render mode -- confirm that is intended.

    Args:
        sidecar: Path of the text file that receives the plain OCR text.
        mime_type: MIME type of ``input_path``; selects the image or PDF branch.
        input_path: Source image or PDF.
        output_path: Destination PDF.

    Returns:
        None. Nothing is written when ``ocr_file`` returns no data.
    """
    font_name = "Arial"
    data = self.ocr_file(input_path)
    if not data:
        return

    # Explicit UTF-8 so non-ASCII OCR text does not depend on the locale;
    # "or ''" guards against a null "content" field.
    with open(sidecar, "w", encoding="utf-8") as txt_sidecar:
        txt_sidecar.write(data.get("content") or "")

    font_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "fonts",
        "arial-font",
        "arial.ttf",
    )
    # Register once per call instead of once per page.
    pdfmetrics.registerFont(TTFont(font_name, font_path))
    ocr_pages = data.get("pages") or []

    if self.is_image(mime_type):
        with Image.open(input_path) as img:
            width, height = img.size
        image_canvas = canvas.Canvas(str(output_path), pagesize=(width, height))
        for ocr_page in ocr_pages:
            self._draw_ocr_words(
                image_canvas,
                ocr_page.get("blocks", []),
                height,
                font_name,
            )
        # Painted after the text, so the image covers the drawn words.
        image_canvas.drawImage(str(input_path), 0, 0, width=width, height=height)
        image_canvas.save()
        return

    input_pdf = PdfReader(str(input_path))
    output_pdf = PdfWriter()
    for page_num, page in enumerate(input_pdf.pages):
        # The OCR result may hold fewer pages than the document.
        if page_num < len(ocr_pages):
            blocks = ocr_pages[page_num].get("blocks", [])
        else:
            blocks = []

        packet = io.BytesIO()
        overlay = canvas.Canvas(packet, pagesize=letter)
        self._draw_ocr_words(overlay, blocks, int(page.mediabox[3]), font_name)
        overlay.showPage()
        overlay.save()

        packet.seek(0)
        page.merge_page(PdfReader(packet).pages[0])
        output_pdf.add_page(page)

    with open(output_path, "wb") as out:
        output_pdf.write(out)
def ocr_img_or_pdf(self, document_path, mime_type, sidecar, output_file, **kwargs):
    """Run the custom OCR on ``document_path`` and write ``output_file``.

    Delegates to ``render_pdf_ocr``. Extra keyword arguments are accepted
    and ignored, so the OCRmyPDF argument dict can be splatted in.
    NOTE(review): ``construct_ocrmypdf_parameters`` omits the ``sidecar``
    key when a page limit is configured; a call using ``**args`` would then
    fail with TypeError here -- verify against the caller.
    """
    self.log.info("mime_type: %s", mime_type)
    self.render_pdf_ocr(sidecar, mime_type, document_path, output_file)


def extract_text(
    self,
    sidecar_file: Optional[Path],
    pdf_file: Path,
) -> Optional[str]:
    """Return the post-processed text of a document.

    The sidecar file is preferred; when it is missing, incomplete, or the
    OCR mode is "redo", ``pdftotext`` is run on ``pdf_file`` instead.

    Returns:
        The cleaned text, or ``None`` when ``pdf_file`` does not exist, the
        text is empty, or ``pdftotext`` fails.
    """
    # When re-doing OCR, the sidecar contains ONLY the new text, not
    # the whole text, so do not utilize it in that case
    self.log.debug("Extracting text, sidecar file: %s", sidecar_file)
    if (
        sidecar_file is not None
        and os.path.isfile(sidecar_file)
        and self.settings.mode != "redo"
    ):
        text = self.read_file_handle_unicode_errors(sidecar_file)

        if "[OCR skipped on page" not in text:
            # This happens when there's already text in the input file.
            # The sidecar file will only contain text for OCR'ed pages.
            self.log.debug("Using text from sidecar file")
            return post_process_text(text)
        else:
            self.log.debug("Incomplete sidecar file: discarding.")

    # no success with the sidecar file, try PDF

    if not os.path.isfile(pdf_file):
        return None

    try:
        text = None
        with tempfile.NamedTemporaryFile(
            mode="w+",
            dir=self.tempdir,
        ) as tmp:
            run_subprocess(
                [
                    "pdftotext",
                    "-q",
                    "-layout",
                    "-enc",
                    "UTF-8",
                    pdf_file,
                    tmp.name,
                ],
                logger=self.log,
            )
            text = self.read_file_handle_unicode_errors(Path(tmp.name))

        return post_process_text(text)

    except Exception:
        # If pdftotext fails, fall back to OCR.
        self.log.warning(
            "Error while getting text from PDF document with pdftotext",
            exc_info=True,
        )
        # probably not a PDF file.
        return None
+ "use_threads": True, + "jobs": settings.THREADS_PER_WORKER, + "language": self.settings.language, + "output_type": self.settings.output_type, + "progress_bar": False, + } + + if "pdfa" in ocrmypdf_args["output_type"]: + ocrmypdf_args["color_conversion_strategy"] = ( + self.settings.color_conversion_strategy + ) + + if self.settings.mode == ModeChoices.FORCE or safe_fallback: + ocrmypdf_args["force_ocr"] = True + elif self.settings.mode in { + ModeChoices.SKIP, + ModeChoices.SKIP_NO_ARCHIVE, + }: + ocrmypdf_args["skip_text"] = True + elif self.settings.mode == ModeChoices.REDO: + ocrmypdf_args["redo_ocr"] = True + else: # pragma: no cover + raise ParseError(f"Invalid ocr mode: {self.settings.mode}") + + if self.settings.clean == CleanChoices.CLEAN: + ocrmypdf_args["clean"] = True + elif self.settings.clean == CleanChoices.FINAL: + if self.settings.mode == ModeChoices.REDO: + ocrmypdf_args["clean"] = True + else: + # --clean-final is not compatible with --redo-ocr + ocrmypdf_args["clean_final"] = True + + if self.settings.deskew and self.settings.mode != ModeChoices.REDO: + # --deskew is not compatible with --redo-ocr + ocrmypdf_args["deskew"] = True + + if self.settings.rotate: + ocrmypdf_args["rotate_pages"] = True + ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold + + if self.settings.pages is not None and self.settings.pages > 0: + ocrmypdf_args["pages"] = f"1-{self.settings.pages}" + else: + # sidecar is incompatible with pages + ocrmypdf_args["sidecar"] = sidecar_file + + if self.is_image(mime_type): + # This may be required, depending on the known imformation + maybe_override_pixel_limit() + + dpi = self.get_dpi(input_file) + a4_dpi = self.calculate_a4_dpi(input_file) + + if self.has_alpha(input_file): + self.log.info( + f"Removing alpha layer from {input_file} " + "for compatibility with img2pdf", + ) + # Replace the input file with the non-alpha + ocrmypdf_args["input_file"] = self.remove_alpha(input_file) + + if dpi: + 
self.log.debug(f"Detected DPI for image {input_file}: {dpi}") + ocrmypdf_args["image_dpi"] = dpi + elif self.settings.image_dpi is not None: + ocrmypdf_args["image_dpi"] = self.settings.image_dpi + elif a4_dpi: + ocrmypdf_args["image_dpi"] = a4_dpi + else: + raise ParseError( + f"Cannot produce archive PDF for image {input_file}, " + f"no DPI information is present in this image and " + f"OCR_IMAGE_DPI is not set.", + ) + if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover + self.log.warning( + f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail", + ) + + if self.settings.user_args is not None: + try: + ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args} + except Exception as e: + self.log.warning( + f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " + f"they will not be used. Error: {e}", + ) + + if ( + self.settings.max_image_pixel is not None + and self.settings.max_image_pixel >= 0 + ): + # Convert pixels to mega-pixels and provide to ocrmypdf + max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0 + msg = ( + "OCR pixel limit is disabled!" + if max_pixels_mpixels == 0 + else f"Calculated {max_pixels_mpixels} megapixels for OCR" + ) + self.log.debug(msg) + ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels + + return ocrmypdf_args + + def parse(self, document_path: Path, mime_type, file_name=None): + # This forces tesseract to use one core per page. 
+ os.environ["OMP_THREAD_LIMIT"] = "1" + VALID_TEXT_LENGTH = 50 + + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = ( + text_original is not None and len(text_original) > VALID_TEXT_LENGTH + ) + else: + text_original = None + original_has_text = False + + # If the original has text, and the user doesn't want an archive, + # we're done here + skip_archive_for_text = ( + self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE + or self.settings.skip_archive_file + in { + ArchiveFileChoices.WITH_TEXT, + ArchiveFileChoices.ALWAYS, + } + ) + if skip_archive_for_text and original_has_text: + self.log.debug("Document has text, skipping OCRmyPDF entirely.") + self.text = text_original + return + + # Either no text was in the original or there should be an archive + # file created, so OCR the file and create an archive with any + # text located via OCR + + import ocrmypdf + from ocrmypdf import EncryptedPdfError + from ocrmypdf import InputFileError + from ocrmypdf import SubprocessOutputError + + archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) + sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path, + sidecar_file, + ) + + try: + self.log.debug(f"Calling OCRmyPDF with args: {args}") + # ocrmypdf.ocr(**args) + self.log.info("gia tri document_path: ", document_path) + self.ocr_img_or_pdf(document_path, mime_type,**args) + if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: + self.archive_path = archive_path + + self.text = self.extract_text(sidecar_file, archive_path) + + if not self.text: + raise NoTextFoundException("No text was found in the original document") + except EncryptedPdfError: + self.log.warning( + "This file is encrypted, OCR is impossible. 
Using " + "any text present in the original file.", + ) + if original_has_text: + self.text = text_original + except SubprocessOutputError as e: + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", + ) + + raise ParseError( + f"SubprocessOutputError: {e!s}. See logs for more information.", + ) from e + except (NoTextFoundException, InputFileError) as e: + self.log.warning( + f"Encountered an error while running OCR: {e!s}. " + f"Attempting force OCR to get the text.", + ) + + archive_path_fallback = Path( + os.path.join(self.tempdir, "archive-fallback.pdf"), + ) + sidecar_file_fallback = Path( + os.path.join(self.tempdir, "sidecar-fallback.txt"), + ) + + # Attempt to run OCR with safe settings. + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path_fallback, + sidecar_file_fallback, + safe_fallback=True, + ) + + try: + self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}") + # ocrmypdf.ocr(**args) + self.ocr_img_or_pdf(document_path, mime_type,**args) + # Don't return the archived file here, since this file + # is bigger and blurry due to --force-ocr. + + self.text = self.extract_text( + sidecar_file_fallback, + archive_path_fallback, + ) + + except Exception as e: + # If this fails, we have a serious issue at hand. + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + + except Exception as e: + # Anything else is probably serious. + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + + # As a last resort, if we still don't have any text for any reason, + # try to extract the text from the original document. 
def post_process_text(text):
    """Normalise whitespace in extracted text.

    Runs of non-newline whitespace become a single space, whitespace that
    follows a line break is dropped, trailing whitespace is removed, the
    result is stripped and NUL characters are replaced by spaces.

    Returns:
        The cleaned string, or ``None`` when ``text`` is empty or ``None``.
    """
    if not text:
        return None

    substitutions = (
        # collapse runs of spaces/tabs (anything but line breaks)
        (r"([^\S\r\n]+)", " "),
        # drop whitespace directly after line breaks
        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
        # drop whitespace at the very end
        (r"([^\S\n\r]+)$", ""),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")