diff --git a/.env b/.env index a13d27ae9..823e53076 100644 --- a/.env +++ b/.env @@ -1,6 +1,7 @@ COMPOSE_PROJECT_NAME=paperless PAPERLESS_DEBUG=true +# PAPERLESS_REDIS=redis://localhost:6379 PAPERLESS_REDIS=redis://:@123bytech@172.16.100.203:9377 PAPERLESS_DBHOST=172.16.100.203 @@ -9,3 +10,7 @@ PAPERLESS_DBNAME=tc_edoc PAPERLESS_DBUSER=tc_edoc PAPERLESS_DBPASS=27M2MV58Re2Y PAPERLESS_DBSSLMODE=prefer + + +URL_UPLOAD_FILE = https://ocr-core-api.tcgroup.vn/api/v1/file/upload +URL_OCR_BY_FILEID = https://ocr-core-api.tcgroup.vn/api/v1/ocr/general diff --git a/requirements.txt b/requirements.txt index 97c33a9df..301c2e0bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -123,5 +123,6 @@ whoosh==2.7.4 wrapt==1.16.0; python_version >= '3.6' zstandard==0.22.0; python_version >= '3.8' zxing-cpp==2.2.0; platform_machine == 'x86_64' and python_version >= '3.6' +PyPDF2<3.0 python-decouple==3.8 diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index 685b5bb80..e35c15ec6 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -425,13 +425,6 @@ 22 - - Read the documentation about this setting - - src/app/components/admin/config/config.component.html - 25 - - Enable @@ -7217,18 +7210,25 @@ 164 + + OCR Key + + src/app/data/paperless-config.ts + 171 + + Application Logo src/app/data/paperless-config.ts - 171 + 178 Application Title src/app/data/paperless-config.ts - 178 + 185 diff --git a/src-ui/src/app/components/admin/config/config.component.html b/src-ui/src/app/components/admin/config/config.component.html index 03ca04b7b..71bb63e76 100644 --- a/src-ui/src/app/components/admin/config/config.component.html +++ b/src-ui/src/app/components/admin/config/config.component.html @@ -22,9 +22,9 @@
{{option.title}} - +
diff --git a/src-ui/src/app/data/paperless-config.ts b/src-ui/src/app/data/paperless-config.ts index 3ae485ff2..f6c203983 100644 --- a/src-ui/src/app/data/paperless-config.ts +++ b/src-ui/src/app/data/paperless-config.ts @@ -166,6 +166,13 @@ export const PaperlessConfigOptions: ConfigOption[] = [ config_key: 'PAPERLESS_OCR_USER_ARGS', category: ConfigCategory.OCR, }, + { + key: 'ocr_key', + title: $localize`OCR Key`, + type: ConfigOptionType.String, + config_key: 'PAPERLESS_APP_TITLE', + category: ConfigCategory.OCR, + }, { key: 'app_logo', title: $localize`Application Logo`, @@ -196,6 +203,7 @@ export interface PaperlessConfig extends ObjectWithId { max_image_pixels: number color_conversion_strategy: ColorConvertConfig user_args: object + ocr_key: string app_logo: string app_title: string } diff --git a/src-ui/src/locale/messages.vi_VN.xlf b/src-ui/src/locale/messages.vi_VN.xlf index 327cf13da..7c3ed0e08 100644 --- a/src-ui/src/locale/messages.vi_VN.xlf +++ b/src-ui/src/locale/messages.vi_VN.xlf @@ -497,7 +497,7 @@ src/app/components/admin/config/config.component.html 34 - Enable + Cho phép Discard @@ -4587,7 +4587,7 @@ src/app/components/common/input/switch/switch.component.html 39 - Note: value has not yet been set and will not apply until explicitly changed + Lưu ý: giá trị chưa được đặt và sẽ không áp dụng cho đến khi thay đổi rõ ràng Add tag @@ -5970,7 +5970,7 @@ src/app/components/document-detail/document-detail.component.ts 724 - Do you really want to delete document ""? + Bạn có thực sự muốn xóa tài liệu ""? The files for this document will be deleted permanently. This operation cannot be undone. @@ -5978,7 +5978,7 @@ src/app/components/document-detail/document-detail.component.ts 725 - The files for this document will be deleted permanently. This operation cannot be undone. + Các tập tin cho tài liệu này sẽ bị xóa vĩnh viễn. Không thể hoàn tác thao tác này Delete document @@ -5986,7 +5986,7 @@ src/app/components/document-detail/document-detail.component.ts 727 - Delete document + Xóa tài liệu Error deleting document @@ -6724,7 +6724,7 @@ src/app/components/document-list/document-list.component.html 146 - Sort by correspondent + Sắp xếp theo người biên tập Sort by title @@ -7132,7 +7132,7 @@ src/app/components/manage/correspondent-list/correspondent-list.component.ts 67 - Do you really want to delete the correspondent ""? + Bạn có thực sự muốn xóa người biên tập ""? Customize the data fields that can be attached to documents. @@ -7564,7 +7564,7 @@ src/app/components/manage/management-list/management-list.component.ts 180 - Successfully updated . + Đã cập nhật thành công. Error occurred while saving . @@ -7668,7 +7668,7 @@ src/app/components/manage/tag-list/tag-list.component.ts 53 - Do you really want to delete the tag ""? + Bạn có thực sự muốn xóa thẻ ""? Use workflows to customize the behavior of TC GROUP when events 'trigger' a workflow. @@ -7932,7 +7932,7 @@ src/app/data/paperless-config.ts 50 - General Settings + Cài đặt chung OCR Settings diff --git a/src/documents/consumer.py b/src/documents/consumer.py index c735ed4c8..fa3bf2e75 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -36,7 +36,7 @@ from documents.models import Tag from documents.models import Workflow from documents.models import WorkflowAction from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser +from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type from documents.parsers import parse_date @@ -557,7 +557,7 @@ class Consumer(LoggingMixin): self.log.debug(f"Detected mime type: {mime_type}") # Based on the mime type, get the parser for that type - parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type( + parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type( mime_type, ) if not parser_class: diff --git a/src/documents/parsers.py b/src/documents/parsers.py index d781ddb9f..3a43886d1 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -14,11 +14,13 @@ from typing import Optional from django.conf import settings from django.utils import timezone +import requests from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.models import ApplicationConfiguration # This regular expression will try to find dates in the document at # hand and will match the following formats: @@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar # Return the parser with the highest weight. return best_parser["parser"] +def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]: + """ + Returns the best parser (by weight) for the given mimetype or + None if no parser exists + """ + + options = [] + + for response in document_consumer_declaration.send(None): + parser_declaration = response[1] + supported_mime_types = parser_declaration["mime_types"] + + if mime_type in supported_mime_types: + options.append(parser_declaration) + + if not options: + return None + k = ApplicationConfiguration.objects.filter().first() + best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[1] + if k.ocr_key!='': + headers = { + 'Authorization': f'Bearer {k.ocr_key}' + } + url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"] + response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers) + logger.debug(f'status code: {response_ocr.status_code}') + if response_ocr.status_code != 401: + best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0] + logger.debug('Successful key authentication ...') + logger.debug('Fail key authentication ...', best_parser["parser"]) + # Return the parser with the highest weight. + return best_parser["parser"] def run_convert( input_file, diff --git a/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py new file mode 100644 index 000000000..a1a115f73 --- /dev/null +++ b/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.11 on 2024-05-22 02:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('paperless', '0003_alter_applicationconfiguration_max_image_pixels'), + ] + + operations = [ + migrations.AddField( + model_name='applicationconfiguration', + name='ocr_key', + field=models.CharField(blank=True, max_length=48, null=True, verbose_name='Sets key for advanced version'), + ), + ] diff --git a/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py new file mode 100644 index 000000000..1b33698ed --- /dev/null +++ b/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.11 on 2024-05-22 07:01 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('paperless', '0004_applicationconfiguration_ocr_key'), + ] + + operations = [ + migrations.AlterField( + model_name='applicationconfiguration', + name='ocr_key', + field=models.CharField(blank=True, max_length=100, null=True, verbose_name='Sets key for advanced version'), + ), + ] diff --git a/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py new file mode 100644 index 000000000..a3225215b --- /dev/null +++ b/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.11 on 2024-05-22 07:03 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('paperless', '0005_alter_applicationconfiguration_ocr_key'), + ] + + operations = [ + migrations.AlterField( + model_name='applicationconfiguration', + name='ocr_key', + field=models.CharField(blank=True, max_length=200, null=True, verbose_name='Sets key for advanced version'), + ), + ] diff --git a/src/paperless/models.py b/src/paperless/models.py index 1f6cfbced..03b74bbc9 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -184,6 +184,13 @@ class ApplicationConfiguration(AbstractSingletonModel): upload_to="logo/", ) + ocr_key = models.CharField( + verbose_name=_("Sets key for advanced version"), + null=True, + blank=True, + max_length=200, + ) + class Meta: verbose_name = _("paperless application settings") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 4e30db1b1..a63c8931b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -295,6 +295,7 @@ INSTALLED_APPS = [ "paperless", "documents.apps.DocumentsConfig", "paperless_tesseract.apps.PaperlessTesseractConfig", + "paperless_ocr_custom.apps.PaperlessTesseractConfig", "paperless_text.apps.PaperlessTextConfig", "paperless_mail.apps.PaperlessMailConfig", "django.contrib.admin", @@ -416,6 +417,14 @@ CHANNEL_LAYERS = { }, } +# PAPERLESS_OCR_CUSTOM +TCGROUP_OCR_CUSTOM = { + "URL": { + "URL_UPLOAD_FILE": os.getenv("URL_UPLOAD_FILE","https://ocr-core-api.tcgroup.vn/api/v1/file/upload"), + "URL_OCR_BY_FILEID": os.getenv("URL_OCR_BY_FILEID","https://ocr-core-api.tcgroup.vn/api/v1/ocr/general"), + } +} + ############################################################################### # Security # ############################################################################### diff --git a/src/paperless_ocr_custom/__init__.py b/src/paperless_ocr_custom/__init__.py new file mode 100644 index 000000000..c811b3c76 --- /dev/null +++ b/src/paperless_ocr_custom/__init__.py @@ -0,0 +1,5 @@ +# this is here so that django finds the checks. +from paperless_ocr_custom.checks import check_default_language_available +from paperless_ocr_custom.checks import get_tesseract_langs + +__all__ = ["get_tesseract_langs", "check_default_language_available"] diff --git a/src/paperless_ocr_custom/apps.py b/src/paperless_ocr_custom/apps.py new file mode 100644 index 000000000..e96602cfd --- /dev/null +++ b/src/paperless_ocr_custom/apps.py @@ -0,0 +1,14 @@ +from django.apps import AppConfig + +from paperless_ocr_custom.signals import tesseract_consumer_declaration + + +class PaperlessTesseractConfig(AppConfig): + name = "paperless_ocr_custom" + + def ready(self): + from documents.signals import document_consumer_declaration + + document_consumer_declaration.connect(tesseract_consumer_declaration) + + AppConfig.ready(self) diff --git a/src/paperless_ocr_custom/checks.py b/src/paperless_ocr_custom/checks.py new file mode 100644 index 000000000..82d255005 --- /dev/null +++ b/src/paperless_ocr_custom/checks.py @@ -0,0 +1,46 @@ +import shutil +import subprocess + +from django.conf import settings +from django.core.checks import Error +from django.core.checks import Warning +from django.core.checks import register + + +def get_tesseract_langs(): + proc = subprocess.run( + [shutil.which("tesseract"), "--list-langs"], + capture_output=True, + ) + + # Decode bytes to string, split on newlines, trim out the header + proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:] + + return [x.strip() for x in proc_lines] + + +@register() +def check_default_language_available(app_configs, **kwargs): + installed_langs = get_tesseract_langs() + + if not settings.OCR_LANGUAGE: + return [ + Warning( + "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " + "This means that tesseract will fallback to english.", + ), + ] + + specified_langs = settings.OCR_LANGUAGE.split("+") + + for lang in specified_langs: + if lang not in installed_langs: + return [ + Error( + f"The selected ocr language {lang} is " + f"not installed. Paperless cannot OCR your documents " + f"without it. Please fix PAPERLESS_OCR_LANGUAGE.", + ), + ] + + return [] diff --git a/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf new file mode 100644 index 000000000..a1e68a366 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf new file mode 100644 index 000000000..614f0af2c Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf new file mode 100644 index 000000000..5799f4149 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF new file mode 100644 index 000000000..62572a4a8 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF new file mode 100644 index 000000000..61c51a0cb Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF new file mode 100644 index 000000000..5ebda789c Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF differ diff --git a/src/paperless_ocr_custom/fonts/arial-font/arial.ttf b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf new file mode 100644 index 000000000..ad7d8eab8 Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf differ diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py new file mode 100644 index 000000000..3448151a9 --- /dev/null +++ b/src/paperless_ocr_custom/parsers.py @@ -0,0 +1,626 @@ +import io +import logging +import math +import os +import re +import shutil +import tempfile +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Optional + +from django.conf import settings +import requests +from PyPDF2 import PdfReader +from reportlab.pdfgen import canvas +from reportlab.lib.pagesizes import letter +from PIL import Image +from reportlab.pdfbase.ttfonts import TTFont +from reportlab.pdfbase import pdfmetrics +from pdf2image import convert_from_path +from reportlab.lib.utils import ImageReader + +from documents.parsers import DocumentParser +from documents.parsers import ParseError +from documents.parsers import make_thumbnail_from_pdf +from documents.utils import maybe_override_pixel_limit +from documents.utils import run_subprocess +from paperless.config import OcrConfig +from paperless.models import ApplicationConfiguration, ArchiveFileChoices +from paperless.models import CleanChoices +from paperless.models import ModeChoices + + +class NoTextFoundException(Exception): + pass + + +class RtlLanguageException(Exception): + pass + + +class RasterisedDocumentParser(DocumentParser): + """ + This parser uses Tesseract to try and get some text out of a rasterised + image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.) + """ + + logging_name = "paperless.parsing.tesseract" + + def get_settings(self) -> OcrConfig: + """ + This parser uses the OCR configuration settings to parse documents + """ + return OcrConfig() + + def extract_metadata(self, document_path, mime_type): + result = [] + if mime_type == "application/pdf": + import pikepdf + + namespace_pattern = re.compile(r"\{(.*)\}(.*)") + + pdf = pikepdf.open(document_path) + meta = pdf.open_metadata() + for key, value in meta.items(): + if isinstance(value, list): + value = " ".join([str(e) for e in value]) + value = str(value) + try: + m = namespace_pattern.match(key) + if m is None: # pragma: no cover + continue + namespace = m.group(1) + key_value = m.group(2) + try: + namespace.encode("utf-8") + key_value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping metadata key {key}: {e}") + continue + result.append( + { + "namespace": namespace, + "prefix": meta.REVERSE_NS[namespace], + "key": key_value, + "value": value, + }, + ) + except Exception as e: + self.log.warning( + f"Error while reading metadata {key}: {value}. Error: {e}", + ) + return result + + def get_thumbnail(self, document_path, mime_type, file_name=None): + return make_thumbnail_from_pdf( + self.archive_path or document_path, + self.tempdir, + self.logging_group, + ) + + def is_image(self, mime_type) -> bool: + return mime_type in [ + "image/png", + "image/jpeg", + "image/tiff", + "image/bmp", + "image/gif", + "image/webp", + ] + + def has_alpha(self, image) -> bool: + with Image.open(image) as im: + return im.mode in ("RGBA", "LA") + + def remove_alpha(self, image_path: str) -> Path: + no_alpha_image = Path(self.tempdir) / "image-no-alpha" + run_subprocess( + [ + settings.CONVERT_BINARY, + "-alpha", + "off", + image_path, + no_alpha_image, + ], + logger=self.log, + ) + return no_alpha_image + + def get_dpi(self, image) -> Optional[int]: + try: + with Image.open(image) as im: + x, y = im.info["dpi"] + return round(x) + except Exception as e: + self.log.warning(f"Error while getting DPI from image {image}: {e}") + return None + + def calculate_a4_dpi(self, image) -> Optional[int]: + try: + with Image.open(image) as im: + width, height = im.size + # divide image width by A4 width (210mm) in inches. + dpi = int(width / (21 / 2.54)) + self.log.debug(f"Estimated DPI {dpi} based on image width {width}") + return dpi + + except Exception as e: + self.log.warning(f"Error while calculating DPI for image {image}: {e}") + return None + # get ocr file img/pdf + def ocr_file(self,path_file): + # get text from api + # ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"] + # ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"] + # url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"] + # data = { + # 'username': ocr_custom_username, + # 'password': ocr_custom_password + # } + # response_login = requests.post(url_login, data=data) + # access_token = '' + # if response_login.status_code == 200: + # response_data = response_login.json() + # access_token = response_data.get('access_token','') + # else: + # logging.error('login: ', response_login.status_code) + + k = ApplicationConfiguration.objects.filter().first() + access_token = k.ocr_key + # upload file + get_file_id = '' + url_upload_file = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_UPLOAD_FILE"] + headers = { + 'Authorization': f'Bearer {access_token}' + } + pdf_data = None + with open(path_file, 'rb') as file: + pdf_data = file.read() + + response_upload = requests.post(url_upload_file, files={'file': (str(path_file).split("/")[-1], pdf_data)}, headers=headers) + # logging.debug('pdf file',response_upload) + if response_upload.status_code == 200: + get_file_id = response_upload.json().get('file_id','') + else: + logging.error('upload file: ',response_upload.status_code) + + # ocr by file_id + params = {'file_id': get_file_id} + url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"] + response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params) + data_ocr = None + # logging.error('ocr: ', response_ocr.status_code) + if response_ocr.status_code == 200: + data_ocr = response_ocr.json() + else: + logging.error('ocr: ', response_ocr.text) + return data_ocr + + + def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path): + font_name = 'Arial' + data = self.ocr_file(input_path) + if not data: + return + font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fonts', 'arial-font/arial.ttf') + with open(sidecar, "w") as txt_sidecar: + txt_sidecar.write(data.get("content","")) + if self.is_image(mime_type): + img = Image.open(input_path) + width, height = img.size + c = canvas.Canvas(str(output_path), pagesize=(width, height)) + pdfmetrics.registerFont(TTFont(font_name, font_path)) + # c.drawImage(input_path, 0, 0, width=width, height=height) + for page in data["pages"]: + for block in page["blocks"]: + for line in block.get("lines", []): + y1 = line.get("bbox")[0][1] + y2 = line.get("bbox")[1][1] + font_size = math.floor((y2 - y1) * 72 / 96) + y_center_coordinates = y2 - (y2 - y1)/2 + for word in line.get("words", []): + x1 = word["bbox"][0][0] + # y1 = word["bbox"][0][1] + x2 = word["bbox"][1][0] + # y2 = word["bbox"][1][1] + value = word["value"] + # font_size = math.ceil(float(y2-y1) * 72 / 96) + # font_size = (y2-y1) * 72 / 96 + x_center_coordinates =x2 - (x2-x1)/2 + # y_center_coordinates =y2 - (y2-y1)/2 + w = c.stringWidth(value, font_name, font_size) + c.setFont('Arial', font_size) + c.drawString(x_center_coordinates - w/2, + height - y_center_coordinates - (font_size/2), + value) + c.drawImage(input_path, 0, 0, width=width, height=height) + c.save() + else: + shutil.copy(str(input_path), str(output_path)) + input_pdf = PdfReader(input_path) + images = convert_from_path(input_path, + first_page=1, + last_page=input_pdf.getNumPages()+1) + can = canvas.Canvas(str(output_path), pagesize=letter) + for page_num, page in enumerate(input_pdf.pages): + page_height = input_pdf.pages[page_num].mediabox[3] + page_width = input_pdf.pages[page_num].mediabox[2] + # set size new page + can.setPageSize((page_width, page_height)) + byte_image = io.BytesIO() + images[page_num].save(byte_image, format='JPEG') + jpg_image = byte_image.getvalue() + # can.drawImage(ImageReader(io.BytesIO(jpg_image)), + # 0, 0, + # width=float(page_width), + # height=float(page_height)) + # set font size + pdfmetrics.registerFont(TTFont('Arial', font_path)) + width_api_img = data["pages"][page_num]["dimensions"][1] + height_api_img = data["pages"][page_num]["dimensions"][0] + rolate_height = height_api_img /page_height + rolate_width = width_api_img /page_width + for block in data["pages"][page_num]["blocks"]: + for line in block.get("lines", []): + y1 = (line.get("bbox")[0][1] / float(rolate_height)) + y2 = (line.get("bbox")[1][1] / float(rolate_height)) + font_size = (y2 - y1) * 72 / 96 + y_center_coordinates = y2 - (y2 - y1)/2 + for word in line.get("words", []): + x1 = word["bbox"][0][0] / float(rolate_width) + # y1 = word["bbox"][0][1] / float(rolate_height) + x2 = word["bbox"][1][0] / float(rolate_width) + # y2 = word["bbox"][1][1] / float(rolate_height) + value = word["value"] + # font_size = float(y2-y1) * 72 / 96 + x_center_coordinates = x2 - (x2-x1)/2 + # y_center_coordinates =y2 - (y2-y1)/2 + w = can.stringWidth(value, font_name, font_size) + can.setFont('Arial', font_size) + can.drawString(x_center_coordinates - w/2, + int(page_height) - y_center_coordinates - (font_size/3), + value) + can.drawImage(ImageReader(io.BytesIO(jpg_image)), + 0, 0, + width=float(page_width), + height=float(page_height)) + can.showPage() + can.save() + return + + + + + def ocr_img_or_pdf(self, document_path, mime_type, sidecar, output_file, **kwargs): + self.render_pdf_ocr(sidecar, mime_type, document_path, output_file) + + + def extract_text( + self, + sidecar_file: Optional[Path], + pdf_file: Path, + ) -> Optional[str]: + # When re-doing OCR, the sidecar contains ONLY the new text, not + # the whole text, so do not utilize it in that case + if ( + sidecar_file is not None + and os.path.isfile(sidecar_file) + and self.settings.mode != "redo" + ): + text = self.read_file_handle_unicode_errors(sidecar_file) + + if "[OCR skipped on page" not in text: + # This happens when there's already text in the input file. + # The sidecar file will only contain text for OCR'ed pages. + self.log.debug("Using text from sidecar file") + return post_process_text(text) + else: + self.log.debug("Incomplete sidecar file: discarding.") + + # no success with the sidecar file, try PDF + + if not os.path.isfile(pdf_file): + return None + + try: + text = None + with tempfile.NamedTemporaryFile( + mode="w+", + dir=self.tempdir, + ) as tmp: + run_subprocess( + [ + "pdftotext", + "-q", + "-layout", + "-enc", + "UTF-8", + pdf_file, + tmp.name, + ], + logger=self.log, + ) + text = self.read_file_handle_unicode_errors(Path(tmp.name)) + + # data_ocr = self.ocr_file(pdf_file).get('content','') + # if not data_ocr: + # data_ocr = '' + + # logging.info() + return post_process_text(text) + + except Exception: + # If pdftotext fails, fall back to OCR. + self.log.warning( + "Error while getting text from PDF document with pdftotext", + exc_info=True, + ) + # probably not a PDF file. + return None + + def construct_ocrmypdf_parameters( + self, + input_file, + mime_type, + output_file, + sidecar_file, + safe_fallback=False, + ): + if TYPE_CHECKING: + assert isinstance(self.settings, OcrConfig) + ocrmypdf_args = { + "input_file": input_file, + "output_file": output_file, + # need to use threads, since this will be run in daemonized + # processes via the task library. + "use_threads": True, + "jobs": settings.THREADS_PER_WORKER, + "language": self.settings.language, + "output_type": self.settings.output_type, + "progress_bar": False, + } + + if "pdfa" in ocrmypdf_args["output_type"]: + ocrmypdf_args["color_conversion_strategy"] = ( + self.settings.color_conversion_strategy + ) + + if self.settings.mode == ModeChoices.FORCE or safe_fallback: + ocrmypdf_args["force_ocr"] = True + elif self.settings.mode in { + ModeChoices.SKIP, + ModeChoices.SKIP_NO_ARCHIVE, + }: + ocrmypdf_args["skip_text"] = True + elif self.settings.mode == ModeChoices.REDO: + ocrmypdf_args["redo_ocr"] = True + else: # pragma: no cover + raise ParseError(f"Invalid ocr mode: {self.settings.mode}") + + if self.settings.clean == CleanChoices.CLEAN: + ocrmypdf_args["clean"] = True + elif self.settings.clean == CleanChoices.FINAL: + if self.settings.mode == ModeChoices.REDO: + ocrmypdf_args["clean"] = True + else: + # --clean-final is not compatible with --redo-ocr + ocrmypdf_args["clean_final"] = True + + if self.settings.deskew and self.settings.mode != ModeChoices.REDO: + # --deskew is not compatible with --redo-ocr + ocrmypdf_args["deskew"] = True + + if self.settings.rotate: + ocrmypdf_args["rotate_pages"] = True + ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold + + if self.settings.pages is not None and self.settings.pages > 0: + ocrmypdf_args["pages"] = f"1-{self.settings.pages}" + else: + # sidecar is incompatible with pages + ocrmypdf_args["sidecar"] = sidecar_file + + if self.is_image(mime_type): + # This may be required, depending on the known imformation + maybe_override_pixel_limit() + + dpi = self.get_dpi(input_file) + a4_dpi = self.calculate_a4_dpi(input_file) + + if self.has_alpha(input_file): + self.log.info( + f"Removing alpha layer from {input_file} " + "for compatibility with img2pdf", + ) + # Replace the input file with the non-alpha + ocrmypdf_args["input_file"] = self.remove_alpha(input_file) + + if dpi: + self.log.debug(f"Detected DPI for image {input_file}: {dpi}") + ocrmypdf_args["image_dpi"] = dpi + elif self.settings.image_dpi is not None: + ocrmypdf_args["image_dpi"] = self.settings.image_dpi + elif a4_dpi: + ocrmypdf_args["image_dpi"] = a4_dpi + else: + raise ParseError( + f"Cannot produce archive PDF for image {input_file}, " + f"no DPI information is present in this image and " + f"OCR_IMAGE_DPI is not set.", + ) + if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover + self.log.warning( + f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail", + ) + + if self.settings.user_args is not None: + try: + ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args} + except Exception as e: + self.log.warning( + f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " + f"they will not be used. Error: {e}", + ) + + if ( + self.settings.max_image_pixel is not None + and self.settings.max_image_pixel >= 0 + ): + # Convert pixels to mega-pixels and provide to ocrmypdf + max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0 + msg = ( + "OCR pixel limit is disabled!" + if max_pixels_mpixels == 0 + else f"Calculated {max_pixels_mpixels} megapixels for OCR" + ) + self.log.debug(msg) + ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels + + return ocrmypdf_args + + def parse(self, document_path: Path, mime_type, file_name=None): + # This forces tesseract to use one core per page. + os.environ["OMP_THREAD_LIMIT"] = "1" + VALID_TEXT_LENGTH = 50 + + if mime_type == "application/pdf": + text_original = self.extract_text(None, document_path) + original_has_text = ( + text_original is not None and len(text_original) > VALID_TEXT_LENGTH + ) + else: + text_original = None + original_has_text = False + + # If the original has text, and the user doesn't want an archive, + # we're done here + skip_archive_for_text = ( + self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE + or self.settings.skip_archive_file + in { + ArchiveFileChoices.WITH_TEXT, + ArchiveFileChoices.ALWAYS, + } + ) + if skip_archive_for_text and original_has_text: + self.log.debug("Document has text, skipping OCRmyPDF entirely.") + self.text = text_original + return + + # Either no text was in the original or there should be an archive + # file created, so OCR the file and create an archive with any + # text located via OCR + + import ocrmypdf + from ocrmypdf import EncryptedPdfError + from ocrmypdf import InputFileError + from ocrmypdf import SubprocessOutputError + + archive_path = Path(os.path.join(self.tempdir, "archive.pdf")) + sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt")) + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path, + sidecar_file, + ) + + try: + self.log.debug(f"Calling OCRmyPDF with args: {args}") + # ocrmypdf.ocr(**args) + self.log.info("gia tri document_path: ", document_path) + self.ocr_img_or_pdf(document_path, mime_type,**args) + if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: + self.archive_path = archive_path + + self.text = self.extract_text(sidecar_file, archive_path) + + if not self.text: + raise NoTextFoundException("No text was found in the original document") + except EncryptedPdfError: + self.log.warning( + "This file is encrypted, OCR is impossible. Using " + "any text present in the original file.", + ) + if original_has_text: + self.text = text_original + except SubprocessOutputError as e: + if "Ghostscript PDF/A rendering" in str(e): + self.log.warning( + "Ghostscript PDF/A rendering failed, consider setting " + "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'", + ) + + raise ParseError( + f"SubprocessOutputError: {e!s}. See logs for more information.", + ) from e + except (NoTextFoundException, InputFileError) as e: + self.log.warning( + f"Encountered an error while running OCR: {e!s}. " + f"Attempting force OCR to get the text.", + ) + + archive_path_fallback = Path( + os.path.join(self.tempdir, "archive-fallback.pdf"), + ) + sidecar_file_fallback = Path( + os.path.join(self.tempdir, "sidecar-fallback.txt"), + ) + + # Attempt to run OCR with safe settings. + + args = self.construct_ocrmypdf_parameters( + document_path, + mime_type, + archive_path_fallback, + sidecar_file_fallback, + safe_fallback=True, + ) + + try: + self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}") + # ocrmypdf.ocr(**args) + self.ocr_img_or_pdf(document_path, mime_type,**args) + # Don't return the archived file here, since this file + # is bigger and blurry due to --force-ocr. + + self.text = self.extract_text( + sidecar_file_fallback, + archive_path_fallback, + ) + + except Exception as e: + # If this fails, we have a serious issue at hand. + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + + except Exception as e: + # Anything else is probably serious. + raise ParseError(f"{e.__class__.__name__}: {e!s}") from e + + # As a last resort, if we still don't have any text for any reason, + # try to extract the text from the original document. + if not self.text: + if original_has_text: + self.text = text_original + else: + self.log.warning( + f"No text was found in {document_path}, the content will " + f"be empty.", + ) + self.text = "" + + +def post_process_text(text): + if not text: + return None + + collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) + no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces) + no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace) + + # TODO: this needs a rework + # replace \0 prevents issues with saving to postgres. + # text may contain \0 when this character is present in PDF files. + return no_trailing_whitespace.strip().replace("\0", " ") diff --git a/src/paperless_ocr_custom/signals.py b/src/paperless_ocr_custom/signals.py new file mode 100644 index 000000000..d03450b5f --- /dev/null +++ b/src/paperless_ocr_custom/signals.py @@ -0,0 +1,20 @@ +def get_parser(*args, **kwargs): + from paperless_ocr_custom.parsers import RasterisedDocumentParser + + return RasterisedDocumentParser(*args, **kwargs) + + +def tesseract_consumer_declaration(sender, **kwargs): + return { + "parser": get_parser, + "weight": 1, + "mime_types": { + "application/pdf": ".pdf", + "image/jpeg": ".jpg", + "image/png": ".png", + "image/tiff": ".tif", + "image/gif": ".gif", + "image/bmp": ".bmp", + "image/webp": ".webp", + }, + }