diff --git a/src-ui/src/app/components/admin/config/config.component.html b/src-ui/src/app/components/admin/config/config.component.html
index 03ca04b7b..71bb63e76 100644
--- a/src-ui/src/app/components/admin/config/config.component.html
+++ b/src-ui/src/app/components/admin/config/config.component.html
@@ -22,9 +22,9 @@
diff --git a/src-ui/src/app/data/paperless-config.ts b/src-ui/src/app/data/paperless-config.ts
index 3ae485ff2..f6c203983 100644
--- a/src-ui/src/app/data/paperless-config.ts
+++ b/src-ui/src/app/data/paperless-config.ts
@@ -166,6 +166,13 @@ export const PaperlessConfigOptions: ConfigOption[] = [
config_key: 'PAPERLESS_OCR_USER_ARGS',
category: ConfigCategory.OCR,
},
+  {
+    key: 'ocr_key',
+    title: $localize`OCR Key`,
+    type: ConfigOptionType.String,
+    // NOTE(review): this was 'PAPERLESS_APP_TITLE', which looks copy-pasted
+    // from the application title option - confirm the intended variable name.
+    config_key: 'PAPERLESS_OCR_KEY',
+    category: ConfigCategory.OCR,
+  },
{
key: 'app_logo',
title: $localize`Application Logo`,
@@ -196,6 +203,7 @@ export interface PaperlessConfig extends ObjectWithId {
max_image_pixels: number
color_conversion_strategy: ColorConvertConfig
user_args: object
+ ocr_key: string
app_logo: string
app_title: string
}
diff --git a/src-ui/src/locale/messages.vi_VN.xlf b/src-ui/src/locale/messages.vi_VN.xlf
index 327cf13da..7c3ed0e08 100644
--- a/src-ui/src/locale/messages.vi_VN.xlf
+++ b/src-ui/src/locale/messages.vi_VN.xlf
@@ -497,7 +497,7 @@
src/app/components/admin/config/config.component.html
34
- Enable
+ Cho phép
Discard
@@ -4587,7 +4587,7 @@
src/app/components/common/input/switch/switch.component.html
39
- Note: value has not yet been set and will not apply until explicitly changed
+ Lưu ý: giá trị chưa được đặt và sẽ không áp dụng cho đến khi thay đổi rõ ràng
Add tag
@@ -5970,7 +5970,7 @@
src/app/components/document-detail/document-detail.component.ts
724
- Do you really want to delete document ""?
+ Bạn có thực sự muốn xóa tài liệu ""?
The files for this document will be deleted permanently. This operation cannot be undone.
@@ -5978,7 +5978,7 @@
src/app/components/document-detail/document-detail.component.ts
725
- The files for this document will be deleted permanently. This operation cannot be undone.
+ Các tập tin cho tài liệu này sẽ bị xóa vĩnh viễn. Không thể hoàn tác thao tác này
Delete document
@@ -5986,7 +5986,7 @@
src/app/components/document-detail/document-detail.component.ts
727
- Delete document
+ Xóa tài liệu
Error deleting document
@@ -6724,7 +6724,7 @@
src/app/components/document-list/document-list.component.html
146
- Sort by correspondent
+ Sắp xếp theo người biên tập
Sort by title
@@ -7132,7 +7132,7 @@
src/app/components/manage/correspondent-list/correspondent-list.component.ts
67
- Do you really want to delete the correspondent ""?
+ Bạn có thực sự muốn xóa người biên tập ""?
Customize the data fields that can be attached to documents.
@@ -7564,7 +7564,7 @@
src/app/components/manage/management-list/management-list.component.ts
180
- Successfully updated .
+ Đã cập nhật thành công.
Error occurred while saving .
@@ -7668,7 +7668,7 @@
src/app/components/manage/tag-list/tag-list.component.ts
53
- Do you really want to delete the tag ""?
+ Bạn có thực sự muốn xóa thẻ ""?
Use workflows to customize the behavior of TC GROUP when events 'trigger' a workflow.
@@ -7932,7 +7932,7 @@
src/app/data/paperless-config.ts
50
- General Settings
+ Cài đặt chung
OCR Settings
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index c735ed4c8..fa3bf2e75 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -36,7 +36,7 @@ from documents.models import Tag
from documents.models import Workflow
from documents.models import WorkflowAction
from documents.models import WorkflowTrigger
-from documents.parsers import DocumentParser
+from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
@@ -557,7 +557,7 @@ class Consumer(LoggingMixin):
self.log.debug(f"Detected mime type: {mime_type}")
# Based on the mime type, get the parser for that type
- parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type(
+ parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type(
mime_type,
)
if not parser_class:
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index d781ddb9f..3a43886d1 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -14,11 +14,13 @@ from typing import Optional
from django.conf import settings
from django.utils import timezone
+import requests
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
+from paperless.models import ApplicationConfiguration
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
# Return the parser with the highest weight.
return best_parser["parser"]
+def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
+    """
+    Return the parser to use for ``mime_type``, or None if no parser
+    declares support for it.
+
+    Candidates are ranked by their declared weight. When only one parser
+    supports the mime type it is returned directly. Otherwise the
+    highest-weight parser is selected only if an OCR key is configured and
+    the OCR endpoint does not answer 401 for it; in every other case
+    (no key, request error, 401) the second-ranked parser is returned.
+    """
+    options = [
+        response[1]
+        for response in document_consumer_declaration.send(None)
+        if mime_type in response[1]["mime_types"]
+    ]
+    if not options:
+        return None
+
+    ranked = sorted(options, key=lambda option: option["weight"], reverse=True)
+    if len(ranked) == 1:
+        # Nothing to choose between. Indexing the runner-up here would raise
+        # IndexError for every mime type that has a single parser.
+        return ranked[0]["parser"]
+
+    preferred, fallback = ranked[0], ranked[1]
+
+    config = ApplicationConfiguration.objects.filter().first()
+    # The configuration row may not exist yet and ocr_key is nullable.
+    ocr_key = config.ocr_key if config is not None else None
+    if not ocr_key:
+        logger.debug("No OCR key configured, using the fallback parser")
+        return fallback["parser"]
+
+    headers = {"Authorization": f"Bearer {ocr_key}"}
+    url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
+    try:
+        # Bounded wait: an unreachable OCR service must not hang consumption.
+        response_ocr = requests.post(
+            url_ocr_pdf_by_fileid,
+            headers=headers,
+            timeout=10,
+        )
+    except requests.RequestException as e:
+        logger.warning(f"OCR key check failed, using the fallback parser: {e}")
+        return fallback["parser"]
+
+    logger.debug(f"status code: {response_ocr.status_code}")
+    if response_ocr.status_code == 401:
+        logger.debug("Fail key authentication ...")
+        return fallback["parser"]
+
+    logger.debug("Successful key authentication ...")
+    return preferred["parser"]
def run_convert(
input_file,
diff --git a/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py
new file mode 100644
index 000000000..a1a115f73
--- /dev/null
+++ b/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.11 on 2024-05-22 02:52
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Adds the optional, nullable ``ocr_key`` char field (max_length=48) to
+ # the ``applicationconfiguration`` model.
+
+ dependencies = [
+ ('paperless', '0003_alter_applicationconfiguration_max_image_pixels'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='applicationconfiguration',
+ name='ocr_key',
+ field=models.CharField(blank=True, max_length=48, null=True, verbose_name='Sets key for advanced version'),
+ ),
+ ]
diff --git a/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py
new file mode 100644
index 000000000..1b33698ed
--- /dev/null
+++ b/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.11 on 2024-05-22 07:01
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Alters ``applicationconfiguration.ocr_key``; the field is declared here
+ # with max_length=100.
+
+ dependencies = [
+ ('paperless', '0004_applicationconfiguration_ocr_key'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='applicationconfiguration',
+ name='ocr_key',
+ field=models.CharField(blank=True, max_length=100, null=True, verbose_name='Sets key for advanced version'),
+ ),
+ ]
diff --git a/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py b/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py
new file mode 100644
index 000000000..a3225215b
--- /dev/null
+++ b/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.11 on 2024-05-22 07:03
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ # Alters ``applicationconfiguration.ocr_key``; the field is declared here
+ # with max_length=200.
+
+ dependencies = [
+ ('paperless', '0005_alter_applicationconfiguration_ocr_key'),
+ ]
+
+ operations = [
+ migrations.AlterField(
+ model_name='applicationconfiguration',
+ name='ocr_key',
+ field=models.CharField(blank=True, max_length=200, null=True, verbose_name='Sets key for advanced version'),
+ ),
+ ]
diff --git a/src/paperless/models.py b/src/paperless/models.py
index 1f6cfbced..03b74bbc9 100644
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -184,6 +184,13 @@ class ApplicationConfiguration(AbstractSingletonModel):
upload_to="logo/",
)
+ # Optional key, up to 200 characters. Because the column is declared with
+ # both null=True and blank=True, a stored value can be None as well as "",
+ # so code reading it has to handle both.
+ ocr_key = models.CharField(
+ verbose_name=_("Sets key for advanced version"),
+ null=True,
+ blank=True,
+ max_length=200,
+ )
+
class Meta:
verbose_name = _("paperless application settings")
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 4e30db1b1..a63c8931b 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -295,6 +295,7 @@ INSTALLED_APPS = [
"paperless",
"documents.apps.DocumentsConfig",
"paperless_tesseract.apps.PaperlessTesseractConfig",
+ "paperless_ocr_custom.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"django.contrib.admin",
@@ -416,6 +417,14 @@ CHANNEL_LAYERS = {
},
}
+# Endpoints of the external OCR service. Each URL is read from the
+# environment variable of the same name and falls back to the default
+# given as the second argument of os.getenv().
+TCGROUP_OCR_CUSTOM = {
+ "URL": {
+ "URL_UPLOAD_FILE": os.getenv("URL_UPLOAD_FILE","https://ocr-core-api.tcgroup.vn/api/v1/file/upload"),
+ "URL_OCR_BY_FILEID": os.getenv("URL_OCR_BY_FILEID","https://ocr-core-api.tcgroup.vn/api/v1/ocr/general"),
+ }
+}
+
###############################################################################
# Security #
###############################################################################
diff --git a/src/paperless_ocr_custom/__init__.py b/src/paperless_ocr_custom/__init__.py
new file mode 100644
index 000000000..c811b3c76
--- /dev/null
+++ b/src/paperless_ocr_custom/__init__.py
@@ -0,0 +1,5 @@
+# this is here so that django finds the checks.
+from paperless_ocr_custom.checks import check_default_language_available
+from paperless_ocr_custom.checks import get_tesseract_langs
+
+__all__ = ["get_tesseract_langs", "check_default_language_available"]
diff --git a/src/paperless_ocr_custom/apps.py b/src/paperless_ocr_custom/apps.py
new file mode 100644
index 000000000..e96602cfd
--- /dev/null
+++ b/src/paperless_ocr_custom/apps.py
@@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_ocr_custom.signals import tesseract_consumer_declaration
+
+
+class PaperlessTesseractConfig(AppConfig):
+    """
+    App config for ``paperless_ocr_custom``; connects this app's parser
+    declaration to the document consumer signal once the app is ready.
+    """
+
+    name = "paperless_ocr_custom"
+
+    def ready(self):
+        # Imported inside ready() rather than at module level, as before.
+        from documents import signals as document_signals
+
+        document_signals.document_consumer_declaration.connect(
+            tesseract_consumer_declaration,
+        )
+
+        super().ready()
diff --git a/src/paperless_ocr_custom/checks.py b/src/paperless_ocr_custom/checks.py
new file mode 100644
index 000000000..82d255005
--- /dev/null
+++ b/src/paperless_ocr_custom/checks.py
@@ -0,0 +1,46 @@
+import shutil
+import subprocess
+
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import Warning
+from django.core.checks import register
+
+
+def get_tesseract_langs():
+    """
+    Return the language names printed by ``tesseract --list-langs``.
+
+    Returns an empty list when no ``tesseract`` executable is found on PATH.
+    """
+    tesseract_binary = shutil.which("tesseract")
+    if tesseract_binary is None:
+        # subprocess.run() raises TypeError when the executable is None.
+        return []
+
+    proc = subprocess.run(
+        [tesseract_binary, "--list-langs"],
+        capture_output=True,
+    )
+
+    # Decode bytes to string, split on newlines, trim out the header
+    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
+
+    return [x.strip() for x in proc_lines]
+
+
+@register()
+def check_default_language_available(app_configs, **kwargs):
+    """
+    System check comparing ``settings.OCR_LANGUAGE`` with the languages
+    returned by ``get_tesseract_langs()``.
+
+    Returns one Warning when no language is configured, one Error for the
+    first configured language that is not installed, and an empty list
+    otherwise.
+    """
+    installed_langs = get_tesseract_langs()
+
+    if settings.OCR_LANGUAGE:
+        # Several languages are joined with "+" in the setting.
+        for lang in settings.OCR_LANGUAGE.split("+"):
+            if lang in installed_langs:
+                continue
+            return [
+                Error(
+                    f"The selected ocr language {lang} is "
+                    f"not installed. Paperless cannot OCR your documents "
+                    f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
+                ),
+            ]
+        return []
+
+    return [
+        Warning(
+            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
+            "This means that tesseract will fallback to english.",
+        ),
+    ]
diff --git a/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf
new file mode 100644
index 000000000..a1e68a366
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf
new file mode 100644
index 000000000..614f0af2c
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf
new file mode 100644
index 000000000..5799f4149
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF
new file mode 100644
index 000000000..62572a4a8
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF
new file mode 100644
index 000000000..61c51a0cb
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF
new file mode 100644
index 000000000..5ebda789c
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF differ
diff --git a/src/paperless_ocr_custom/fonts/arial-font/arial.ttf b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf
new file mode 100644
index 000000000..ad7d8eab8
Binary files /dev/null and b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf differ
diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py
new file mode 100644
index 000000000..3448151a9
--- /dev/null
+++ b/src/paperless_ocr_custom/parsers.py
@@ -0,0 +1,626 @@
+import io
+import logging
+import math
+import os
+import re
+import shutil
+import tempfile
+from pathlib import Path
+from typing import TYPE_CHECKING
+from typing import Optional
+
+from django.conf import settings
+import requests
+from PyPDF2 import PdfReader
+from reportlab.pdfgen import canvas
+from reportlab.lib.pagesizes import letter
+from PIL import Image
+from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.pdfbase import pdfmetrics
+from pdf2image import convert_from_path
+from reportlab.lib.utils import ImageReader
+
+from documents.parsers import DocumentParser
+from documents.parsers import ParseError
+from documents.parsers import make_thumbnail_from_pdf
+from documents.utils import maybe_override_pixel_limit
+from documents.utils import run_subprocess
+from paperless.config import OcrConfig
+from paperless.models import ApplicationConfiguration, ArchiveFileChoices
+from paperless.models import CleanChoices
+from paperless.models import ModeChoices
+
+
+class NoTextFoundException(Exception):
+    """Raised by the parser when OCR produced no text for a document."""
+
+
+class RtlLanguageException(Exception):
+    """Marker exception type; none of the code shown in this module raises it."""
+
+
+class RasterisedDocumentParser(DocumentParser):
+ """
+ This parser uses Tesseract to try and get some text out of a rasterised
+ image, whether it's a PDF, or other graphical format (JPEG, TIFF, etc.)
+ """
+
+ logging_name = "paperless.parsing.tesseract"
+
+    def get_settings(self) -> OcrConfig:
+        """
+        Return a freshly built OCR configuration object for this parser
+        """
+        ocr_config = OcrConfig()
+        return ocr_config
+
+    def extract_metadata(self, document_path, mime_type):
+        """
+        Collect the metadata entries of a PDF document.
+
+        Returns a list of dicts with the keys ``namespace``, ``prefix``,
+        ``key`` and ``value``. The list is empty for any mime type other than
+        ``application/pdf``. Entries that cannot be processed are logged and
+        skipped.
+        """
+        result = []
+        if mime_type != "application/pdf":
+            return result
+
+        import pikepdf
+
+        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
+
+        # Use the PDF as a context manager so its file handle is closed once
+        # the metadata has been read.
+        with pikepdf.open(document_path) as pdf:
+            meta = pdf.open_metadata()
+            for key, value in meta.items():
+                if isinstance(value, list):
+                    value = " ".join([str(e) for e in value])
+                value = str(value)
+                try:
+                    m = namespace_pattern.match(key)
+                    if m is None:  # pragma: no cover
+                        continue
+                    namespace = m.group(1)
+                    key_value = m.group(2)
+                    try:
+                        # Skip entries whose names cannot be encoded as UTF-8.
+                        namespace.encode("utf-8")
+                        key_value.encode("utf-8")
+                    except UnicodeEncodeError as e:  # pragma: no cover
+                        self.log.debug(f"Skipping metadata key {key}: {e}")
+                        continue
+                    result.append(
+                        {
+                            "namespace": namespace,
+                            "prefix": meta.REVERSE_NS[namespace],
+                            "key": key_value,
+                            "value": value,
+                        },
+                    )
+                except Exception as e:
+                    self.log.warning(
+                        f"Error while reading metadata {key}: {value}. Error: {e}",
+                    )
+        return result
+
+    def get_thumbnail(self, document_path, mime_type, file_name=None):
+        """
+        Build the thumbnail from the archive PDF when one is set, otherwise
+        from the original document.
+        """
+        source_pdf = self.archive_path or document_path
+        return make_thumbnail_from_pdf(source_pdf, self.tempdir, self.logging_group)
+
+    def is_image(self, mime_type) -> bool:
+        """Tell whether ``mime_type`` is one of the image types listed below."""
+        image_mime_types = (
+            "image/png",
+            "image/jpeg",
+            "image/tiff",
+            "image/bmp",
+            "image/gif",
+            "image/webp",
+        )
+        return mime_type in image_mime_types
+
+    def has_alpha(self, image) -> bool:
+        """Report whether the image's mode is ``RGBA`` or ``LA``."""
+        with Image.open(image) as opened:
+            mode = opened.mode
+        return mode in ("RGBA", "LA")
+
+    def remove_alpha(self, image_path: str) -> Path:
+        """
+        Run the configured convert binary with ``-alpha off`` on
+        ``image_path`` and return the path of the result, which is written to
+        ``image-no-alpha`` inside the parser's temp directory.
+        """
+        target = Path(self.tempdir) / "image-no-alpha"
+        command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, target]
+        run_subprocess(command, logger=self.log)
+        return target
+
+    def get_dpi(self, image) -> Optional[int]:
+        """
+        Return the rounded horizontal DPI stored in the image's info, or None
+        (after logging a warning) when it cannot be read.
+        """
+        try:
+            with Image.open(image) as opened:
+                horizontal_dpi, _vertical_dpi = opened.info["dpi"]
+                return round(horizontal_dpi)
+        except Exception as e:
+            self.log.warning(f"Error while getting DPI from image {image}: {e}")
+        return None
+
+    def calculate_a4_dpi(self, image) -> Optional[int]:
+        """
+        Estimate the DPI of ``image`` by assuming its width spans an A4 page.
+
+        Returns None (after logging a warning) when the image cannot be read.
+        """
+        # A4 width (210mm) expressed in inches.
+        a4_width_inches = 21 / 2.54
+        try:
+            with Image.open(image) as opened:
+                pixel_width, _pixel_height = opened.size
+                estimated = int(pixel_width / a4_width_inches)
+                self.log.debug(
+                    f"Estimated DPI {estimated} based on image width {pixel_width}",
+                )
+                return estimated
+        except Exception as e:
+            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
+        return None
+ # get ocr file img/pdf
+    def ocr_file(self, path_file):
+        """
+        Send ``path_file`` to the remote OCR service and return its result.
+
+        The file is uploaded first; the ``file_id`` taken from the upload
+        response is then passed to the OCR endpoint. Both requests carry the
+        configured ``ocr_key`` as a bearer token.
+
+        Returns the decoded JSON body of the OCR response, or None when no key
+        is configured, a request fails, a response is not valid JSON, or
+        either endpoint answers with a status other than 200.
+        """
+        config = ApplicationConfiguration.objects.filter().first()
+        # The configuration row may not exist yet and ocr_key is nullable.
+        access_token = config.ocr_key if config is not None else None
+        if not access_token:
+            self.log.error("ocr: no OCR key is configured")
+            return None
+
+        urls = settings.TCGROUP_OCR_CUSTOM["URL"]
+        headers = {"Authorization": f"Bearer {access_token}"}
+        # (connect, read) timeouts in seconds; without them a stalled server
+        # would block the calling task indefinitely.
+        timeout = (10, 300)
+
+        with open(path_file, "rb") as file:
+            file_data = file.read()
+
+        try:
+            response_upload = requests.post(
+                urls["URL_UPLOAD_FILE"],
+                files={"file": (Path(path_file).name, file_data)},
+                headers=headers,
+                timeout=timeout,
+            )
+            if response_upload.status_code != 200:
+                self.log.error(f"upload file: {response_upload.status_code}")
+                return None
+
+            file_id = response_upload.json().get("file_id", "")
+            if not file_id:
+                # Calling the OCR endpoint with an empty id cannot succeed.
+                self.log.error("upload file: response contains no file_id")
+                return None
+
+            response_ocr = requests.post(
+                urls["URL_OCR_BY_FILEID"],
+                headers=headers,
+                params={"file_id": file_id},
+                timeout=timeout,
+            )
+            if response_ocr.status_code != 200:
+                self.log.error(f"ocr: {response_ocr.text}")
+                return None
+            return response_ocr.json()
+        except (requests.RequestException, ValueError) as e:
+            # ValueError covers response bodies that are not valid JSON.
+            self.log.error(f"ocr: request failed: {e}")
+            return None
+
+
+    def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path):
+        """
+        OCR ``input_path`` through ``ocr_file()``, write the recognised text
+        to ``sidecar`` and build ``output_path``: a PDF in which the OCR
+        words are drawn first and the page image is drawn on top of them.
+
+        Returns without writing anything when ``ocr_file()`` yields no data.
+        """
+        font_name = "Arial"
+        data = self.ocr_file(input_path)
+        if not data:
+            return
+        font_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "fonts",
+            "arial-font/arial.ttf",
+        )
+        # OCR text is often non-ASCII; do not depend on the locale encoding.
+        with open(sidecar, "w", encoding="utf-8") as txt_sidecar:
+            txt_sidecar.write(data.get("content") or "")
+
+        # Register the font once instead of once per page.
+        pdfmetrics.registerFont(TTFont(font_name, font_path))
+        ocr_pages = data.get("pages") or []
+
+        if self.is_image(mime_type):
+            with Image.open(input_path) as img:
+                width, height = img.size
+            pdf_canvas = canvas.Canvas(str(output_path), pagesize=(width, height))
+            for ocr_page in ocr_pages:
+                self._draw_ocr_words(
+                    pdf_canvas,
+                    ocr_page.get("blocks", []),
+                    font_name,
+                    page_top=height,
+                    x_scale=1,
+                    y_scale=1,
+                    floor_font_size=True,
+                    baseline_divisor=2,
+                )
+            # Drawn after the text, so the image lies on top of the words.
+            pdf_canvas.drawImage(input_path, 0, 0, width=width, height=height)
+            pdf_canvas.save()
+            return
+
+        input_pdf = PdfReader(input_path)
+        # len(reader.pages) replaces getNumPages(), which PyPDF2 3.x removed;
+        # the former "+ 1" asked for a page past the end of the document.
+        images = convert_from_path(
+            input_path,
+            first_page=1,
+            last_page=len(input_pdf.pages),
+        )
+        pdf_canvas = canvas.Canvas(str(output_path), pagesize=letter)
+        for page_num, page in enumerate(input_pdf.pages):
+            page_width = float(page.mediabox[2])
+            page_height = float(page.mediabox[3])
+            pdf_canvas.setPageSize((page_width, page_height))
+
+            # The OCR result may hold fewer pages than the PDF; such pages
+            # still get their image, just without text.
+            if page_num < len(ocr_pages):
+                ocr_page = ocr_pages[page_num]
+                height_api_img = ocr_page["dimensions"][0]
+                width_api_img = ocr_page["dimensions"][1]
+                self._draw_ocr_words(
+                    pdf_canvas,
+                    ocr_page.get("blocks", []),
+                    font_name,
+                    page_top=int(page_height),
+                    # OCR coordinates are divided by these ratios to map them
+                    # onto the PDF page size.
+                    x_scale=width_api_img / page_width,
+                    y_scale=height_api_img / page_height,
+                    floor_font_size=False,
+                    baseline_divisor=3,
+                )
+
+            byte_image = io.BytesIO()
+            images[page_num].save(byte_image, format="JPEG")
+            byte_image.seek(0)
+            pdf_canvas.drawImage(
+                ImageReader(byte_image),
+                0,
+                0,
+                width=page_width,
+                height=page_height,
+            )
+            pdf_canvas.showPage()
+        pdf_canvas.save()
+
+    def _draw_ocr_words(
+        self,
+        pdf_canvas,
+        blocks,
+        font_name,
+        page_top,
+        x_scale,
+        y_scale,
+        floor_font_size,
+        baseline_divisor,
+    ):
+        """
+        Draw every word of the OCR ``blocks`` onto ``pdf_canvas``.
+
+        Bounding-box coordinates are divided by ``x_scale`` / ``y_scale``;
+        the vertical position is ``page_top`` minus the scaled y value. The
+        font size is derived from the line height and lowered by
+        ``font_size / baseline_divisor`` when placing the text.
+        """
+        for block in blocks:
+            for line in block.get("lines", []):
+                line_bbox = line.get("bbox")
+                y1 = line_bbox[0][1] / y_scale
+                y2 = line_bbox[1][1] / y_scale
+                # NOTE(review): 72 / 96 looks like a pixel-to-point
+                # conversion assuming 96 DPI - confirm.
+                font_size = (y2 - y1) * 72 / 96
+                if floor_font_size:
+                    font_size = math.floor(font_size)
+                y_center_coordinates = y2 - (y2 - y1) / 2
+                for word in line.get("words", []):
+                    x1 = word["bbox"][0][0] / x_scale
+                    x2 = word["bbox"][1][0] / x_scale
+                    value = word["value"]
+                    x_center_coordinates = x2 - (x2 - x1) / 2
+                    text_width = pdf_canvas.stringWidth(value, font_name, font_size)
+                    pdf_canvas.setFont(font_name, font_size)
+                    # Centre the word horizontally on its bounding box.
+                    pdf_canvas.drawString(
+                        x_center_coordinates - text_width / 2,
+                        page_top - y_center_coordinates - (font_size / baseline_divisor),
+                        value,
+                    )
+
+
+
+
+    def ocr_img_or_pdf(self, document_path, mime_type, sidecar, output_file, **kwargs):
+        """
+        Forward to ``render_pdf_ocr``; any extra keyword arguments are
+        accepted and ignored.
+        """
+        self.render_pdf_ocr(
+            sidecar,
+            mime_type,
+            document_path,
+            output_file,
+        )
+
+
+    def extract_text(
+        self,
+        sidecar_file: Optional[Path],
+        pdf_file: Path,
+    ) -> Optional[str]:
+        """
+        Return the post-processed text of a document.
+
+        The sidecar file is used when it exists, is complete and the OCR mode
+        is not "redo"; otherwise the text is read from ``pdf_file`` with
+        ``pdftotext``. Returns None when ``pdf_file`` is not a file or
+        ``pdftotext`` fails.
+        """
+        # When re-doing OCR, the sidecar contains ONLY the new text, not
+        # the whole text, so do not utilize it in that case
+        sidecar_usable = (
+            sidecar_file is not None
+            and os.path.isfile(sidecar_file)
+            and self.settings.mode != "redo"
+        )
+        if sidecar_usable:
+            sidecar_text = self.read_file_handle_unicode_errors(sidecar_file)
+
+            if "[OCR skipped on page" in sidecar_text:
+                # This happens when there's already text in the input file.
+                # The sidecar file will only contain text for OCR'ed pages.
+                self.log.debug("Incomplete sidecar file: discarding.")
+            else:
+                self.log.debug("Using text from sidecar file")
+                return post_process_text(sidecar_text)
+
+        # no success with the sidecar file, try PDF
+        if not os.path.isfile(pdf_file):
+            return None
+
+        pdftotext_options = ["pdftotext", "-q", "-layout", "-enc", "UTF-8"]
+        try:
+            pdf_text = None
+            with tempfile.NamedTemporaryFile(
+                mode="w+",
+                dir=self.tempdir,
+            ) as tmp:
+                run_subprocess(
+                    [*pdftotext_options, pdf_file, tmp.name],
+                    logger=self.log,
+                )
+                pdf_text = self.read_file_handle_unicode_errors(Path(tmp.name))
+
+            return post_process_text(pdf_text)
+
+        except Exception:
+            # If pdftotext fails, fall back to OCR.
+            self.log.warning(
+                "Error while getting text from PDF document with pdftotext",
+                exc_info=True,
+            )
+            # probably not a PDF file.
+            return None
+
+ def construct_ocrmypdf_parameters(
+ self,
+ input_file,
+ mime_type,
+ output_file,
+ sidecar_file,
+ safe_fallback=False,
+ ):
+ """
+ Build the dict of OCRmyPDF-style keyword arguments for one OCR run
+ from ``self.settings`` and the given paths.
+
+ ``safe_fallback`` forces ``force_ocr`` regardless of the configured
+ mode. The ``sidecar`` key is only present when no page limit is set.
+ For image inputs an ``image_dpi`` is determined (detected, configured
+ or estimated) and an alpha channel is removed first.
+
+ Raises ParseError for an unknown OCR mode, or for an image for which
+ no DPI can be determined.
+ """
+ if TYPE_CHECKING:
+ assert isinstance(self.settings, OcrConfig)
+ ocrmypdf_args = {
+ "input_file": input_file,
+ "output_file": output_file,
+ # need to use threads, since this will be run in daemonized
+ # processes via the task library.
+ "use_threads": True,
+ "jobs": settings.THREADS_PER_WORKER,
+ "language": self.settings.language,
+ "output_type": self.settings.output_type,
+ "progress_bar": False,
+ }
+
+ if "pdfa" in ocrmypdf_args["output_type"]:
+ ocrmypdf_args["color_conversion_strategy"] = (
+ self.settings.color_conversion_strategy
+ )
+
+ # Exactly one of force_ocr / skip_text / redo_ocr is set below.
+ if self.settings.mode == ModeChoices.FORCE or safe_fallback:
+ ocrmypdf_args["force_ocr"] = True
+ elif self.settings.mode in {
+ ModeChoices.SKIP,
+ ModeChoices.SKIP_NO_ARCHIVE,
+ }:
+ ocrmypdf_args["skip_text"] = True
+ elif self.settings.mode == ModeChoices.REDO:
+ ocrmypdf_args["redo_ocr"] = True
+ else: # pragma: no cover
+ raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
+
+ if self.settings.clean == CleanChoices.CLEAN:
+ ocrmypdf_args["clean"] = True
+ elif self.settings.clean == CleanChoices.FINAL:
+ if self.settings.mode == ModeChoices.REDO:
+ ocrmypdf_args["clean"] = True
+ else:
+ # --clean-final is not compatible with --redo-ocr
+ ocrmypdf_args["clean_final"] = True
+
+ if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
+ # --deskew is not compatible with --redo-ocr
+ ocrmypdf_args["deskew"] = True
+
+ if self.settings.rotate:
+ ocrmypdf_args["rotate_pages"] = True
+ ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
+
+ if self.settings.pages is not None and self.settings.pages > 0:
+ ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
+ else:
+ # sidecar is incompatible with pages
+ ocrmypdf_args["sidecar"] = sidecar_file
+
+ if self.is_image(mime_type):
+ # This may be required, depending on the known information
+ maybe_override_pixel_limit()
+
+ dpi = self.get_dpi(input_file)
+ a4_dpi = self.calculate_a4_dpi(input_file)
+
+ if self.has_alpha(input_file):
+ self.log.info(
+ f"Removing alpha layer from {input_file} "
+ "for compatibility with img2pdf",
+ )
+ # Replace the input file with the non-alpha
+ ocrmypdf_args["input_file"] = self.remove_alpha(input_file)
+
+ # DPI precedence: value detected in the image, then the configured
+ # image_dpi, then the A4-based estimate.
+ if dpi:
+ self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
+ ocrmypdf_args["image_dpi"] = dpi
+ elif self.settings.image_dpi is not None:
+ ocrmypdf_args["image_dpi"] = self.settings.image_dpi
+ elif a4_dpi:
+ ocrmypdf_args["image_dpi"] = a4_dpi
+ else:
+ raise ParseError(
+ f"Cannot produce archive PDF for image {input_file}, "
+ f"no DPI information is present in this image and "
+ f"OCR_IMAGE_DPI is not set.",
+ )
+ if ocrmypdf_args["image_dpi"] < 70: # pragma: no cover
+ self.log.warning(
+ f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
+ )
+
+ # User-supplied arguments are merged last, so they override the values
+ # computed above.
+ if self.settings.user_args is not None:
+ try:
+ ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
+ except Exception as e:
+ self.log.warning(
+ f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
+ f"they will not be used. Error: {e}",
+ )
+
+ if (
+ self.settings.max_image_pixel is not None
+ and self.settings.max_image_pixel >= 0
+ ):
+ # Convert pixels to mega-pixels and provide to ocrmypdf
+ max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
+ msg = (
+ "OCR pixel limit is disabled!"
+ if max_pixels_mpixels == 0
+ else f"Calculated {max_pixels_mpixels} megapixels for OCR"
+ )
+ self.log.debug(msg)
+ ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
+
+ return ocrmypdf_args
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        """
+        Extract the text of ``document_path`` into ``self.text`` and, where
+        configured, record the generated archive PDF in ``self.archive_path``.
+
+        PDFs that already contain text are returned as-is when the settings
+        ask to skip the archive. Otherwise the document is sent through
+        ``ocr_img_or_pdf``; if that yields no text it is retried once, and as
+        a last resort any text of the original document is used.
+
+        Raises ParseError when OCR fails with an unexpected error.
+        """
+        # This forces tesseract to use one core per page.
+        os.environ["OMP_THREAD_LIMIT"] = "1"
+        VALID_TEXT_LENGTH = 50
+
+        if mime_type == "application/pdf":
+            text_original = self.extract_text(None, document_path)
+            original_has_text = (
+                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
+            )
+        else:
+            text_original = None
+            original_has_text = False
+
+        # If the original has text, and the user doesn't want an archive,
+        # we're done here
+        skip_archive_for_text = (
+            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
+            or self.settings.skip_archive_file
+            in {
+                ArchiveFileChoices.WITH_TEXT,
+                ArchiveFileChoices.ALWAYS,
+            }
+        )
+        if skip_archive_for_text and original_has_text:
+            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
+            self.text = text_original
+            return
+
+        # Either no text was in the original or there should be an archive
+        # file created, so OCR the file and create an archive with any
+        # text located via OCR
+
+        # Only the exception types are needed; OCRmyPDF itself is not called.
+        from ocrmypdf import EncryptedPdfError
+        from ocrmypdf import InputFileError
+        from ocrmypdf import SubprocessOutputError
+
+        archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
+        sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))
+
+        # Still built for its validation (OCR mode, image DPI) and for the
+        # debug log; the OCR call below gets its paths explicitly.
+        args = self.construct_ocrmypdf_parameters(
+            document_path,
+            mime_type,
+            archive_path,
+            sidecar_file,
+        )
+
+        try:
+            self.log.debug(f"Calling OCRmyPDF with args: {args}")
+            self.log.info(f"document_path: {document_path}")
+            # Pass sidecar/output explicitly: ``args`` has no "sidecar" key
+            # when a page limit is configured, which made the call fail.
+            self.ocr_img_or_pdf(
+                document_path,
+                mime_type,
+                sidecar=sidecar_file,
+                output_file=archive_path,
+            )
+            # Only record the archive if it was actually written; nothing is
+            # produced when the OCR service returns no data.
+            if (
+                self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS
+                and archive_path.is_file()
+            ):
+                self.archive_path = archive_path
+
+            self.text = self.extract_text(sidecar_file, archive_path)
+
+            if not self.text:
+                raise NoTextFoundException("No text was found in the original document")
+        except EncryptedPdfError:
+            self.log.warning(
+                "This file is encrypted, OCR is impossible. Using "
+                "any text present in the original file.",
+            )
+            if original_has_text:
+                self.text = text_original
+        except SubprocessOutputError as e:
+            if "Ghostscript PDF/A rendering" in str(e):
+                self.log.warning(
+                    "Ghostscript PDF/A rendering failed, consider setting "
+                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
+                )
+
+            raise ParseError(
+                f"SubprocessOutputError: {e!s}. See logs for more information.",
+            ) from e
+        except (NoTextFoundException, InputFileError) as e:
+            self.log.warning(
+                f"Encountered an error while running OCR: {e!s}. "
+                f"Attempting force OCR to get the text.",
+            )
+
+            archive_path_fallback = Path(
+                os.path.join(self.tempdir, "archive-fallback.pdf"),
+            )
+            sidecar_file_fallback = Path(
+                os.path.join(self.tempdir, "sidecar-fallback.txt"),
+            )
+
+            # Attempt to run OCR with safe settings.
+
+            args = self.construct_ocrmypdf_parameters(
+                document_path,
+                mime_type,
+                archive_path_fallback,
+                sidecar_file_fallback,
+                safe_fallback=True,
+            )
+
+            try:
+                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
+                self.ocr_img_or_pdf(
+                    document_path,
+                    mime_type,
+                    sidecar=sidecar_file_fallback,
+                    output_file=archive_path_fallback,
+                )
+                # Don't return the archived file here, since this file
+                # is bigger and blurry due to --force-ocr.
+
+                self.text = self.extract_text(
+                    sidecar_file_fallback,
+                    archive_path_fallback,
+                )
+
+            except Exception as e:
+                # If this fails, we have a serious issue at hand.
+                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        except Exception as e:
+            # Anything else is probably serious.
+            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
+
+        # As a last resort, if we still don't have any text for any reason,
+        # try to extract the text from the original document.
+        if not self.text:
+            if original_has_text:
+                self.text = text_original
+            else:
+                self.log.warning(
+                    f"No text was found in {document_path}, the content will "
+                    f"be empty.",
+                )
+                self.text = ""
+
+
+def post_process_text(text):
+    """
+    Normalise whitespace in ``text``.
+
+    Runs of non-newline whitespace collapse to one space, whitespace after
+    line breaks and at the very end is dropped, the result is stripped and
+    NUL characters become spaces. Returns None for empty or missing text.
+    """
+    if not text:
+        return None
+
+    # Applied in this order: collapse spaces, trim line starts, trim the end.
+    substitutions = (
+        (r"([^\S\r\n]+)", " "),
+        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
+        (r"([^\S\n\r]+)$", ""),
+    )
+    cleaned = text
+    for pattern, replacement in substitutions:
+        cleaned = re.sub(pattern, replacement, cleaned)
+
+    # TODO: this needs a rework
+    # replace \0 prevents issues with saving to postgres.
+    # text may contain \0 when this character is present in PDF files.
+    return cleaned.strip().replace("\0", " ")
diff --git a/src/paperless_ocr_custom/signals.py b/src/paperless_ocr_custom/signals.py
new file mode 100644
index 000000000..d03450b5f
--- /dev/null
+++ b/src/paperless_ocr_custom/signals.py
@@ -0,0 +1,20 @@
+def get_parser(*args, **kwargs):
+    """
+    Build a ``RasterisedDocumentParser`` from the given arguments; the
+    parsers module is imported only when this function is called.
+    """
+    from paperless_ocr_custom import parsers
+
+    return parsers.RasterisedDocumentParser(*args, **kwargs)
+
+
+def tesseract_consumer_declaration(sender, **kwargs):
+    """
+    Signal receiver returning this app's parser declaration: the parser
+    factory, its weight and the mime types (with file extensions) it handles.
+    """
+    supported_mime_types = {
+        "application/pdf": ".pdf",
+        "image/jpeg": ".jpg",
+        "image/png": ".png",
+        "image/tiff": ".tif",
+        "image/gif": ".gif",
+        "image/bmp": ".bmp",
+        "image/webp": ".webp",
+    }
+    declaration = {
+        "parser": get_parser,
+        "weight": 1,
+        "mime_types": supported_mime_types,
+    }
+    return declaration