Merge branch 'dev' of https://github.com/tienthienhd/tc-edoc into dev

2024-05-23 16:27:46 +07:00
parent fa7283cff1 771d9fb5f3
commit 54129f9db6
25 changed files with 852 additions and 23 deletions
--- a/.env
+++ b/.env
@@ -1,6 +1,7 @@
 COMPOSE_PROJECT_NAME=paperless
 PAPERLESS_DEBUG=true
 # PAPERLESS_REDIS=redis://localhost:6379
 PAPERLESS_REDIS=redis://:@123bytech@172.16.100.203:9377
 PAPERLESS_DBHOST=172.16.100.203
@@ -9,3 +10,7 @@ PAPERLESS_DBNAME=tc_edoc
 PAPERLESS_DBUSER=tc_edoc
 PAPERLESS_DBPASS=27M2MV58Re2Y
 PAPERLESS_DBSSLMODE=prefer
 URL_UPLOAD_FILE = https://ocr-core-api.tcgroup.vn/api/v1/file/upload
 URL_OCR_BY_FILEID = https://ocr-core-api.tcgroup.vn/api/v1/ocr/general
--- a/requirements.txt
+++ b/requirements.txt
@@ -123,5 +123,6 @@ whoosh==2.7.4
 wrapt==1.16.0; python_version >= '3.6'
 zstandard==0.22.0; python_version >= '3.8'
 zxing-cpp==2.2.0; platform_machine == 'x86_64' and python_version >= '3.6'
 PyPDF2<3.0
 python-decouple==3.8
--- a/src-ui/messages.xlf
+++ b/src-ui/messages.xlf
@@ -425,13 +425,6 @@
          <context context-type="linenumber">22</context>
        </context-group>
      </trans-unit>
      <trans-unit id="7991430199894172363" datatype="html">
        <source>Read the documentation about this setting</source>
        <context-group purpose="location">
          <context context-type="sourcefile">src/app/components/admin/config/config.component.html</context>
          <context context-type="linenumber">25</context>
        </context-group>
      </trans-unit>
      <trans-unit id="2180291763949669799" datatype="html">
        <source>Enable</source>
        <context-group purpose="location">
@@ -7217,18 +7210,25 @@
          <context context-type="linenumber">164</context>
        </context-group>
      </trans-unit>
      <trans-unit id="4301951240854951353" datatype="html">
        <source>OCR Key</source>
        <context-group purpose="location">
          <context context-type="sourcefile">src/app/data/paperless-config.ts</context>
          <context context-type="linenumber">171</context>
        </context-group>
      </trans-unit>
      <trans-unit id="7106327322456204362" datatype="html">
        <source>Application Logo</source>
        <context-group purpose="location">
          <context context-type="sourcefile">src/app/data/paperless-config.ts</context>
-          <context context-type="linenumber">171</context>
+          <context context-type="linenumber">178</context>
        </context-group>
      </trans-unit>
      <trans-unit id="2684743776608068095" datatype="html">
        <source>Application Title</source>
        <context-group purpose="location">
          <context context-type="sourcefile">src/app/data/paperless-config.ts</context>
-          <context context-type="linenumber">178</context>
+          <context context-type="linenumber">185</context>
        </context-group>
      </trans-unit>
      <trans-unit id="5948496158474272829" datatype="html">
--- a/src-ui/src/app/components/admin/config/config.component.html
+++ b/src-ui/src/app/components/admin/config/config.component.html
@@ -22,9 +22,9 @@
                                            <div class="card-title">
                                                <h6>
                                                    {{option.title}}
-                                                    <a class="btn btn-sm btn-link" title="Read the documentation about this setting" i18n-title [href]="getDocsUrl(option.config_key)" target="_blank" referrerpolicy="no-referrer">
+                                                    <!-- <a class="btn btn-sm btn-link" title="Read the documentation about this setting" i18n-title [href]="getDocsUrl(option.config_key)" target="_blank" referrerpolicy="no-referrer">
                                                        <i-bs  name="info-circle"></i-bs>
-                                                    </a>
+                                                    </a> -->
                                                </h6>
                                            </div>
                                            <div class="mb-n3">
--- a/src-ui/src/app/data/paperless-config.ts
+++ b/src-ui/src/app/data/paperless-config.ts
@@ -166,6 +166,13 @@ export const PaperlessConfigOptions: ConfigOption[] = [
    config_key: 'PAPERLESS_OCR_USER_ARGS',
    category: ConfigCategory.OCR,
  },
   {
    key: 'ocr_key',
    title: $localize`OCR Key`,
    type: ConfigOptionType.String,
    config_key: 'PAPERLESS_APP_TITLE', // NOTE(review): this key names the app title, not the OCR key; looks copy-pasted, confirm the intended config key
    category: ConfigCategory.OCR,
   },
  {
    key: 'app_logo',
    title: $localize`Application Logo`,
@@ -196,6 +203,7 @@ export interface PaperlessConfig extends ObjectWithId {
  max_image_pixels: number
  color_conversion_strategy: ColorConvertConfig
  user_args: object
  ocr_key: string
  app_logo: string
  app_title: string
 }
--- a/src-ui/src/locale/messages.vi_VN.xlf
+++ b/src-ui/src/locale/messages.vi_VN.xlf
@@ -497,7 +497,7 @@
          <context context-type="sourcefile">src/app/components/admin/config/config.component.html</context>
          <context context-type="linenumber">34</context>
        </context-group>
-        <target state="needs-translation">Enable</target>
+        <target state="needs-translation">Cho phép</target>
      </trans-unit>
      <trans-unit id="3823219296477075982" datatype="html">
        <source>Discard</source>
@@ -4587,7 +4587,7 @@
          <context context-type="sourcefile">src/app/components/common/input/switch/switch.component.html</context>
          <context context-type="linenumber">39</context>
        </context-group>
-        <target state="needs-translation">Note: value has not yet been set and will not apply until explicitly changed</target>
+        <target state="needs-translation">Lưu ý: giá trị chưa được đặt và sẽ không áp dụng cho đến khi thay đổi rõ ràng</target>
      </trans-unit>
      <trans-unit id="6560126119609945418" datatype="html">
        <source>Add tag</source>
@@ -5970,7 +5970,7 @@
          <context context-type="sourcefile">src/app/components/document-detail/document-detail.component.ts</context>
          <context context-type="linenumber">724</context>
        </context-group>
-        <target state="needs-translation">Do you really want to delete document "<x id="PH" equiv-text="this.document.title"/>"?</target>
+        <target state="needs-translation">Bạn có thực sự muốn xóa tài liệu "<x id="PH" equiv-text="this.document.title"/>"?</target>
      </trans-unit>
      <trans-unit id="6691075929777935948" datatype="html">
        <source>The files for this document will be deleted permanently. This operation cannot be undone.</source>
@@ -5978,7 +5978,7 @@
          <context context-type="sourcefile">src/app/components/document-detail/document-detail.component.ts</context>
          <context context-type="linenumber">725</context>
        </context-group>
-        <target state="needs-translation">The files for this document will be deleted permanently. This operation cannot be undone.</target>
+        <target state="needs-translation">Các tập tin cho tài liệu này sẽ bị xóa vĩnh viễn. Không thể hoàn tác thao tác này.</target>
      </trans-unit>
      <trans-unit id="719892092227206532" datatype="html">
        <source>Delete document</source>
@@ -5986,7 +5986,7 @@
          <context context-type="sourcefile">src/app/components/document-detail/document-detail.component.ts</context>
          <context context-type="linenumber">727</context>
        </context-group>
-        <target state="needs-translation">Delete document</target>
+        <target state="needs-translation">Xóa tài liệu</target>
      </trans-unit>
      <trans-unit id="7295637485862454066" datatype="html">
        <source>Error deleting document</source>
@@ -6724,7 +6724,7 @@
          <context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
          <context context-type="linenumber">146</context>
        </context-group>
-        <target state="needs-translation">Sort by correspondent</target>
+        <target state="needs-translation">Sắp xếp theo người biên tập</target>
      </trans-unit>
      <trans-unit id="2066713941761361709" datatype="html">
        <source>Sort by title</source>
@@ -7132,7 +7132,7 @@
          <context context-type="sourcefile">src/app/components/manage/correspondent-list/correspondent-list.component.ts</context>
          <context context-type="linenumber">67</context>
        </context-group>
-        <target state="needs-translation">Do you really want to delete the correspondent "<x id="PH" equiv-text="object.name"/>"?</target>
+        <target state="needs-translation">Bạn có thực sự muốn xóa người biên tập "<x id="PH" equiv-text="object.name"/>"?</target>
      </trans-unit>
      <trans-unit id="8384138406252790442" datatype="html">
        <source>Customize the data fields that can be attached to documents.</source>
@@ -7564,7 +7564,7 @@
          <context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.ts</context>
          <context context-type="linenumber">180</context>
        </context-group>
-        <target state="needs-translation">Successfully updated <x id="PH" equiv-text="this.typeName"/>.</target>
+        <target state="needs-translation">Đã cập nhật thành công <x id="PH" equiv-text="this.typeName"/>.</target>
      </trans-unit>
      <trans-unit id="6442673774206210733" datatype="html">
        <source>Error occurred while saving <x id="PH" equiv-text="this.typeName"/>.</source>
@@ -7668,7 +7668,7 @@
          <context context-type="sourcefile">src/app/components/manage/tag-list/tag-list.component.ts</context>
          <context context-type="linenumber">53</context>
        </context-group>
-        <target state="needs-translation">Do you really want to delete the tag "<x id="PH" equiv-text="object.name"/>"?</target>
+        <target state="needs-translation">Bạn có thực sự muốn xóa thẻ "<x id="PH" equiv-text="object.name"/>"?</target>
      </trans-unit>
      <trans-unit id="1229748338333965418" datatype="html">
        <source>Use workflows to customize the behavior of TC GROUP when events &apos;trigger&apos; a workflow.</source>
@@ -7932,7 +7932,7 @@
          <context context-type="sourcefile">src/app/data/paperless-config.ts</context>
          <context context-type="linenumber">50</context>
        </context-group>
-        <target state="needs-translation">General Settings</target>
+        <target state="needs-translation">Cài đặt chung</target>
      </trans-unit>
      <trans-unit id="2762851116637676072" datatype="html">
        <source>OCR Settings</source>
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -36,7 +36,7 @@ from documents.models import Tag
 from documents.models import Workflow
 from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
-from documents.parsers import DocumentParser
+from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type
 from documents.parsers import ParseError
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import parse_date
@@ -557,7 +557,7 @@ class Consumer(LoggingMixin):
        self.log.debug(f"Detected mime type: {mime_type}")
        # Based on the mime type, get the parser for that type
-        parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type(
+        parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type(
            mime_type,
        )
        if not parser_class:
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -14,11 +14,13 @@ from typing import Optional
 from django.conf import settings
 from django.utils import timezone
 import requests
 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
 from paperless.models import ApplicationConfiguration
 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
    # Return the parser with the highest weight.
    return best_parser["parser"]
 def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
    """
    Returns the parser class to use for the given mimetype, or None if no
    parser declares support for it.

    Candidates are ranked by weight. The highest-weight parser is returned
    only when an OCR key is configured and the OCR endpoint does not answer
    the probe request with HTTP 401; otherwise the second-ranked parser is
    returned. When only one parser supports the mimetype, that parser is
    returned regardless of the key.
    """
    options = [
        response[1]
        for response in document_consumer_declaration.send(None)
        if mime_type in response[1]["mime_types"]
    ]
    if not options:
        return None
    ranked = sorted(options, key=lambda declaration: declaration["weight"], reverse=True)
    # Fall back to the runner-up; with a single candidate there is no
    # runner-up, so indexing [1] unconditionally would raise IndexError.
    best_parser = ranked[1] if len(ranked) > 1 else ranked[0]
    app_config = ApplicationConfiguration.objects.filter().first()
    # The configuration row may not exist and ocr_key is nullable; treat both
    # like an empty key instead of sending "Bearer None".
    ocr_key = app_config.ocr_key if app_config is not None else None
    key_accepted = False
    if ocr_key:
        headers = {
            'Authorization': f'Bearer {ocr_key}'
        }
        url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
        try:
            # Probe request without a file: only the 401 status is inspected.
            # The timeout keeps an unreachable service from blocking consumption.
            response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, timeout=10)
        except requests.RequestException as e:
            logger.warning(f'Could not reach the OCR service to check the key: {e}')
        else:
            logger.debug(f'status code: {response_ocr.status_code}')
            key_accepted = response_ocr.status_code != 401
    if key_accepted:
        best_parser = ranked[0]
        logger.debug('Successful key authentication ...')
    else:
        logger.debug('Fail key authentication ... %s', best_parser["parser"])
    return best_parser["parser"]
 def run_convert(
    input_file,
--- a/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py
+++ b/src/paperless/migrations/0004_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
 # Generated by Django 4.2.11 on 2024-05-22 02:52
 from django.db import migrations, models
 class Migration(migrations.Migration):
    """Adds the optional ``ocr_key`` char field (48 characters max) to ``applicationconfiguration``."""
    dependencies = [('paperless', '0003_alter_applicationconfiguration_max_image_pixels')]
    operations = [
        migrations.AddField(
            model_name='applicationconfiguration',
            name='ocr_key',
            field=models.CharField(
                verbose_name='Sets key for advanced version',
                max_length=48,
                null=True,
                blank=True,
            ),
        ),
    ]
--- a/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py
+++ b/src/paperless/migrations/0005_alter_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
 # Generated by Django 4.2.11 on 2024-05-22 07:01
 from django.db import migrations, models
 class Migration(migrations.Migration):
    """Raises ``max_length`` of ``applicationconfiguration.ocr_key`` to 100."""
    dependencies = [('paperless', '0004_applicationconfiguration_ocr_key')]
    operations = [
        migrations.AlterField(
            model_name='applicationconfiguration',
            name='ocr_key',
            field=models.CharField(
                verbose_name='Sets key for advanced version',
                max_length=100,
                null=True,
                blank=True,
            ),
        ),
    ]
--- a/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py
+++ b/src/paperless/migrations/0006_alter_applicationconfiguration_ocr_key.py
@@ -0,0 +1,18 @@
 # Generated by Django 4.2.11 on 2024-05-22 07:03
 from django.db import migrations, models
 class Migration(migrations.Migration):
    """Raises ``max_length`` of ``applicationconfiguration.ocr_key`` to 200."""
    dependencies = [('paperless', '0005_alter_applicationconfiguration_ocr_key')]
    operations = [
        migrations.AlterField(
            model_name='applicationconfiguration',
            name='ocr_key',
            field=models.CharField(
                verbose_name='Sets key for advanced version',
                max_length=200,
                null=True,
                blank=True,
            ),
        ),
    ]
--- a/src/paperless/models.py
+++ b/src/paperless/models.py
@@ -184,6 +184,13 @@ class ApplicationConfiguration(AbstractSingletonModel):
        upload_to="logo/",
    )
    ocr_key = models.CharField(
        verbose_name=_("Sets key for advanced version"),
        null=True,
        blank=True,
        max_length=200,
    )
    class Meta:
        verbose_name = _("paperless application settings")
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -295,6 +295,7 @@ INSTALLED_APPS = [
    "paperless",
    "documents.apps.DocumentsConfig",
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_ocr_custom.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",
    "django.contrib.admin",
@@ -416,6 +417,14 @@ CHANNEL_LAYERS = {
    },
 }
 # Endpoints of the custom OCR service. Each entry is read from the environment
 # variable of the same name and falls back to the URL listed next to it.
 TCGROUP_OCR_CUSTOM = {
    "URL": {
        endpoint_name: os.getenv(endpoint_name, fallback_url)
        for endpoint_name, fallback_url in (
            ("URL_UPLOAD_FILE", "https://ocr-core-api.tcgroup.vn/api/v1/file/upload"),
            ("URL_OCR_BY_FILEID", "https://ocr-core-api.tcgroup.vn/api/v1/ocr/general"),
        )
    },
 }
 ###############################################################################
 # Security                                                                    #
 ###############################################################################
--- a/src/paperless_ocr_custom/init.py
+++ b/src/paperless_ocr_custom/init.py
@@ -0,0 +1,5 @@
 # this is here so that django finds the checks.
 from paperless_ocr_custom.checks import check_default_language_available
 from paperless_ocr_custom.checks import get_tesseract_langs
 __all__ = ["get_tesseract_langs", "check_default_language_available"]
--- a/src/paperless_ocr_custom/apps.py
+++ b/src/paperless_ocr_custom/apps.py
@@ -0,0 +1,14 @@
 from django.apps import AppConfig
 from paperless_ocr_custom.signals import tesseract_consumer_declaration
 class PaperlessTesseractConfig(AppConfig):
    """App config of ``paperless_ocr_custom``; connects its consumer declaration to the documents signal in ``ready()``."""
    name = "paperless_ocr_custom"
    def ready(self):
        # The signal is imported inside ready(), not at module import time.
        from documents.signals import document_consumer_declaration as declaration_signal
        declaration_signal.connect(tesseract_consumer_declaration)
        super().ready()
--- a/src/paperless_ocr_custom/checks.py
+++ b/src/paperless_ocr_custom/checks.py
@@ -0,0 +1,46 @@
 import shutil
 import subprocess
 from django.conf import settings
 from django.core.checks import Error
 from django.core.checks import Warning
 from django.core.checks import register
 def get_tesseract_langs():
    """
    Return the language codes printed by ``tesseract --list-langs``.

    The first output line (the header) is dropped and every remaining line is
    stripped. Returns an empty list when no ``tesseract`` executable is found
    on PATH.
    """
    tesseract_path = shutil.which("tesseract")
    if tesseract_path is None:
        # shutil.which() returns None when the binary is missing; passing that
        # to subprocess.run() would raise TypeError instead of a usable result.
        return []
    proc = subprocess.run(
        [tesseract_path, "--list-langs"],
        capture_output=True,
    )
    # Decode bytes to string, split on newlines, trim out the header
    proc_lines = proc.stdout.decode("utf8", errors="ignore").strip().split("\n")[1:]
    return [x.strip() for x in proc_lines]
@register()
def check_default_language_available(app_configs, **kwargs):
    """
    Django system check for the OCR language setting.

    Returns a single Warning when ``settings.OCR_LANGUAGE`` is empty, a single
    Error naming the first ``+``-separated language that tesseract does not
    list, and an empty list otherwise.
    """
    available = get_tesseract_langs()
    configured = settings.OCR_LANGUAGE
    if not configured:
        no_language_warning = Warning(
            "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. "
            "This means that tesseract will fallback to english.",
        )
        return [no_language_warning]
    # Only the first missing language is reported.
    lang = next(
        (candidate for candidate in configured.split("+") if candidate not in available),
        None,
    )
    if lang is None:
        return []
    return [
        Error(
            f"The selected ocr language {lang} is "
            f"not installed. Paperless cannot OCR your documents "
            f"without it. Please fix PAPERLESS_OCR_LANGUAGE.",
        ),
    ]
--- a/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf
+++ b/src/paperless_ocr_custom/fonts/arial-font/ARIBL0.ttf
--- a/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf
+++ b/src/paperless_ocr_custom/fonts/arial-font/ArialTh.ttf
--- a/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf
+++ b/src/paperless_ocr_custom/fonts/arial-font/Arialn.ttf
--- a/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF
+++ b/src/paperless_ocr_custom/fonts/arial-font/GEO_AI__.TTF
--- a/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF
+++ b/src/paperless_ocr_custom/fonts/arial-font/G_ari_bd.TTF
--- a/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF
+++ b/src/paperless_ocr_custom/fonts/arial-font/G_ari_i.TTF
--- a/src/paperless_ocr_custom/fonts/arial-font/arial.ttf
+++ b/src/paperless_ocr_custom/fonts/arial-font/arial.ttf
--- a/src/paperless_ocr_custom/parsers.py
+++ b/src/paperless_ocr_custom/parsers.py
@@ -0,0 +1,626 @@
 import io
 import logging
 import math
 import os
 import re
 import shutil
 import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING
 from typing import Optional
 from django.conf import settings
 import requests
 from PyPDF2 import PdfReader
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from PIL import Image
 from reportlab.pdfbase.ttfonts import TTFont
 from reportlab.pdfbase import pdfmetrics
 from pdf2image import convert_from_path
 from reportlab.lib.utils import ImageReader
 from documents.parsers import DocumentParser
 from documents.parsers import ParseError
 from documents.parsers import make_thumbnail_from_pdf
 from documents.utils import maybe_override_pixel_limit
 from documents.utils import run_subprocess
 from paperless.config import OcrConfig
 from paperless.models import ApplicationConfiguration, ArchiveFileChoices
 from paperless.models import CleanChoices
 from paperless.models import ModeChoices
 class NoTextFoundException(Exception):
    """Plain Exception subclass with no extra behaviour; presumably signals that no text was found (not raised in the visible code)."""
 class RtlLanguageException(Exception):
    """Plain Exception subclass with no extra behaviour; presumably relates to right-to-left languages (not raised in the visible code)."""
 class RasterisedDocumentParser(DocumentParser):
    """
    This parser sends a rasterised document, whether it's a PDF or another
    graphical format (JPEG, TIFF, etc.), to the OCR web service configured in
    settings.TCGROUP_OCR_CUSTOM and uses the text that service returns.
    """
    logging_name = "paperless.parsing.tesseract"
    def get_settings(self) -> OcrConfig:
        """
        Build and return a fresh OcrConfig, the configuration object this
        parser works with.
        """
        ocr_config = OcrConfig()
        return ocr_config
    def extract_metadata(self, document_path, mime_type):
        """
        Collect the metadata entries of a PDF document.

        Returns a list of dicts with the keys ``namespace``, ``prefix``,
        ``key`` and ``value``, one per metadata key of the form
        ``{namespace}name``. For any mime type other than
        ``application/pdf`` the list is empty. Entries that cannot be read
        are logged and skipped.
        """
        if mime_type != "application/pdf":
            return []
        import pikepdf
        result = []
        namespace_pattern = re.compile(r"\{(.*)\}(.*)")
        # Open through a context manager so the PDF is closed again once the
        # metadata has been read.
        with pikepdf.open(document_path) as pdf:
            meta = pdf.open_metadata()
            for key, value in meta.items():
                if isinstance(value, list):
                    value = " ".join([str(e) for e in value])
                value = str(value)
                try:
                    m = namespace_pattern.match(key)
                    if m is None:  # pragma: no cover
                        continue
                    namespace = m.group(1)
                    key_value = m.group(2)
                    try:
                        # Skip keys that cannot be encoded as UTF-8.
                        namespace.encode("utf-8")
                        key_value.encode("utf-8")
                    except UnicodeEncodeError as e:  # pragma: no cover
                        self.log.debug(f"Skipping metadata key {key}: {e}")
                        continue
                    result.append(
                        {
                            "namespace": namespace,
                            "prefix": meta.REVERSE_NS[namespace],
                            "key": key_value,
                            "value": value,
                        },
                    )
                except Exception as e:
                    self.log.warning(
                        f"Error while reading metadata {key}: {value}. Error: {e}",
                    )
        return result
    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Create the thumbnail with make_thumbnail_from_pdf, using the archive
        file when one is set and the original document otherwise.
        """
        source_pdf = self.archive_path or document_path
        return make_thumbnail_from_pdf(source_pdf, self.tempdir, self.logging_group)
    def is_image(self, mime_type) -> bool:
        """Tell whether ``mime_type`` is one of the image types listed below."""
        image_mime_types = (
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        )
        return mime_type in image_mime_types
    def has_alpha(self, image) -> bool:
        """Open ``image`` and report whether its mode is ``RGBA`` or ``LA``."""
        alpha_modes = ("RGBA", "LA")
        with Image.open(image) as im:
            image_mode = im.mode
        return image_mode in alpha_modes
    def remove_alpha(self, image_path: str) -> Path:
        """
        Run the configured convert binary with ``-alpha off`` on
        ``image_path`` and return the path of the result, which is written to
        ``image-no-alpha`` inside the parser's temp dir.
        """
        no_alpha_image = Path(self.tempdir) / "image-no-alpha"
        command = [settings.CONVERT_BINARY, "-alpha", "off", image_path, no_alpha_image]
        run_subprocess(command, logger=self.log)
        return no_alpha_image
    def get_dpi(self, image) -> Optional[int]:
        """
        Read the DPI stored in the image's info and return its first
        component, rounded. Any failure is logged as a warning and yields
        None.
        """
        try:
            with Image.open(image) as im:
                dpi_x, _dpi_y = im.info["dpi"]
            return round(dpi_x)
        except Exception as e:
            self.log.warning(f"Error while getting DPI from image {image}: {e}")
        return None
    def calculate_a4_dpi(self, image) -> Optional[int]:
        """
        Estimate the DPI by treating the image as being as wide as an A4
        page. Any failure is logged as a warning and yields None.
        """
        # A4 width (210mm) in inches.
        a4_width_inches = 21 / 2.54
        try:
            with Image.open(image) as im:
                width, _height = im.size
                dpi = int(width / a4_width_inches)
                self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
                return dpi
        except Exception as e:
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
        return None
    def ocr_file(self, path_file):
        """
        Run an image or PDF file through the remote OCR service.

        The file is POSTed to the ``URL_UPLOAD_FILE`` endpoint, then the
        ``file_id`` from that response is POSTed to the ``URL_OCR_BY_FILEID``
        endpoint. Both requests send the ``ocr_key`` stored in
        ApplicationConfiguration as a bearer token.

        Returns the decoded JSON body of the OCR response. Returns None, after
        logging the reason, when no key is configured, a request fails, the
        upload yields no ``file_id``, or an endpoint answers with a status
        other than 200.
        """
        app_config = ApplicationConfiguration.objects.filter().first()
        # The configuration row may be missing and ocr_key is nullable.
        access_token = app_config.ocr_key if app_config is not None else None
        if not access_token:
            self.log.error('ocr: no OCR key configured')
            return None
        url_upload_file = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_UPLOAD_FILE"]
        url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
        headers = {
            'Authorization': f'Bearer {access_token}'
        }
        with open(path_file, 'rb') as file:
            file_data = file.read()
        try:
            # upload file
            response_upload = requests.post(
                url_upload_file,
                files={'file': (os.path.basename(str(path_file)), file_data)},
                headers=headers,
            )
            if response_upload.status_code != 200:
                self.log.error(f'upload file: {response_upload.status_code}')
                return None
            file_id = response_upload.json().get('file_id', '')
            if not file_id:
                # Without a file id the OCR request below cannot succeed.
                self.log.error('upload file: response contains no file_id')
                return None
            # ocr by file_id
            response_ocr = requests.post(
                url_ocr_pdf_by_fileid,
                headers=headers,
                params={'file_id': file_id},
            )
            if response_ocr.status_code != 200:
                self.log.error(f'ocr: {response_ocr.text}')
                return None
            return response_ocr.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            self.log.error(f'ocr: request failed: {e}')
            return None
    def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path):
        """
        OCR ``input_path`` through ``ocr_file`` and write a PDF to
        ``output_path`` that contains the recognised words and the page image.

        The ``content`` of the OCR result is written to ``sidecar``. For image
        input a single page of the image's pixel size is produced. For any
        other input the file is read as a PDF and every page is rebuilt from
        its rendered image. On each page the words are drawn first and the
        image is drawn after them, i.e. on top of the text.

        Returns None. Nothing is written when ``ocr_file`` returns no data.
        """
        font_name = 'Arial'
        data = self.ocr_file(input_path)
        if not data:
            return
        font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fonts', 'arial-font/arial.ttf')
        # Write UTF-8 explicitly: the OCR text may hold non-ASCII characters
        # that the locale's default encoding cannot represent.
        with open(sidecar, "w", encoding="utf-8") as txt_sidecar:
            txt_sidecar.write(data.get("content") or "")
        # Register the font once instead of once per page.
        pdfmetrics.registerFont(TTFont(font_name, font_path))
        ocr_pages = data.get("pages") or []
        if self.is_image(mime_type):
            # Only the size is needed; close the image again right away.
            with Image.open(input_path) as img:
                width, height = img.size
            c = canvas.Canvas(str(output_path), pagesize=(width, height))
            for ocr_page in ocr_pages:
                # OCR coordinates are used unscaled as canvas units.
                self._draw_text_layer(c, ocr_page, font_name, height, 1.0, 1.0, True, 2)
            c.drawImage(input_path, 0, 0, width=width, height=height)
            c.save()
        else:
            # Kept from the original flow; the canvas below writes to the
            # same path when it is saved.
            shutil.copy(str(input_path), str(output_path))
            input_pdf = PdfReader(input_path)
            images = convert_from_path(input_path,
                                       first_page=1,
                                       last_page=len(input_pdf.pages))
            can = canvas.Canvas(str(output_path), pagesize=letter)
            for page_num, pdf_page in enumerate(input_pdf.pages):
                page_width = float(pdf_page.mediabox[2])
                page_height = float(pdf_page.mediabox[3])
                # set size new page
                can.setPageSize((page_width, page_height))
                # Pages missing from the OCR result get the image only.
                if page_num < len(ocr_pages):
                    ocr_page = ocr_pages[page_num]
                    # "dimensions" is read as (height, width) of the image the
                    # OCR coordinates refer to.
                    height_api_img = ocr_page["dimensions"][0]
                    width_api_img = ocr_page["dimensions"][1]
                    self._draw_text_layer(
                        can,
                        ocr_page,
                        font_name,
                        int(page_height),
                        width_api_img / page_width,
                        height_api_img / page_height,
                        False,
                        3,
                    )
                byte_image = io.BytesIO()
                images[page_num].save(byte_image, format='JPEG')
                byte_image.seek(0)
                can.drawImage(ImageReader(byte_image),
                              0, 0,
                              width=page_width,
                              height=page_height)
                can.showPage()
            can.save()
        return
    def _draw_text_layer(self, pdf_canvas, ocr_page, font_name, page_height, scale_x, scale_y, floor_font_size, baseline_divisor):
        """
        Draw every word of one OCR page onto ``pdf_canvas``.

        OCR coordinates are divided by ``scale_x`` / ``scale_y`` to get canvas
        units and flipped vertically against ``page_height``. The font size is
        derived from the line height (times 72/96) and floored when
        ``floor_font_size`` is true. Each word is centred horizontally on its
        own box and placed ``font_size / baseline_divisor`` below the vertical
        centre of its line.
        """
        for block in ocr_page.get("blocks", []):
            for line in block.get("lines", []):
                y1 = line["bbox"][0][1] / scale_y
                y2 = line["bbox"][1][1] / scale_y
                font_size = (y2 - y1) * 72 / 96
                if floor_font_size:
                    font_size = math.floor(font_size)
                y_center_coordinates = y2 - (y2 - y1) / 2
                for word in line.get("words", []):
                    x1 = word["bbox"][0][0] / scale_x
                    x2 = word["bbox"][1][0] / scale_x
                    value = word["value"]
                    x_center_coordinates = x2 - (x2 - x1) / 2
                    text_width = pdf_canvas.stringWidth(value, font_name, font_size)
                    pdf_canvas.setFont(font_name, font_size)
                    pdf_canvas.drawString(x_center_coordinates - text_width / 2,
                                          page_height - y_center_coordinates - (font_size / baseline_divisor),
                                          value)
    def ocr_img_or_pdf(self, document_path, mime_type, sidecar, output_file, **kwargs):
        """
        Run the custom OCR rendering step for a single document.

        This is a thin adapter so the method can be called with the keyword
        dictionary built for OCRmyPDF: ``sidecar`` and ``output_file`` are
        picked out by name, every other keyword lands in ``kwargs`` and is
        ignored.

        :param document_path: path of the document to process
        :param mime_type: mime type of the document
        :param sidecar: path handed to ``render_pdf_ocr`` as its first argument
        :param output_file: path handed to ``render_pdf_ocr`` as its last argument
        :param kwargs: accepted for call compatibility, not used
        :return: None
        """
        # render_pdf_ocr takes its arguments in a different order than this
        # method receives them.
        render = self.render_pdf_ocr
        render(sidecar, mime_type, document_path, output_file)
    def extract_text(
        self,
        sidecar_file: Optional[Path],
        pdf_file: Path,
    ) -> Optional[str]:
        """
        Return the text of a document, or None when no text could be read.

        The sidecar file is preferred when it exists, the OCR mode is not
        "redo" and the sidecar does not contain an "[OCR skipped on page"
        marker. Otherwise the text is extracted from ``pdf_file`` with the
        ``pdftotext`` command line tool.

        :param sidecar_file: optional path of the text sidecar written by OCR
        :param pdf_file: path of the PDF to fall back to
        :return: the post-processed text, or None if ``pdf_file`` does not
            exist or ``pdftotext`` extraction failed
        """
        # When re-doing OCR, the sidecar contains ONLY the new text, not
        # the whole text, so do not utilize it in that case
        if (
            sidecar_file is not None
            and os.path.isfile(sidecar_file)
            and self.settings.mode != "redo"
        ):
            text = self.read_file_handle_unicode_errors(sidecar_file)
            if "[OCR skipped on page" not in text:
                # This happens when there's already text in the input file.
                # The sidecar file will only contain text for OCR'ed pages.
                self.log.debug("Using text from sidecar file")
                return post_process_text(text)
            else:
                self.log.debug("Incomplete sidecar file: discarding.")

        # no success with the sidecar file, try PDF
        if not os.path.isfile(pdf_file):
            return None

        try:
            with tempfile.NamedTemporaryFile(
                mode="w+",
                dir=self.tempdir,
            ) as tmp:
                run_subprocess(
                    [
                        "pdftotext",
                        "-q",
                        "-layout",
                        "-enc",
                        "UTF-8",
                        pdf_file,
                        tmp.name,
                    ],
                    logger=self.log,
                )
                # The temporary file is removed as soon as the ``with`` block
                # exits, so its content has to be read while still inside it.
                text = self.read_file_handle_unicode_errors(Path(tmp.name))

            return post_process_text(text)
        except Exception:
            #  If pdftotext fails, fall back to OCR.
            self.log.warning(
                "Error while getting text from PDF document with pdftotext",
                exc_info=True,
            )
            # probably not a PDF file.
            return None
    def construct_ocrmypdf_parameters(
        self,
        input_file,
        mime_type,
        output_file,
        sidecar_file,
        safe_fallback=False,
    ):
        """
        Build the keyword dictionary describing one OCR run.

        The dictionary is derived from ``self.settings`` (mode, clean, deskew,
        rotate, pages, image DPI, user arguments, pixel limit) plus the given
        paths.

        :param input_file: document to OCR
        :param mime_type: mime type of ``input_file``; image types get extra
            DPI / alpha handling
        :param output_file: where the resulting PDF should be written
        :param sidecar_file: text sidecar path; only included when no page
            limit is configured
        :param safe_fallback: when true, force OCR regardless of the
            configured mode
        :return: dict of parameters
        :raises ParseError: for an unknown OCR mode, or for an image without
            any usable DPI information
        """
        if TYPE_CHECKING:
            assert isinstance(self.settings, OcrConfig)

        cfg = self.settings
        mode = cfg.mode

        params = {
            "input_file": input_file,
            "output_file": output_file,
            # need to use threads, since this will be run in daemonized
            # processes via the task library.
            "use_threads": True,
            "jobs": settings.THREADS_PER_WORKER,
            "language": cfg.language,
            "output_type": cfg.output_type,
            "progress_bar": False,
        }

        if "pdfa" in params["output_type"]:
            params["color_conversion_strategy"] = cfg.color_conversion_strategy

        # Exactly one of the mode switches is enabled.
        if safe_fallback or mode == ModeChoices.FORCE:
            mode_switch = "force_ocr"
        elif mode in {
            ModeChoices.SKIP,
            ModeChoices.SKIP_NO_ARCHIVE,
        }:
            mode_switch = "skip_text"
        elif mode == ModeChoices.REDO:
            mode_switch = "redo_ocr"
        else:  # pragma: no cover
            raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
        params[mode_switch] = True

        if cfg.clean == CleanChoices.CLEAN:
            params["clean"] = True
        elif cfg.clean == CleanChoices.FINAL:
            # --clean-final is not compatible with --redo-ocr
            clean_switch = "clean" if mode == ModeChoices.REDO else "clean_final"
            params[clean_switch] = True

        # --deskew is not compatible with --redo-ocr
        if cfg.deskew and mode != ModeChoices.REDO:
            params["deskew"] = True

        if cfg.rotate:
            params.update(
                rotate_pages=True,
                rotate_pages_threshold=cfg.rotate_threshold,
            )

        page_limit = cfg.pages
        if page_limit is not None and page_limit > 0:
            params["pages"] = f"1-{page_limit}"
        else:
            # sidecar is incompatible with pages
            params["sidecar"] = sidecar_file

        if self.is_image(mime_type):
            # This may be required, depending on the known information
            maybe_override_pixel_limit()

            detected_dpi = self.get_dpi(input_file)
            estimated_a4_dpi = self.calculate_a4_dpi(input_file)

            if self.has_alpha(input_file):
                self.log.info(
                    f"Removing alpha layer from {input_file} "
                    "for compatibility with img2pdf",
                )
                # Replace the input file with the non-alpha
                params["input_file"] = self.remove_alpha(input_file)

            # DPI precedence: embedded value, configured value, A4 estimate.
            if detected_dpi:
                self.log.debug(f"Detected DPI for image {input_file}: {detected_dpi}")
                chosen_dpi = detected_dpi
            elif cfg.image_dpi is not None:
                chosen_dpi = cfg.image_dpi
            elif estimated_a4_dpi:
                chosen_dpi = estimated_a4_dpi
            else:
                raise ParseError(
                    f"Cannot produce archive PDF for image {input_file}, "
                    f"no DPI information is present in this image and "
                    f"OCR_IMAGE_DPI is not set.",
                )
            params["image_dpi"] = chosen_dpi

            if chosen_dpi < 70:  # pragma: no cover
                self.log.warning(
                    f"Image DPI of {chosen_dpi} is low, OCR may fail",
                )

        extra_args = cfg.user_args
        if extra_args is not None:
            try:
                # User supplied arguments win over the computed ones.
                params = {**params, **extra_args}
            except Exception as e:
                self.log.warning(
                    f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                    f"they will not be used. Error: {e}",
                )

        pixel_cap = cfg.max_image_pixel
        if pixel_cap is not None and pixel_cap >= 0:
            # Convert pixels to mega-pixels and provide to ocrmypdf
            mega_pixels = pixel_cap / 1_000_000.0
            if mega_pixels == 0:
                msg = "OCR pixel limit is disabled!"
            else:
                msg = f"Calculated {mega_pixels} megapixels for OCR"
            self.log.debug(msg)
            params["max_image_mpixels"] = mega_pixels

        return params
    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Extract the text of a document and, where configured, produce an
        archive PDF for it.

        On return ``self.text`` holds the document text (an empty string if
        nothing could be found). ``self.archive_path`` is set when the first
        OCR pass succeeds and ``skip_archive_file`` is not ``ALWAYS``.

        :param document_path: path of the document to parse
        :param mime_type: mime type of the document
        :param file_name: not used by this implementation
        :raises ParseError: if OCR fails in a way that cannot be recovered by
            the forced-OCR fallback
        """
        # This forces tesseract to use one core per page.
        os.environ["OMP_THREAD_LIMIT"] = "1"
        VALID_TEXT_LENGTH = 50

        if mime_type == "application/pdf":
            text_original = self.extract_text(None, document_path)
            original_has_text = (
                text_original is not None and len(text_original) > VALID_TEXT_LENGTH
            )
        else:
            text_original = None
            original_has_text = False

        # If the original has text, and the user doesn't want an archive,
        # we're done here
        skip_archive_for_text = (
            self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
            or self.settings.skip_archive_file
            in {
                ArchiveFileChoices.WITH_TEXT,
                ArchiveFileChoices.ALWAYS,
            }
        )
        if skip_archive_for_text and original_has_text:
            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
            self.text = text_original
            return

        # Either no text was in the original or there should be an archive
        # file created, so OCR the file and create an archive with any
        # text located via OCR

        # Only the exception types are needed from ocrmypdf; the OCR itself
        # is performed by self.ocr_img_or_pdf.
        from ocrmypdf import EncryptedPdfError
        from ocrmypdf import InputFileError
        from ocrmypdf import SubprocessOutputError

        archive_path = Path(os.path.join(self.tempdir, "archive.pdf"))
        sidecar_file = Path(os.path.join(self.tempdir, "sidecar.txt"))

        args = self.construct_ocrmypdf_parameters(
            document_path,
            mime_type,
            archive_path,
            sidecar_file,
        )

        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            # The value must be passed through a %s placeholder; without one
            # the logging module cannot format the record and the message is
            # lost.
            self.log.info("gia tri document_path: %s", document_path)
            # NOTE(review): when a page limit is configured,
            # construct_ocrmypdf_parameters omits the "sidecar" key, while
            # ocr_img_or_pdf requires a ``sidecar`` argument - that call would
            # then fail and surface as ParseError below. Confirm intended.
            self.ocr_img_or_pdf(document_path, mime_type, **args)

            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path

            self.text = self.extract_text(sidecar_file, archive_path)

            if not self.text:
                raise NoTextFoundException("No text was found in the original document")
        except EncryptedPdfError:
            self.log.warning(
                "This file is encrypted, OCR is impossible. Using "
                "any text present in the original file.",
            )
            if original_has_text:
                self.text = text_original
        except SubprocessOutputError as e:
            if "Ghostscript PDF/A rendering" in str(e):
                self.log.warning(
                    "Ghostscript PDF/A rendering failed, consider setting "
                    "PAPERLESS_OCR_USER_ARGS: '{\"continue_on_soft_render_error\": true}'",
                )

            raise ParseError(
                f"SubprocessOutputError: {e!s}. See logs for more information.",
            ) from e
        except (NoTextFoundException, InputFileError) as e:
            self.log.warning(
                f"Encountered an error while running OCR: {e!s}. "
                f"Attempting force OCR to get the text.",
            )

            archive_path_fallback = Path(
                os.path.join(self.tempdir, "archive-fallback.pdf"),
            )
            sidecar_file_fallback = Path(
                os.path.join(self.tempdir, "sidecar-fallback.txt"),
            )

            # Attempt to run OCR with safe settings.
            args = self.construct_ocrmypdf_parameters(
                document_path,
                mime_type,
                archive_path_fallback,
                sidecar_file_fallback,
                safe_fallback=True,
            )

            try:
                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                self.ocr_img_or_pdf(document_path, mime_type, **args)
                # Don't return the archived file here, since this file
                # is bigger and blurry due to --force-ocr.
                self.text = self.extract_text(
                    sidecar_file_fallback,
                    archive_path_fallback,
                )
            except Exception as e:
                # If this fails, we have a serious issue at hand.
                raise ParseError(f"{e.__class__.__name__}: {e!s}") from e
        except Exception as e:
            # Anything else is probably serious.
            raise ParseError(f"{e.__class__.__name__}: {e!s}") from e

        # As a last resort, if we still don't have any text for any reason,
        # try to extract the text from the original document.
        if not self.text:
            if original_has_text:
                self.text = text_original
            else:
                self.log.warning(
                    f"No text was found in {document_path}, the content will "
                    f"be empty.",
                )
                self.text = ""
 def post_process_text(text):
    """
    Normalise whitespace in extracted text.

    Runs of horizontal whitespace are collapsed to a single space,
    whitespace directly following a line break is removed, whitespace at the
    end of the text is removed, the result is stripped and NUL characters
    are replaced by spaces.

    :param text: raw text, may be empty or None
    :return: the cleaned text, or None when ``text`` is empty or None
    """
    if not text:
        return None

    # (pattern, replacement) pairs, applied one after another.
    cleanup_steps = (
        # collapse runs of spaces / tabs (anything but line breaks)
        (r"([^\S\r\n]+)", " "),
        # drop whitespace that follows a line break
        (r"([\n\r]+)([^\S\n\r]+)", "\\1"),
        # drop whitespace at the end of the text
        (r"([^\S\n\r]+)$", ""),
    )
    cleaned = text
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    # TODO: this needs a rework
    # replace \0 prevents issues with saving to postgres.
    # text may contain \0 when this character is present in PDF files.
    return cleaned.strip().replace("\0", " ")
--- a/src/paperless_ocr_custom/signals.py
+++ b/src/paperless_ocr_custom/signals.py
@@ -0,0 +1,20 @@
 def get_parser(*args, **kwargs):
    """
    Create a ``RasterisedDocumentParser``.

    The parser class is imported when this function is called, not when the
    module is loaded. All positional and keyword arguments are passed to the
    parser's constructor unchanged.

    :return: a new ``RasterisedDocumentParser`` instance
    """
    from paperless_ocr_custom.parsers import RasterisedDocumentParser as parser_class

    parser = parser_class(*args, **kwargs)
    return parser
 def tesseract_consumer_declaration(sender, **kwargs):
    """
    Describe this parser: the factory to call, its weight and the mime types
    it handles.

    :param sender: not used
    :param kwargs: not used
    :return: dict with the keys ``parser``, ``weight`` and ``mime_types``;
        ``mime_types`` maps each supported mime type to a file extension
    """
    supported_types = dict(
        [
            ("application/pdf", ".pdf"),
            ("image/jpeg", ".jpg"),
            ("image/png", ".png"),
            ("image/tiff", ".tif"),
            ("image/gif", ".gif"),
            ("image/bmp", ".bmp"),
            ("image/webp", ".webp"),
        ],
    )
    declaration = {"parser": get_parser, "weight": 1}
    declaration["mime_types"] = supported_types
    return declaration