diff --git a/.env b/.env index 68fa622f5..823e53076 100644 --- a/.env +++ b/.env @@ -10,8 +10,7 @@ PAPERLESS_DBNAME=tc_edoc PAPERLESS_DBUSER=tc_edoc PAPERLESS_DBPASS=27M2MV58Re2Y PAPERLESS_DBSSLMODE=prefer -OCR_CUSTOM_USERNAME = test -OCR_CUSTOM_PASSWORD = test -URL_LOGIN = https://ocr-core-api.tcgroup.vn/token + + URL_UPLOAD_FILE = https://ocr-core-api.tcgroup.vn/api/v1/file/upload URL_OCR_BY_FILEID = https://ocr-core-api.tcgroup.vn/api/v1/ocr/general diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index 685b5bb80..e35c15ec6 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -425,13 +425,6 @@ 22 - - Read the documentation about this setting - - src/app/components/admin/config/config.component.html - 25 - - Enable @@ -7217,18 +7210,25 @@ 164 + + OCR Key + + src/app/data/paperless-config.ts + 171 + + Application Logo src/app/data/paperless-config.ts - 171 + 178 Application Title src/app/data/paperless-config.ts - 178 + 185 diff --git a/src-ui/src/app/components/admin/config/config.component.html b/src-ui/src/app/components/admin/config/config.component.html index 03ca04b7b..71bb63e76 100644 --- a/src-ui/src/app/components/admin/config/config.component.html +++ b/src-ui/src/app/components/admin/config/config.component.html @@ -22,9 +22,9 @@
{{option.title}} - +
diff --git a/src-ui/src/app/data/paperless-config.ts b/src-ui/src/app/data/paperless-config.ts index 3ae485ff2..f6c203983 100644 --- a/src-ui/src/app/data/paperless-config.ts +++ b/src-ui/src/app/data/paperless-config.ts @@ -166,6 +166,13 @@ export const PaperlessConfigOptions: ConfigOption[] = [ config_key: 'PAPERLESS_OCR_USER_ARGS', category: ConfigCategory.OCR, }, + { + key: 'ocr_key', + title: $localize`OCR Key`, + type: ConfigOptionType.String, + config_key: 'PAPERLESS_APP_TITLE', + category: ConfigCategory.OCR, + }, { key: 'app_logo', title: $localize`Application Logo`, @@ -196,6 +203,7 @@ export interface PaperlessConfig extends ObjectWithId { max_image_pixels: number color_conversion_strategy: ColorConvertConfig user_args: object + ocr_key: string app_logo: string app_title: string } diff --git a/src-ui/src/locale/messages.vi_VN.xlf b/src-ui/src/locale/messages.vi_VN.xlf index 327cf13da..7c3ed0e08 100644 --- a/src-ui/src/locale/messages.vi_VN.xlf +++ b/src-ui/src/locale/messages.vi_VN.xlf @@ -497,7 +497,7 @@ src/app/components/admin/config/config.component.html 34 - Enable + Cho phép Discard @@ -4587,7 +4587,7 @@ src/app/components/common/input/switch/switch.component.html 39 - Note: value has not yet been set and will not apply until explicitly changed + Lưu ý: giá trị chưa được đặt và sẽ không áp dụng cho đến khi thay đổi rõ ràng Add tag @@ -5970,7 +5970,7 @@ src/app/components/document-detail/document-detail.component.ts 724 - Do you really want to delete document ""? + Bạn có thực sự muốn xóa tài liệu ""? The files for this document will be deleted permanently. This operation cannot be undone. @@ -5978,7 +5978,7 @@ src/app/components/document-detail/document-detail.component.ts 725 - The files for this document will be deleted permanently. This operation cannot be undone. + Các tập tin cho tài liệu này sẽ bị xóa vĩnh viễn. 
Không thể hoàn tác thao tác này Delete document @@ -5986,7 +5986,7 @@ src/app/components/document-detail/document-detail.component.ts 727 - Delete document + Xóa tài liệu Error deleting document @@ -6724,7 +6724,7 @@ src/app/components/document-list/document-list.component.html 146 - Sort by correspondent + Sắp xếp theo người biên tập Sort by title @@ -7132,7 +7132,7 @@ src/app/components/manage/correspondent-list/correspondent-list.component.ts 67 - Do you really want to delete the correspondent ""? + Bạn có thực sự muốn xóa người biên tập ""? Customize the data fields that can be attached to documents. @@ -7564,7 +7564,7 @@ src/app/components/manage/management-list/management-list.component.ts 180 - Successfully updated . + Đã cập nhật thành công. Error occurred while saving . @@ -7668,7 +7668,7 @@ src/app/components/manage/tag-list/tag-list.component.ts 53 - Do you really want to delete the tag ""? + Bạn có thực sự muốn xóa thẻ ""? Use workflows to customize the behavior of TC GROUP when events 'trigger' a workflow. 
@@ -7932,7 +7932,7 @@ src/app/data/paperless-config.ts 50 - General Settings + Cài đặt chung OCR Settings diff --git a/src/documents/consumer.py b/src/documents/consumer.py index c735ed4c8..fa3bf2e75 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -36,7 +36,7 @@ from documents.models import Tag from documents.models import Workflow from documents.models import WorkflowAction from documents.models import WorkflowTrigger -from documents.parsers import DocumentParser +from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type from documents.parsers import parse_date @@ -557,7 +557,7 @@ class Consumer(LoggingMixin): self.log.debug(f"Detected mime type: {mime_type}") # Based on the mime type, get the parser for that type - parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type( + parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type( mime_type, ) if not parser_class: diff --git a/src/documents/migrations/1047_warehouse.py b/src/documents/migrations/1047_warehouse.py deleted file mode 100644 index 1ac590460..000000000 --- a/src/documents/migrations/1047_warehouse.py +++ /dev/null @@ -1,30 +0,0 @@ -# Generated by Django 4.2.11 on 2024-05-15 04:18 - -from django.conf import settings -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ('documents', '1046_workflowaction_remove_all_correspondents_and_more'), - ] - - operations = [ - migrations.CreateModel( - name='Warehouse', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('name', models.CharField(max_length=256, unique=True, verbose_name='name')), - ('type', models.CharField(blank=True, choices=[(1, 
'Warehouse'), (2, 'Shelf'), (3, 'Boxcase')], default=1, max_length=20, null=True)), - ('owner', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL, verbose_name='owner')), - ('parent_warehouse', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='parent_warehouses', to='documents.warehouse')), - ], - options={ - 'verbose_name': 'warehouse', - 'verbose_name_plural': 'warehouses', - }, - ), - ] diff --git a/src/documents/parsers.py b/src/documents/parsers.py index d781ddb9f..3a43886d1 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -14,11 +14,13 @@ from typing import Optional from django.conf import settings from django.utils import timezone +import requests from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess +from paperless.models import ApplicationConfiguration # This regular expression will try to find dates in the document at # hand and will match the following formats: @@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar # Return the parser with the highest weight. 
def _ocr_key_is_authorized(ocr_key: Optional[str]) -> bool:
    """
    Report whether the remote OCR service accepts ``ocr_key`` as a bearer token.

    An empty or missing key is rejected without any network traffic.
    Otherwise an empty POST is sent to the configured
    ``URL_OCR_BY_FILEID`` endpoint and every status code other than 401 is
    treated as "accepted". Connection errors and timeouts are treated as
    "not accepted" so that the caller can fall back to another parser
    instead of aborting.
    """
    if not ocr_key:
        return False

    url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
    try:
        response_ocr = requests.post(
            url_ocr_pdf_by_fileid,
            headers={"Authorization": f"Bearer {ocr_key}"},
            # Never let an unreachable OCR service block document consumption.
            timeout=10,
        )
    except requests.RequestException as exc:
        logger.warning("OCR key check could not reach the OCR service: %s", exc)
        return False

    logger.debug("status code: %s", response_ocr.status_code)
    # The probe carries no payload, so only an explicit 401 is read as a
    # rejected key; any other status means the token itself was accepted.
    return response_ocr.status_code != 401


def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
    """
    Return the parser to use for ``mime_type``, or None if no parser exists.

    Parsers that declare support for the mime type are ranked by their
    ``weight`` (highest first):

    * no candidate: ``None`` is returned;
    * exactly one candidate: it is returned directly, without consulting the
      OCR key;
    * several candidates: the highest-weighted parser is returned only when
      the ``ocr_key`` stored in ``ApplicationConfiguration`` is accepted by
      the OCR service, otherwise the second-highest parser is returned.
    """
    options = [
        parser_declaration
        for _, parser_declaration in document_consumer_declaration.send(None)
        if mime_type in parser_declaration["mime_types"]
    ]
    if not options:
        return None

    ranked = sorted(
        options,
        key=lambda declaration: declaration["weight"],
        reverse=True,
    )
    if len(ranked) == 1:
        # There is nothing to fall back to, so the key check would not
        # change the outcome.
        return ranked[0]["parser"]

    # NOTE(review): this assumes the top-weighted candidate is the parser that
    # needs the OCR key -- confirm for mime types served by other parsers.
    config = ApplicationConfiguration.objects.filter().first()
    ocr_key = config.ocr_key if config is not None else None

    if _ocr_key_is_authorized(ocr_key):
        logger.debug("Successful key authentication ...")
        best_parser = ranked[0]
    else:
        best_parser = ranked[1]
        logger.debug("Fail key authentication ... using %s", best_parser["parser"])

    return best_parser["parser"]
Migration(migrations.Migration): + + dependencies = [ + ('paperless', '0005_alter_applicationconfiguration_ocr_key'), + ] + + operations = [ + migrations.AlterField( + model_name='applicationconfiguration', + name='ocr_key', + field=models.CharField(blank=True, max_length=200, null=True, verbose_name='Sets key for advanced version'), + ), + ] diff --git a/src/paperless/models.py b/src/paperless/models.py index 1f6cfbced..03b74bbc9 100644 --- a/src/paperless/models.py +++ b/src/paperless/models.py @@ -184,6 +184,13 @@ class ApplicationConfiguration(AbstractSingletonModel): upload_to="logo/", ) + ocr_key = models.CharField( + verbose_name=_("Sets key for advanced version"), + null=True, + blank=True, + max_length=200, + ) + class Meta: verbose_name = _("paperless application settings") diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 33e882101..a63c8931b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -294,7 +294,7 @@ INSTALLED_APPS = [ "django_extensions", "paperless", "documents.apps.DocumentsConfig", - # "paperless_tesseract.apps.PaperlessTesseractConfig", + "paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_ocr_custom.apps.PaperlessTesseractConfig", "paperless_text.apps.PaperlessTextConfig", "paperless_mail.apps.PaperlessMailConfig", @@ -419,12 +419,7 @@ CHANNEL_LAYERS = { # PAPERLESS_OCR_CUSTOM TCGROUP_OCR_CUSTOM = { - "ACCOUNT": { - "OCR_CUSTOM_USERNAME": os.getenv("OCR_CUSTOM_USERNAME", "test"), - "OCR_CUSTOM_PASSWORD": os.getenv("OCR_CUSTOM_PASSWORD", "test"), - }, "URL": { - "URL_LOGIN": os.getenv("URL_LOGIN","https://ocr-core-api.tcgroup.vn/token"), "URL_UPLOAD_FILE": os.getenv("URL_UPLOAD_FILE","https://ocr-core-api.tcgroup.vn/api/v1/file/upload"), "URL_OCR_BY_FILEID": os.getenv("URL_OCR_BY_FILEID","https://ocr-core-api.tcgroup.vn/api/v1/ocr/general"), } diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py index de1d7e2c3..3448151a9 100644 --- 
a/src/paperless_ocr_custom/parsers.py +++ b/src/paperless_ocr_custom/parsers.py @@ -9,20 +9,16 @@ from pathlib import Path from typing import TYPE_CHECKING from typing import Optional -import PyPDF2 from django.conf import settings import requests -from PyPDF2 import PdfFileWriter, PdfFileReader, PdfReader, PdfWriter +from PyPDF2 import PdfReader from reportlab.pdfgen import canvas from reportlab.lib.pagesizes import letter -from PIL import Image,ImageDraw,ImageFont -from reportlab.pdfgen.canvas import Canvas +from PIL import Image from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfbase import pdfmetrics from pdf2image import convert_from_path from reportlab.lib.utils import ImageReader -from reportlab.lib.styles import getSampleStyleSheet -from reportlab.platypus import Paragraph from documents.parsers import DocumentParser from documents.parsers import ParseError @@ -30,7 +26,7 @@ from documents.parsers import make_thumbnail_from_pdf from documents.utils import maybe_override_pixel_limit from documents.utils import run_subprocess from paperless.config import OcrConfig -from paperless.models import ArchiveFileChoices +from paperless.models import ApplicationConfiguration, ArchiveFileChoices from paperless.models import CleanChoices from paperless.models import ModeChoices @@ -155,21 +151,23 @@ class RasterisedDocumentParser(DocumentParser): # get ocr file img/pdf def ocr_file(self,path_file): # get text from api - ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"] - ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"] - url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"] - data = { - 'username': ocr_custom_username, - 'password': ocr_custom_password - } - response_login = requests.post(url_login, data=data) - access_token = '' - if response_login.status_code == 200: - response_data = response_login.json() - access_token = response_data.get('access_token','') - else: - 
logging.error('login: ', response_login.status_code) + # ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"] + # ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"] + # url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"] + # data = { + # 'username': ocr_custom_username, + # 'password': ocr_custom_password + # } + # response_login = requests.post(url_login, data=data) + # access_token = '' + # if response_login.status_code == 200: + # response_data = response_login.json() + # access_token = response_data.get('access_token','') + # else: + # logging.error('login: ', response_login.status_code) + k = ApplicationConfiguration.objects.filter().first() + access_token = k.ocr_key # upload file get_file_id = '' url_upload_file = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_UPLOAD_FILE"] diff --git a/src/paperless_ocr_custom/signals.py b/src/paperless_ocr_custom/signals.py index d924e0439..d03450b5f 100644 --- a/src/paperless_ocr_custom/signals.py +++ b/src/paperless_ocr_custom/signals.py @@ -7,7 +7,7 @@ def get_parser(*args, **kwargs): def tesseract_consumer_declaration(sender, **kwargs): return { "parser": get_parser, - "weight": 0, + "weight": 1, "mime_types": { "application/pdf": ".pdf", "image/jpeg": ".jpg",