update feature ocr
This commit is contained in:
@@ -36,7 +36,7 @@ from documents.models import Tag
|
||||
from documents.models import Workflow
|
||||
from documents.models import WorkflowAction
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import parse_date
|
||||
@@ -557,7 +557,7 @@ class Consumer(LoggingMixin):
|
||||
self.log.debug(f"Detected mime type: {mime_type}")
|
||||
|
||||
# Based on the mime type, get the parser for that type
|
||||
parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type(
|
||||
parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type(
|
||||
mime_type,
|
||||
)
|
||||
if not parser_class:
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
# Generated by Django 4.2.11 on 2024-05-15 04:18
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Create the ``Warehouse`` model in the ``documents`` app.

    A warehouse row has a unique name, an optional type, an optional owner
    (set to NULL when the user is deleted) and an optional self-referencing
    parent (children are deleted together with their parent).
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1046_workflowaction_remove_all_correspondents_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="Warehouse",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(
                        max_length=256,
                        unique=True,
                        verbose_name="name",
                    ),
                ),
                (
                    "type",
                    # NOTE(review): integer choices/default on a CharField —
                    # kept exactly as generated; confirm this is intended.
                    models.CharField(
                        blank=True,
                        choices=[(1, "Warehouse"), (2, "Shelf"), (3, "Boxcase")],
                        default=1,
                        max_length=20,
                        null=True,
                    ),
                ),
                (
                    "owner",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to=settings.AUTH_USER_MODEL,
                        verbose_name="owner",
                    ),
                ),
                (
                    "parent_warehouse",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="parent_warehouses",
                        to="documents.warehouse",
                    ),
                ),
            ],
            options={
                "verbose_name": "warehouse",
                "verbose_name_plural": "warehouses",
            },
        ),
    ]
|
||||
@@ -14,11 +14,13 @@ from typing import Optional
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
import requests
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.models import ApplicationConfiguration
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
|
||||
# Return the parser with the highest weight.
|
||||
return best_parser["parser"]
|
||||
|
||||
def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
    """
    Return the parser to use for the given mime type, gated by the OCR key.

    All parsers declaring support for ``mime_type`` are ranked by weight.
    The highest-weight parser is only returned when the ``ocr_key`` stored in
    the application configuration is accepted by the remote OCR service
    (any response other than HTTP 401). Otherwise the second-ranked parser
    is used as the fallback.

    When only one parser supports the mime type it is returned directly, with
    no key check, because there is nothing to fall back to.

    Returns None if no parser supports ``mime_type``.
    """
    options = [
        declaration
        for _receiver, declaration in document_consumer_declaration.send(None)
        if mime_type in declaration["mime_types"]
    ]

    if not options:
        return None

    # Rank once, best (highest weight) first.
    ranked = sorted(options, key=lambda option: option["weight"], reverse=True)

    # A single candidate leaves no choice to make; indexing [1] would raise.
    if len(ranked) == 1:
        return ranked[0]["parser"]

    preferred, fallback = ranked[0], ranked[1]

    # The configuration row may not exist yet and ocr_key is nullable, so
    # treat both "no row" and "no key" as "no key configured".
    config = ApplicationConfiguration.objects.filter().first()
    ocr_key = config.ocr_key if config is not None else None

    if ocr_key:
        url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
        headers = {
            "Authorization": f"Bearer {ocr_key}",
        }
        try:
            # Bounded wait: an unreachable OCR service must not hang or
            # abort document consumption; we simply fall back instead.
            response_ocr = requests.post(
                url_ocr_pdf_by_fileid,
                headers=headers,
                timeout=10,
            )
        except requests.RequestException as exc:
            logger.debug("OCR key check request failed: %s", exc)
        else:
            logger.debug("status code: %s", response_ocr.status_code)
            # Only an explicit 401 is treated as a rejected key.
            if response_ocr.status_code != 401:
                logger.debug("Successful key authentication ...")
                return preferred["parser"]

    logger.debug(
        "Fail key authentication ... falling back to %s",
        fallback["parser"],
    )
    return fallback["parser"]
|
||||
|
||||
def run_convert(
|
||||
input_file,
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.11 on 2024-05-22 02:52
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the optional ``ocr_key`` field (max 48 chars) to ApplicationConfiguration."""

    dependencies = [
        ("paperless", "0003_alter_applicationconfiguration_max_image_pixels"),
    ]

    operations = [
        migrations.AddField(
            model_name="applicationconfiguration",
            name="ocr_key",
            field=models.CharField(
                blank=True,
                max_length=48,
                null=True,
                verbose_name="Sets key for advanced version",
            ),
        ),
    ]
|
||||
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.11 on 2024-05-22 07:01
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Widen ``ApplicationConfiguration.ocr_key`` to 100 characters."""

    dependencies = [
        ("paperless", "0004_applicationconfiguration_ocr_key"),
    ]

    operations = [
        migrations.AlterField(
            model_name="applicationconfiguration",
            name="ocr_key",
            field=models.CharField(
                blank=True,
                max_length=100,
                null=True,
                verbose_name="Sets key for advanced version",
            ),
        ),
    ]
|
||||
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.11 on 2024-05-22 07:03
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Widen ``ApplicationConfiguration.ocr_key`` to 200 characters."""

    dependencies = [
        ("paperless", "0005_alter_applicationconfiguration_ocr_key"),
    ]

    operations = [
        migrations.AlterField(
            model_name="applicationconfiguration",
            name="ocr_key",
            field=models.CharField(
                blank=True,
                max_length=200,
                null=True,
                verbose_name="Sets key for advanced version",
            ),
        ),
    ]
|
||||
@@ -184,6 +184,13 @@ class ApplicationConfiguration(AbstractSingletonModel):
|
||||
upload_to="logo/",
|
||||
)
|
||||
|
||||
ocr_key = models.CharField(
|
||||
verbose_name=_("Sets key for advanced version"),
|
||||
null=True,
|
||||
blank=True,
|
||||
max_length=200,
|
||||
)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _("paperless application settings")
|
||||
|
||||
|
||||
@@ -294,7 +294,7 @@ INSTALLED_APPS = [
|
||||
"django_extensions",
|
||||
"paperless",
|
||||
"documents.apps.DocumentsConfig",
|
||||
# "paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_ocr_custom.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
@@ -419,12 +419,7 @@ CHANNEL_LAYERS = {
|
||||
|
||||
# PAPERLESS_OCR_CUSTOM
|
||||
TCGROUP_OCR_CUSTOM = {
|
||||
"ACCOUNT": {
|
||||
"OCR_CUSTOM_USERNAME": os.getenv("OCR_CUSTOM_USERNAME", "test"),
|
||||
"OCR_CUSTOM_PASSWORD": os.getenv("OCR_CUSTOM_PASSWORD", "test"),
|
||||
},
|
||||
"URL": {
|
||||
"URL_LOGIN": os.getenv("URL_LOGIN","https://ocr-core-api.tcgroup.vn/token"),
|
||||
"URL_UPLOAD_FILE": os.getenv("URL_UPLOAD_FILE","https://ocr-core-api.tcgroup.vn/api/v1/file/upload"),
|
||||
"URL_OCR_BY_FILEID": os.getenv("URL_OCR_BY_FILEID","https://ocr-core-api.tcgroup.vn/api/v1/ocr/general"),
|
||||
}
|
||||
|
||||
@@ -9,20 +9,16 @@ from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import Optional
|
||||
|
||||
import PyPDF2
|
||||
from django.conf import settings
|
||||
import requests
|
||||
from PyPDF2 import PdfFileWriter, PdfFileReader, PdfReader, PdfWriter
|
||||
from PyPDF2 import PdfReader
|
||||
from reportlab.pdfgen import canvas
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from PIL import Image,ImageDraw,ImageFont
|
||||
from reportlab.pdfgen.canvas import Canvas
|
||||
from PIL import Image
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from pdf2image import convert_from_path
|
||||
from reportlab.lib.utils import ImageReader
|
||||
from reportlab.lib.styles import getSampleStyleSheet
|
||||
from reportlab.platypus import Paragraph
|
||||
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
@@ -30,7 +26,7 @@ from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.utils import maybe_override_pixel_limit
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.models import ArchiveFileChoices
|
||||
from paperless.models import ApplicationConfiguration, ArchiveFileChoices
|
||||
from paperless.models import CleanChoices
|
||||
from paperless.models import ModeChoices
|
||||
|
||||
@@ -155,21 +151,23 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
# get ocr file img/pdf
|
||||
def ocr_file(self,path_file):
|
||||
# get text from api
|
||||
ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"]
|
||||
ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"]
|
||||
url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"]
|
||||
data = {
|
||||
'username': ocr_custom_username,
|
||||
'password': ocr_custom_password
|
||||
}
|
||||
response_login = requests.post(url_login, data=data)
|
||||
access_token = ''
|
||||
if response_login.status_code == 200:
|
||||
response_data = response_login.json()
|
||||
access_token = response_data.get('access_token','')
|
||||
else:
|
||||
logging.error('login: ', response_login.status_code)
|
||||
# ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"]
|
||||
# ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"]
|
||||
# url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"]
|
||||
# data = {
|
||||
# 'username': ocr_custom_username,
|
||||
# 'password': ocr_custom_password
|
||||
# }
|
||||
# response_login = requests.post(url_login, data=data)
|
||||
# access_token = ''
|
||||
# if response_login.status_code == 200:
|
||||
# response_data = response_login.json()
|
||||
# access_token = response_data.get('access_token','')
|
||||
# else:
|
||||
# logging.error('login: ', response_login.status_code)
|
||||
|
||||
k = ApplicationConfiguration.objects.filter().first()
|
||||
access_token = k.ocr_key
|
||||
# upload file
|
||||
get_file_id = ''
|
||||
url_upload_file = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_UPLOAD_FILE"]
|
||||
|
||||
@@ -7,7 +7,7 @@ def get_parser(*args, **kwargs):
|
||||
def tesseract_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 0,
|
||||
"weight": 1,
|
||||
"mime_types": {
|
||||
"application/pdf": ".pdf",
|
||||
"image/jpeg": ".jpg",
|
||||
|
||||
Reference in New Issue
Block a user