Update OCR feature

This commit is contained in:
otxtan@gmail.com
2024-05-22 14:17:12 +07:00
parent fcd079b2fe
commit 572506720f
15 changed files with 149 additions and 84 deletions

View File

@@ -36,7 +36,7 @@ from documents.models import Tag
from documents.models import Workflow
from documents.models import WorkflowAction
from documents.models import WorkflowTrigger
from documents.parsers import DocumentParser
from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import parse_date
@@ -557,7 +557,7 @@ class Consumer(LoggingMixin):
self.log.debug(f"Detected mime type: {mime_type}")
# Based on the mime type, get the parser for that type
parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type(
parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type(
mime_type,
)
if not parser_class:

View File

@@ -1,30 +0,0 @@
# Generated by Django 4.2.11 on 2024-05-15 04:18
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Create the ``Warehouse`` model.

    Depends on the swappable user model (for ``owner``) and on migration
    ``1046_workflowaction_remove_all_correspondents_and_more`` of the
    ``documents`` app.
    """

    dependencies = [
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
        ("documents", "1046_workflowaction_remove_all_correspondents_and_more"),
    ]

    operations = [
        migrations.CreateModel(
            name="Warehouse",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "name",
                    models.CharField(
                        max_length=256,
                        unique=True,
                        verbose_name="name",
                    ),
                ),
                (
                    # NOTE(review): this is a CharField, yet the choice keys
                    # and the default are integers -- confirm against the
                    # model definition before relying on the stored values.
                    "type",
                    models.CharField(
                        blank=True,
                        choices=[(1, "Warehouse"), (2, "Shelf"), (3, "Boxcase")],
                        default=1,
                        max_length=20,
                        null=True,
                    ),
                ),
                (
                    # The owner reference is nulled out when the user is deleted.
                    "owner",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        to=settings.AUTH_USER_MODEL,
                        verbose_name="owner",
                    ),
                ),
                (
                    # Self-referencing link; deleting a parent cascades to
                    # the rows that point at it.
                    "parent_warehouse",
                    models.ForeignKey(
                        blank=True,
                        null=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="parent_warehouses",
                        to="documents.warehouse",
                    ),
                ),
            ],
            options={
                "verbose_name": "warehouse",
                "verbose_name_plural": "warehouses",
            },
        ),
    ]

View File

@@ -14,11 +14,13 @@ from typing import Optional
from django.conf import settings
from django.utils import timezone
import requests
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
from documents.utils import run_subprocess
from paperless.models import ApplicationConfiguration
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
# Return the parser with the highest weight.
return best_parser["parser"]
def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
    """
    Select the parser class for ``mime_type``, gated by the configured OCR key.

    All parser declarations that support ``mime_type`` are ranked by weight.
    The highest-weight parser is returned only when an OCR key is stored in
    ``ApplicationConfiguration`` and the OCR endpoint does not answer the
    key with HTTP 401. In every other case (no configuration row, empty key,
    rejected key, network failure) the second-highest-weight parser is
    returned; if only one parser supports the mime type, that one is used.

    Returns None if no parser supports ``mime_type``.
    """
    options = [
        response[1]
        for response in document_consumer_declaration.send(None)
        if mime_type in response[1]["mime_types"]
    ]
    if not options:
        return None

    # Rank once instead of re-sorting for each candidate.
    ranked = sorted(
        options,
        key=lambda declaration: declaration["weight"],
        reverse=True,
    )
    preferred = ranked[0]
    # The original indexed [1] unconditionally, which raised IndexError when
    # a single parser matched; fall back to that only parser instead.
    fallback = ranked[1] if len(ranked) > 1 else preferred

    # .first() returns None when no configuration row exists, and the key
    # itself may be None or empty; neither is worth sending to the service.
    config = ApplicationConfiguration.objects.filter().first()
    ocr_key = getattr(config, "ocr_key", None)
    if not ocr_key:
        logger.debug("No OCR key configured, using parser %s", fallback["parser"])
        return fallback["parser"]

    url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
    try:
        # A timeout keeps document consumption from hanging forever on an
        # unresponsive OCR service.
        response_ocr = requests.post(
            url_ocr_pdf_by_fileid,
            headers={"Authorization": f"Bearer {ocr_key}"},
            timeout=10,
        )
    except requests.RequestException as exc:
        logger.warning(
            "OCR key check failed (%s), using parser %s",
            exc,
            fallback["parser"],
        )
        return fallback["parser"]

    logger.debug("status code: %s", response_ocr.status_code)
    # NOTE(review): any status other than 401 (including 5xx) is treated as a
    # valid key, as in the original code -- confirm this is intended.
    if response_ocr.status_code != 401:
        logger.debug("Successful key authentication ...")
        return preferred["parser"]

    logger.debug("Fail key authentication ... %s", fallback["parser"])
    return fallback["parser"]
def run_convert(
input_file,