update feature ocr

2024-05-22 14:17:12 +07:00
parent fcd079b2fe
commit 572506720f
15 changed files with 149 additions and 84 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -36,7 +36,7 @@ from documents.models import Tag
 from documents.models import Workflow
 from documents.models import WorkflowAction
 from documents.models import WorkflowTrigger
-from documents.parsers import DocumentParser
+from documents.parsers import DocumentParser, custom_get_parser_class_for_mime_type
 from documents.parsers import ParseError
 from documents.parsers import get_parser_class_for_mime_type
 from documents.parsers import parse_date
@@ -557,7 +557,7 @@ class Consumer(LoggingMixin):
        self.log.debug(f"Detected mime type: {mime_type}")

        # Based on the mime type, get the parser for that type
-        parser_class: Optional[type[DocumentParser]] = get_parser_class_for_mime_type(
+        parser_class: Optional[type[DocumentParser]] = custom_get_parser_class_for_mime_type(
            mime_type,
        )
        if not parser_class:
--- a/src/documents/migrations/1047_warehouse.py
+++ b/src/documents/migrations/1047_warehouse.py
@@ -1,30 +0,0 @@
-# Generated by Django 4.2.11 on 2024-05-15 04:18
-
-from django.conf import settings
-from django.db import migrations, models
-import django.db.models.deletion
-
-
-class Migration(migrations.Migration):
-
-    dependencies = [
-        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
-        ('documents', '1046_workflowaction_remove_all_correspondents_and_more'),
-    ]
-
-    operations = [
-        migrations.CreateModel(
-            name='Warehouse',
-            fields=[
-                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('name', models.CharField(max_length=256, unique=True, verbose_name='name')),
-                ('type', models.CharField(blank=True, choices=[(1, 'Warehouse'), (2, 'Shelf'), (3, 'Boxcase')], default=1, max_length=20, null=True)),
-                ('owner', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL, verbose_name='owner')),
-                ('parent_warehouse', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='parent_warehouses', to='documents.warehouse')),
-            ],
-            options={
-                'verbose_name': 'warehouse',
-                'verbose_name_plural': 'warehouses',
-            },
-        ),
-    ]
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -14,11 +14,13 @@ from typing import Optional

 from django.conf import settings
 from django.utils import timezone
+import requests

 from documents.loggers import LoggingMixin
 from documents.signals import document_consumer_declaration
 from documents.utils import copy_file_with_basic_stats
 from documents.utils import run_subprocess
+from paperless.models import ApplicationConfiguration

 # This regular expression will try to find dates in the document at
 # hand and will match the following formats:
@@ -129,6 +131,38 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
    # Return the parser with the highest weight.
    return best_parser["parser"]

+def custom_get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
+    """
+    Return the parser class to use for ``mime_type``, or None if no parser
+    declares support for it.
+
+    The highest-weight parser is returned only when an OCR key is configured
+    and the configured OCR endpoint does not answer 401; otherwise the
+    runner-up is returned (or the sole candidate when there is only one).
+    """
+    options = [
+        declaration
+        for _, declaration in document_consumer_declaration.send(None)
+        if mime_type in declaration["mime_types"]
+    ]
+    if not options:
+        return None
+    ranked = sorted(options, key=lambda option: option["weight"], reverse=True)
+    # With a single candidate there is no runner-up, so indexing [1] would raise.
+    best_parser = ranked[1] if len(ranked) > 1 else ranked[0]
+    config = ApplicationConfiguration.objects.filter().first()
+    # first() yields None when no configuration row exists; treat that as "no key".
+    if config is not None and config.ocr_key:
+        headers = {"Authorization": f"Bearer {config.ocr_key}"}
+        url = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
+        # Bounded wait so an unresponsive endpoint cannot block consumption forever.
+        response_ocr = requests.post(url, headers=headers, timeout=30)
+        logger.debug(f"status code: {response_ocr.status_code}")
+        if response_ocr.status_code != 401:
+            logger.debug("Successful key authentication ...")
+            return ranked[0]["parser"]
+    logger.debug(f"Fail key authentication ... {best_parser['parser']}")
+    return best_parser["parser"]

 def run_convert(
    input_file,