Messing with conversion of azure output to hocr

Working GCV
Rename to remote ocr
2024-03-08 22:28:14 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:18:02 -08:00 · 2024-03-08 21:17:27 -08:00
12 changed files with 1129 additions and 244 deletions
--- a/4
+++ b/4
@ -4,6 +4,8 @@ verify_ssl = true
 name = "pypi"

 [packages]
+azure-ai-formrecognizer = "*"
+boto3 = "*"
 dateparser = "~=1.2"
 # WARNING: django does not use semver.
 #          Only patch versions are guaranteed to not introduce breaking changes.
@ -27,6 +29,8 @@ channels-redis = "*"
 concurrent-log-handler = "*"
 filelock = "*"
 flower = "*"
+google-cloud-vision = "*"
+google-cloud-storage = "*"
 gotenberg-client = "*"
 gunicorn = "*"
 imap-tools = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -297,6 +297,7 @@ INSTALLED_APPS = [
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",
+    "paperless_remote.apps.PaperlessRemoteParserConfig",
    "django.contrib.admin",
    "rest_framework",
    "rest_framework.authtoken",
@ -1149,3 +1150,14 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
 if DEBUG:  # pragma: no cover
    EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
    EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
+
+###############################################################################
+# Remote Parser                                                               #
+###############################################################################
+
+REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
+REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
+REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
+REMOTE_OCR_API_KEY_ID = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY_ID")
+REMOTE_OCR_REGION = os.getenv("PAPERLESS_REMOTE_OCR_REGION")
+REMOTE_OCR_CREDENTIALS_FILE = os.getenv("PAPERLESS_REMOTE_OCR_CREDENTIALS_FILE")
--- a/src/paperless_remote/init.py
+++ b/src/paperless_remote/init.py
@ -0,0 +1,4 @@
+# this is here so that django finds the checks.
+from paperless_remote.checks import check_remote_parser_configured
+
+__all__ = ["check_remote_parser_configured"]
--- a/src/paperless_remote/apps.py
+++ b/src/paperless_remote/apps.py
@ -0,0 +1,14 @@
+from django.apps import AppConfig
+
+from paperless_remote.signals import remote_consumer_declaration
+
+
+class PaperlessRemoteParserConfig(AppConfig):
+    name = "paperless_remote"
+
+    def ready(self):
+        from documents.signals import document_consumer_declaration
+
+        document_consumer_declaration.connect(remote_consumer_declaration)
+
+        AppConfig.ready(self)
--- a/src/paperless_remote/checks.py
+++ b/src/paperless_remote/checks.py
@ -0,0 +1,39 @@
+from pathlib import Path
+
+from django.conf import settings
+from django.core.checks import Error
+from django.core.checks import register
+
+
+@register()
+def check_remote_parser_configured(app_configs, **kwargs):
+    if (
+        settings.REMOTE_OCR_ENGINE == "azureaivision"
+        and not settings.REMOTE_OCR_ENDPOINT
+    ):
+        return [
+            Error(
+                "Azure AI Vision remote parser requires endpoint to be configured.",
+            ),
+        ]
+
+    if settings.REMOTE_OCR_ENGINE == "awstextract" and (
+        not settings.REMOTE_OCR_API_KEY_ID or not settings.REMOTE_OCR_REGION
+    ):
+        return [
+            Error(
+                "AWS Textract remote parser requires access key ID and region to be configured.",
+            ),
+        ]
+
+    if settings.REMOTE_OCR_ENGINE == "googlecloudvision" and (
+        not settings.REMOTE_OCR_CREDENTIALS_FILE
+        or not Path(settings.REMOTE_OCR_CREDENTIALS_FILE).exists()
+    ):
+        return [
+            Error(
+                "Google Cloud Vision remote parser requires a valid credentials file to be configured.",
+            ),
+        ]
+
+    return []
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@ -0,0 +1,239 @@
+import json
+from pathlib import Path
+from typing import Optional
+
+from django.conf import settings
+
+from paperless_tesseract.parsers import RasterisedDocumentParser
+
+
+class RemoteEngineConfig:
+    def __init__(
+        self,
+        engine: str,
+        api_key: Optional[str] = None,
+        endpoint: Optional[str] = None,
+        api_key_id: Optional[str] = None,
+        region: Optional[str] = None,
+        credentials_file: Optional[str] = None,
+    ):
+        self.engine = engine
+        self.api_key = api_key
+        self.endpoint = endpoint
+        self.api_key_id = api_key_id
+        self.region = region
+        self.credentials_file = credentials_file
+
+    def engine_is_valid(self):
+        valid = (
+            self.engine in ["azureaivision", "awstextract", "googlecloudvision"]
+            and self.api_key is not None
+        )
+        if self.engine == "azureaivision":
+            valid = valid and self.endpoint is not None
+        if self.engine == "awstextract":
+            valid = valid and self.region is not None and self.api_key_id is not None
+        if self.engine == "googlecloudvision":
+            valid = self.credentials_file is not None
+        return valid
+
+
+class RemoteDocumentParser(RasterisedDocumentParser):
+    """
+    This parser uses a remote ocr engine to parse documents
+    """
+
+    logging_name = "paperless.parsing.remote"
+
+    def get_settings(self) -> RemoteEngineConfig:
+        """
+        This parser uses the OCR configuration settings to parse documents
+        """
+        return RemoteEngineConfig(
+            engine=settings.REMOTE_OCR_ENGINE,
+            api_key=settings.REMOTE_OCR_API_KEY,
+            endpoint=settings.REMOTE_OCR_ENDPOINT,
+            api_key_id=settings.REMOTE_OCR_API_KEY_ID,
+            region=settings.REMOTE_OCR_REGION,
+            credentials_file=settings.REMOTE_OCR_CREDENTIALS_FILE,
+        )
+
+    def supported_mime_types(self):
+        if self.settings.engine_is_valid():
+            if self.settings.engine == "googlecloudvision":
+                return [
+                    "application/pdf",
+                    "image/tiff",
+                ]
+            else:
+                return [
+                    "application/pdf",
+                    "image/png",
+                    "image/jpeg",
+                    "image/tiff",
+                    "image/bmp",
+                    "image/gif",
+                    "image/webp",
+                ]
+        else:
+            return []
+
+    def aws_textract_parse(
+        self,
+        file: Path,
+    ) -> Optional[str]:
+        import boto3
+
+        client = boto3.client(
+            "textract",
+            region_name=self.settings.region,
+            aws_access_key_id=self.settings.api_key_id,
+            aws_secret_access_key=self.settings.api_key,
+        )
+
+        lines = []
+        with open(file, "rb") as f:
+            file_bytes = f.read()
+            file_bytearray = bytearray(file_bytes)
+
+        self.log.info("Analyzing document with AWS Textract...")
+        response = client.analyze_document(
+            Document={"Bytes": file_bytearray},
+            FeatureTypes=["TABLES"],
+        )
+
+        blocks = response["Blocks"]
+        for block in blocks:
+            if block["BlockType"] == "LINE":
+                lines.append(block["Text"])
+
+        return "\n".join(lines)
+
+    def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
+        if not polygon:
+            return "0 0 0 0"
+        x_coordinates = [point.x for point in polygon]
+        y_coordinates = [point.y for point in polygon]
+        return f"{min(x_coordinates)} {min(y_coordinates)} {max(x_coordinates)} {max(y_coordinates)}"
+
+    def azure_ai_vision_parse(
+        self,
+        file: Path,
+    ) -> Optional[str]:
+        from azure.ai.formrecognizer import DocumentAnalysisClient
+        from azure.core.credentials import AzureKeyCredential
+
+        credential = AzureKeyCredential(self.settings.api_key)
+        document_analysis_client = DocumentAnalysisClient(
+            endpoint=self.settings.endpoint,
+            credential=credential,
+        )
+
+        with open(file, "rb") as f:
+            self.log.info("Analyzing document with Azure Vision AI...")
+            poller = document_analysis_client.begin_analyze_document(
+                "prebuilt-layout",
+                document=f,
+            )
+        result = poller.result()
+
+        hocr = "<html><body>"
+
+        for page_number, page in enumerate(result.pages, start=1):
+            hocr += f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">'
+
+            for idx, word in enumerate(page.words):
+                bbox = self.get_bbox_from_polygon(word.polygon)
+                hocr += f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{word.content}</span>'
+
+            hocr += "</div>"
+
+        hocr += "</body></html>"
+
+        self.log.info(f"HOCR output: {hocr}")
+
+        return result.content
+
+    def google_cloud_vision_parse(
+        self,
+        file: Path,
+        mime_type: str,
+    ) -> Optional[str]:
+        # https://cloud.google.com/vision/docs/pdf
+        from django.utils import timezone
+        from google.cloud import storage
+        from google.cloud import vision
+        from google.oauth2 import service_account
+
+        credentials = service_account.Credentials.from_service_account_file(
+            self.settings.credentials_file,
+        )
+
+        client = vision.ImageAnnotatorClient(credentials=credentials)
+        storage_client = storage.Client(credentials=credentials)
+
+        self.log.info("Uploading document to Google Cloud Storage...")
+        bucket_name = f"pngx_{credentials.project_id}_ocrstorage"
+        bucket = storage_client.lookup_bucket(bucket_name)
+        if bucket is None:
+            bucket = storage_client.create_bucket(bucket_name)
+
+        prefix = timezone.now().timestamp()
+        blob = bucket.blob(f"{prefix}/{file.name}")
+        blob.upload_from_filename(str(file))
+        gcs_source_uri = f"gs://{bucket_name}/{prefix}/{file.name}"
+        gcs_destination_uri = f"{gcs_source_uri}.json"
+
+        gcs_source = vision.GcsSource(uri=gcs_source_uri)
+        input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)
+
+        gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
+        output_config = vision.OutputConfig(
+            gcs_destination=gcs_destination,
+        )
+
+        self.log.info("Analyzing document with Google Cloud Vision...")
+        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
+        async_request = vision.AsyncAnnotateFileRequest(
+            features=[feature],
+            input_config=input_config,
+            output_config=output_config,
+        )
+
+        operation = client.async_batch_annotate_files(requests=[async_request])
+
+        self.log.info("Waiting for Google cloud operation to complete...")
+        operation.result(timeout=180)
+
+        # List objects with the given prefix, filtering out folders.
+        blob_list = [
+            blob
+            for blob in list(bucket.list_blobs(prefix=prefix))
+            if not blob.name.endswith("/")
+        ]
+        # second item is the json
+        output = blob_list[1]
+
+        json_string = output.download_as_bytes().decode("utf-8")
+        response = json.loads(json_string)
+
+        text = ""
+        for response in response["responses"]:
+            annotation = response["fullTextAnnotation"]
+            text += annotation["text"]
+
+        return text
+
+    def parse(self, document_path: Path, mime_type, file_name=None):
+        if not self.settings.engine_is_valid():
+            self.log.warning(
+                "No valid remote parser engine is configured, content will be empty.",
+            )
+            self.text = ""
+            return
+        elif self.settings.engine == "azureaivision":
+            self.text = self.azure_ai_vision_parse(document_path)
+        elif self.settings.engine == "awstextract":
+            self.text = self.aws_textract_parse(document_path)
+        elif self.settings.engine == "googlecloudvision":
+            self.text = self.google_cloud_vision_parse(document_path, mime_type)
--- a/src/paperless_remote/signals.py
+++ b/src/paperless_remote/signals.py
@ -0,0 +1,18 @@
+def get_parser(*args, **kwargs):
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(*args, **kwargs)
+
+
+def get_supported_mime_types():
+    from paperless_remote.parsers import RemoteDocumentParser
+
+    return RemoteDocumentParser(None).supported_mime_types()
+
+
+def remote_consumer_declaration(sender, **kwargs):
+    return {
+        "parser": get_parser,
+        "weight": 5,
+        "mime_types": get_supported_mime_types(),
+    }
--- a/src/paperless_remote/tests/init.py
+++ b/src/paperless_remote/tests/init.py
--- a/src/paperless_remote/tests/samples/simple-digital.pdf
+++ b/src/paperless_remote/tests/samples/simple-digital.pdf
--- a/src/paperless_remote/tests/test_checks.py
+++ b/src/paperless_remote/tests/test_checks.py
@ -0,0 +1,53 @@
+from django.test import TestCase
+from django.test import override_settings
+
+from paperless_remote import check_remote_parser_configured
+
+
+class TestChecks(TestCase):
+    @override_settings(REMOTE_OCR_ENGINE=None)
+    def test_no_engine(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
+
+    @override_settings(REMOTE_OCR_ENGINE="azureaivision")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    @override_settings(REMOTE_OCR_ENDPOINT=None)
+    def test_azure_no_endpoint(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Azure AI Vision remote parser requires endpoint to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_OCR_ENGINE="awstextract")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    @override_settings(REMOTE_OCR_API_KEY_ID=None)
+    @override_settings(REMOTE_OCR_REGION=None)
+    def test_aws_no_id_or_region(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "AWS Textract remote parser requires access key ID and region to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_OCR_ENGINE="googlecloudvision")
+    @override_settings(REMOTE_OCR_CREDENTIALS_FILE=None)
+    def test_gcv_no_creds_file(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 1)
+        self.assertTrue(
+            msgs[0].msg.startswith(
+                "Google Cloud Vision remote parser requires a valid credentials file to be configured.",
+            ),
+        )
+
+    @override_settings(REMOTE_OCR_ENGINE="something")
+    @override_settings(REMOTE_OCR_API_KEY="somekey")
+    def test_valid_configuration(self):
+        msgs = check_remote_parser_configured(None)
+        self.assertEqual(len(msgs), 0)
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@ -0,0 +1,176 @@
+import json
+import sys
+import uuid
+from pathlib import Path
+from unittest import mock
+
+import pytest
+from django.test import TestCase
+from django.test import override_settings
+
+from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
+from paperless_remote.parsers import RemoteDocumentParser
+
+
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
+    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
+
+    def assertContainsStrings(self, content, strings):
+        # Asserts that all strings appear in content, in the given order.
+        indices = []
+        for s in strings:
+            if s in content:
+                indices.append(content.index(s))
+            else:
+                self.fail(f"'{s}' is not in '{content}'")
+        self.assertListEqual(indices, sorted(indices))
+
+    @pytest.mark.skipif(
+        sys.version_info > (3, 10),
+        reason="Fails on 3.11 only on CI, for some reason",
+    )  # TODO: investigate
+    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
+    def test_get_text_with_azure(self, mock_azure_client):
+        result = mock.Mock()
+        result.content = "This is a test document."
+        result.pages = [
+            mock.Mock(
+                width=100,
+                height=100,
+                words=[
+                    mock.Mock(
+                        content="This",
+                        polygon=[
+                            mock.Mock(x=0, y=0),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="is",
+                        polygon=[
+                            mock.Mock(x=10, y=10),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="a",
+                        polygon=[
+                            mock.Mock(x=20, y=20),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="test",
+                        polygon=[
+                            mock.Mock(x=30, y=30),
+                        ],
+                    ),
+                    mock.Mock(
+                        content="document.",
+                        polygon=[
+                            mock.Mock(x=40, y=40),
+                        ],
+                    ),
+                ],
+            ),
+        ]
+
+        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
+            result
+        )
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="azureaivision",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )
+
+    @mock.patch("boto3.client")
+    def test_get_text_with_awstextract(self, mock_aws_client):
+        mock_aws_client.return_value.analyze_document.return_value = {
+            "Blocks": [
+                {
+                    "BlockType": "LINE",
+                    "Text": "This is a test document.",
+                },
+            ],
+        }
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="awstextract",
+            REMOTE_OCR_API_KEY="somekey",
+            REMOTE_OCR_API_KEY_ID="somekeyid",
+            REMOTE_OCR_REGION="us-west-2",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )
+
+    @mock.patch("google.cloud.vision.ImageAnnotatorClient")
+    @mock.patch("google.cloud.storage.Client")
+    @mock.patch("google.oauth2.service_account.Credentials.from_service_account_file")
+    def test_get_text_with_googlecloudvision(
+        self,
+        mock_credentials_from_file,
+        mock_gcs_client,
+        mock_gcv_client,
+    ):
+        credentials = mock.Mock()
+        credentials.project_id = "someproject"
+        mock_credentials_from_file.return_value = credentials
+
+        blob_mock0 = mock.Mock()
+        blob_mock0.name = "somefile.pdf"
+        blob_mock1 = mock.Mock()
+        blob_mock1.name = "somefile.json"
+
+        blob_mock1.download_as_bytes.return_value.decode.return_value = json.dumps(
+            {
+                "responses": [
+                    {
+                        "fullTextAnnotation": {
+                            "text": "This is a test document.",
+                        },
+                    },
+                ],
+            },
+        )
+
+        mock_gcs_client.return_value.lookup_bucket.return_value.list_blobs.return_value = [
+            blob_mock0,
+            blob_mock1,
+        ]
+
+        result = mock.Mock()
+        result.result = mock.Mock()
+        mock_gcv_client.return_value.async_batch_annotate_files.return_value = result
+
+        with override_settings(
+            REMOTE_OCR_ENGINE="googlecloudvision",
+            REMOTE_OCR_CREDENTIALS_FILE="somefile.json",
+        ):
+            parser = RemoteDocumentParser(uuid.uuid4())
+            parser.parse(
+                self.SAMPLE_FILES / "simple-digital.pdf",
+                "application/pdf",
+            )
+
+            self.assertContainsStrings(
+                parser.text.strip(),
+                ["This is a test document."],
+            )
Author	SHA1	Message	Date
shamoon	0e9d2f6831	Messing with conversion of azure output to hocr	2024-03-08 22:28:14 -08:00
shamoon	ec505e41fa	Working GCV	2024-03-08 21:19:15 -08:00
shamoon	24c40bbc5e	Rename to remote ocr	2024-03-08 21:19:15 -08:00
shamoon	fba4ce9147	Try deps update again	2024-03-08 21:19:15 -08:00
shamoon	a0c6d25d9a	Add aws textract, remove chatgpt	2024-03-08 21:18:02 -08:00
shamoon	6e7e40e7a2	Add (non-working) Google cloud vision	2024-03-08 21:17:27 -08:00
shamoon	eacafbcb36	Oh wow this works for azure, not chatgpt	2024-03-08 21:17:27 -08:00