Messing with conversion of azure output to hocr

Working GCV
Rename to remote ocr
2024-03-08 22:28:14 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:19:15 -08:00 · 2024-03-08 21:18:02 -08:00 · 2024-03-08 21:17:27 -08:00
12 changed files with 1129 additions and 244 deletions
--- a/Pipfile
+++ b/Pipfile
@ -4,6 +4,8 @@ verify_ssl = true
 name = "pypi"
 [packages]
 azure-ai-formrecognizer = "*"
 boto3 = "*"
 dateparser = "~=1.2"
 # WARNING: django does not use semver.
 #          Only patch versions are guaranteed to not introduce breaking changes.
@ -27,6 +29,8 @@ channels-redis = "*"
 concurrent-log-handler = "*"
 filelock = "*"
 flower = "*"
 google-cloud-vision = "*"
 google-cloud-storage = "*"
 gotenberg-client = "*"
 gunicorn = "*"
 imap-tools = "*"
--- a/Pipfile.lock
+++ b/Pipfile.lock
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@ -297,6 +297,7 @@ INSTALLED_APPS = [
    "paperless_tesseract.apps.PaperlessTesseractConfig",
    "paperless_text.apps.PaperlessTextConfig",
    "paperless_mail.apps.PaperlessMailConfig",
    "paperless_remote.apps.PaperlessRemoteParserConfig",
    "django.contrib.admin",
    "rest_framework",
    "rest_framework.authtoken",
@ -1149,3 +1150,14 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
 if DEBUG:  # pragma: no cover
    EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
    EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
 ###############################################################################
 # Remote Parser                                                               #
 ###############################################################################
 # Name of the remote OCR engine to use; unset disables the remote parser.
 REMOTE_OCR_ENGINE = os.environ.get("PAPERLESS_REMOTE_OCR_ENGINE")
 # Secret API key handed to the selected engine's client.
 REMOTE_OCR_API_KEY = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY")
 # Service endpoint URL of the remote engine.
 REMOTE_OCR_ENDPOINT = os.environ.get("PAPERLESS_REMOTE_OCR_ENDPOINT")
 # Identifier that accompanies the API key (access key ID).
 REMOTE_OCR_API_KEY_ID = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY_ID")
 # Region in which the remote engine is hosted.
 REMOTE_OCR_REGION = os.environ.get("PAPERLESS_REMOTE_OCR_REGION")
 # Path to a credentials file for engines that authenticate with one.
 REMOTE_OCR_CREDENTIALS_FILE = os.environ.get(
    "PAPERLESS_REMOTE_OCR_CREDENTIALS_FILE",
 )
--- a/src/paperless_remote/__init__.py
+++ b/src/paperless_remote/__init__.py
@ -0,0 +1,4 @@
 # Importing the check at package level makes sure Django finds it when the
 # app is loaded; __all__ re-exports it as this package's only public name.
 from paperless_remote.checks import check_remote_parser_configured
 __all__ = ["check_remote_parser_configured"]
--- a/src/paperless_remote/apps.py
+++ b/src/paperless_remote/apps.py
@ -0,0 +1,14 @@
 from django.apps import AppConfig
 from paperless_remote.signals import remote_consumer_declaration
 class PaperlessRemoteParserConfig(AppConfig):
    """
    Django app configuration for the remote OCR parser.
    """

    name = "paperless_remote"

    def ready(self):
        # Imported lazily, inside ready(), rather than at module import time.
        from documents.signals import document_consumer_declaration

        # Announce the remote parser to the document consumer.
        document_consumer_declaration.connect(remote_consumer_declaration)
        super().ready()
--- a/src/paperless_remote/checks.py
+++ b/src/paperless_remote/checks.py
@ -0,0 +1,39 @@
 from pathlib import Path
 from django.conf import settings
 from django.core.checks import Error
 from django.core.checks import register
@register()
 def check_remote_parser_configured(app_configs, **kwargs):
    if (
        settings.REMOTE_OCR_ENGINE == "azureaivision"
        and not settings.REMOTE_OCR_ENDPOINT
    ):
        return [
            Error(
                "Azure AI Vision remote parser requires endpoint to be configured.",
            ),
        ]
    if settings.REMOTE_OCR_ENGINE == "awstextract" and (
        not settings.REMOTE_OCR_API_KEY_ID or not settings.REMOTE_OCR_REGION
    ):
        return [
            Error(
                "AWS Textract remote parser requires access key ID and region to be configured.",
            ),
        ]
    if settings.REMOTE_OCR_ENGINE == "googlecloudvision" and (
        not settings.REMOTE_OCR_CREDENTIALS_FILE
        or not Path(settings.REMOTE_OCR_CREDENTIALS_FILE).exists()
    ):
        return [
            Error(
                "Google Cloud Vision remote parser requires a valid credentials file to be configured.",
            ),
        ]
    return []
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@ -0,0 +1,239 @@
 import json
 from pathlib import Path
 from typing import Optional
 from django.conf import settings
 from paperless_tesseract.parsers import RasterisedDocumentParser
 class RemoteEngineConfig:
    """
    Plain holder for the remote OCR engine name and its credentials.
    """

    def __init__(
        self,
        engine: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_key_id: Optional[str] = None,
        region: Optional[str] = None,
        credentials_file: Optional[str] = None,
    ):
        self.engine, self.api_key, self.endpoint = engine, api_key, endpoint
        self.api_key_id, self.region = api_key_id, region
        self.credentials_file = credentials_file

    def engine_is_valid(self):
        """
        Return True when the engine is known and its required values are set.

        - ``googlecloudvision`` needs only a credentials file.
        - ``azureaivision`` needs an API key and an endpoint.
        - ``awstextract`` needs an API key, a key ID and a region.
        """
        # Google authenticates through the credentials file alone; the API
        # key is deliberately not consulted for it.
        if self.engine == "googlecloudvision":
            return self.credentials_file is not None

        # Every other engine needs an API key before anything else matters.
        if self.api_key is None:
            return False

        if self.engine == "azureaivision":
            return self.endpoint is not None
        if self.engine == "awstextract":
            return self.region is not None and self.api_key_id is not None

        # Unknown engine name.
        return False
 class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote ocr engine to parse documents

    The engine and its credentials are read from the ``REMOTE_OCR_*`` Django
    settings. Supported engine names are ``azureaivision``, ``awstextract``
    and ``googlecloudvision``.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the engine configuration from the ``REMOTE_OCR_*`` settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
            api_key_id=settings.REMOTE_OCR_API_KEY_ID,
            region=settings.REMOTE_OCR_REGION,
            credentials_file=settings.REMOTE_OCR_CREDENTIALS_FILE,
        )

    def supported_mime_types(self):
        """
        Return the MIME types accepted for the configured engine.

        The list is empty when no valid engine is configured, so that the
        parser is never selected in that case.
        """
        # NOTE(review): self.settings is not assigned in this class; it is
        # presumably populated by the base class from get_settings().
        if not self.settings.engine_is_valid():
            return []
        if self.settings.engine == "googlecloudvision":
            return [
                "application/pdf",
                "image/tiff",
            ]
        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def aws_textract_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Send ``file`` to AWS Textract and return its text, one line per LINE block.
        """
        import boto3

        client = boto3.client(
            "textract",
            region_name=self.settings.region,
            aws_access_key_id=self.settings.api_key_id,
            aws_secret_access_key=self.settings.api_key,
        )
        document_bytes = Path(file).read_bytes()

        self.log.info("Analyzing document with AWS Textract...")
        response = client.analyze_document(
            Document={"Bytes": document_bytes},
            FeatureTypes=["TABLES"],
        )
        # Only LINE blocks carry the text that is kept.
        return "\n".join(
            block["Text"]
            for block in response["Blocks"]
            if block["BlockType"] == "LINE"
        )

    def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
        """
        Return the bounding box of ``polygon`` as ``"min_x min_y max_x max_y"``.

        An empty or missing polygon yields ``"0 0 0 0"``.
        """
        if not polygon:
            return "0 0 0 0"
        xs = [point.x for point in polygon]
        ys = [point.y for point in polygon]
        return f"{min(xs)} {min(ys)} {max(xs)} {max(ys)}"

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Analyze ``file`` with the Azure "prebuilt-layout" model and return its text.

        An hOCR rendering of the recognized words is also assembled, but for
        now it is only written to the debug log.
        """
        from html import escape

        from azure.ai.formrecognizer import DocumentAnalysisClient
        from azure.core.credentials import AzureKeyCredential

        document_analysis_client = DocumentAnalysisClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        with open(file, "rb") as f:
            self.log.info("Analyzing document with Azure Vision AI...")
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout",
                document=f,
            )
            result = poller.result()

        # Collect fragments and join once instead of growing a string.
        fragments = ["<html><body>"]
        for page_number, page in enumerate(result.pages, start=1):
            fragments.append(
                f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">',
            )
            for idx, word in enumerate(page.words):
                bbox = self.get_bbox_from_polygon(word.polygon)
                # Recognized text may contain "<" or "&"; escape it so the
                # markup stays well-formed.
                content = escape(str(word.content or ""))
                fragments.append(
                    f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{content}</span>',
                )
            fragments.append("</div>")
        fragments.append("</body></html>")
        hocr = "".join(fragments)

        # Debug level: this is the full document content.
        self.log.debug(f"HOCR output: {hocr}")

        return result.content

    def google_cloud_vision_parse(
        self,
        file: Path,
        mime_type: str,
    ) -> Optional[str]:
        """
        OCR ``file`` with Google Cloud Vision and return the recognized text.

        The document is uploaded to a Cloud Storage bucket, annotated
        asynchronously, and the JSON results written next to it are read
        back. All objects created for this run are removed again afterwards.
        """
        # https://cloud.google.com/vision/docs/pdf
        import re

        from django.utils import timezone
        from google.api_core.exceptions import GoogleAPIError
        from google.cloud import storage
        from google.cloud import vision
        from google.oauth2 import service_account

        credentials = service_account.Credentials.from_service_account_file(
            self.settings.credentials_file,
        )
        client = vision.ImageAnnotatorClient(credentials=credentials)
        storage_client = storage.Client(credentials=credentials)

        self.log.info("Uploading document to Google Cloud Storage...")
        bucket_name = f"pngx_{credentials.project_id}_ocrstorage"
        bucket = storage_client.lookup_bucket(bucket_name)
        if bucket is None:
            bucket = storage_client.create_bucket(bucket_name)

        # Per-run "folder". It must be a string for list_blobs(), and the
        # trailing slash keeps it from matching runs whose timestamp merely
        # starts with the same digits.
        folder = f"{timezone.now().timestamp()}/"
        source_name = f"{folder}{file.name}"
        gcs_source_uri = f"gs://{bucket_name}/{source_name}"
        gcs_destination_uri = f"{gcs_source_uri}.json"

        # Result files are expected to end in "output-<first>-to-<last>.json".
        shard_pattern = re.compile(r"output-(\d+)-to-\d+\.json$")

        def first_page(blob) -> int:
            match = shard_pattern.search(blob.name)
            return int(match.group(1)) if match else 0

        try:
            bucket.blob(source_name).upload_from_filename(str(file))

            input_config = vision.InputConfig(
                gcs_source=vision.GcsSource(uri=gcs_source_uri),
                mime_type=mime_type,
            )
            output_config = vision.OutputConfig(
                gcs_destination=vision.GcsDestination(uri=gcs_destination_uri),
            )

            self.log.info("Analyzing document with Google Cloud Vision...")
            feature = vision.Feature(
                type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION,
            )
            async_request = vision.AsyncAnnotateFileRequest(
                features=[feature],
                input_config=input_config,
                output_config=output_config,
            )
            operation = client.async_batch_annotate_files(requests=[async_request])

            self.log.info("Waiting for Google cloud operation to complete...")
            operation.result(timeout=180)

            # Larger documents are written as several JSON files; read all of
            # them, ordered by first page number (sorted() is stable, so
            # unrecognized names keep their listing order).
            result_blobs = sorted(
                (
                    blob
                    for blob in bucket.list_blobs(prefix=folder)
                    if blob.name.endswith(".json") and blob.name != source_name
                ),
                key=first_page,
            )
            if not result_blobs:
                self.log.warning("Google Cloud Vision produced no output files.")

            texts = []
            for result_blob in result_blobs:
                payload = json.loads(result_blob.download_as_bytes().decode("utf-8"))
                for page_response in payload.get("responses", []):
                    # Pages without any detected text carry no annotation.
                    annotation = page_response.get("fullTextAnnotation")
                    if annotation:
                        texts.append(annotation.get("text", ""))
            return "".join(texts)
        finally:
            # The bucket is scratch space only: do not leave the document or
            # its OCR output behind. Cleanup problems must not hide the result
            # (or the original error), so they are only logged.
            try:
                for leftover in bucket.list_blobs(prefix=folder):
                    leftover.delete()
            except GoogleAPIError as e:
                self.log.warning(
                    f"Could not clean up Google Cloud Storage objects: {e}",
                )

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Run the configured remote engine on ``document_path`` and store the
        recognized text in ``self.text``.

        With no valid engine configured, a warning is logged and the text is
        set to an empty string.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        engine = self.settings.engine
        if engine == "azureaivision":
            self.text = self.azure_ai_vision_parse(document_path)
        elif engine == "awstextract":
            self.text = self.aws_textract_parse(document_path)
        elif engine == "googlecloudvision":
            self.text = self.google_cloud_vision_parse(document_path, mime_type)
--- a/src/paperless_remote/signals.py
+++ b/src/paperless_remote/signals.py
@ -0,0 +1,18 @@
 def get_parser(*args, **kwargs):
    """
    Create a RemoteDocumentParser, forwarding all arguments to its constructor.
    """
    # Imported on demand, at call time rather than at module import.
    from paperless_remote import parsers

    return parsers.RemoteDocumentParser(*args, **kwargs)
 def get_supported_mime_types():
    """
    Return the MIME types the remote parser supports with the current settings.
    """
    from paperless_remote import parsers

    # A throwaway parser instance is only needed to read its configuration.
    probe = parsers.RemoteDocumentParser(None)
    return probe.supported_mime_types()
 def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver describing the remote parser: factory, weight and MIME types.
    """
    declaration = {"parser": get_parser, "weight": 5}
    declaration["mime_types"] = get_supported_mime_types()
    return declaration
--- a/src/paperless_remote/tests/__init__.py
+++ b/src/paperless_remote/tests/__init__.py
--- a/src/paperless_remote/tests/samples/simple-digital.pdf
+++ b/src/paperless_remote/tests/samples/simple-digital.pdf
--- a/src/paperless_remote/tests/test_checks.py
+++ b/src/paperless_remote/tests/test_checks.py
@ -0,0 +1,53 @@
 from django.test import TestCase
 from django.test import override_settings
 from paperless_remote import check_remote_parser_configured
 class TestChecks(TestCase):
    # Each test configures one engine and verifies what the system check reports.

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        # Nothing configured: nothing to complain about.
        self.assertEqual(len(check_remote_parser_configured(None)), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="azureaivision",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        errors = check_remote_parser_configured(None)
        self.assertEqual(len(errors), 1)
        expected = "Azure AI Vision remote parser requires endpoint to be configured."
        self.assertTrue(errors[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="awstextract",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_API_KEY_ID=None,
        REMOTE_OCR_REGION=None,
    )
    def test_aws_no_id_or_region(self):
        errors = check_remote_parser_configured(None)
        self.assertEqual(len(errors), 1)
        expected = "AWS Textract remote parser requires access key ID and region to be configured."
        self.assertTrue(errors[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="googlecloudvision",
        REMOTE_OCR_CREDENTIALS_FILE=None,
    )
    def test_gcv_no_creds_file(self):
        errors = check_remote_parser_configured(None)
        self.assertEqual(len(errors), 1)
        expected = "Google Cloud Vision remote parser requires a valid credentials file to be configured."
        self.assertTrue(errors[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="something",
        REMOTE_OCR_API_KEY="somekey",
    )
    def test_valid_configuration(self):
        # An engine name the check does not know produces no messages.
        self.assertEqual(len(check_remote_parser_configured(None)), 0)
--- a/src/paperless_remote/tests/test_parser.py
+++ b/src/paperless_remote/tests/test_parser.py
@ -0,0 +1,176 @@
 import json
 import sys
 import uuid
 from pathlib import Path
 from unittest import mock
 import pytest
 from django.test import TestCase
 from django.test import override_settings
 from documents.tests.utils import DirectoriesMixin
 from documents.tests.utils import FileSystemAssertsMixin
 from paperless_remote.parsers import RemoteDocumentParser
 class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    # Runs RemoteDocumentParser against mocked Azure, AWS and Google clients;
    # no network access happens in these tests.

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s not in content:
                self.fail(f"'{s}' is not in '{content}'")
            indices.append(content.index(s))
        self.assertListEqual(indices, sorted(indices))

    # ">= (3, 11)" rather than "> (3, 10)": the latter is also true for every
    # 3.10.x release, which would skip the test on 3.10 as well.
    @pytest.mark.skipif(
        sys.version_info >= (3, 11),
        reason="Fails on 3.11 only on CI, for some reason",
    )  # TODO: investigate
    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
    def test_get_text_with_azure(self, mock_azure_client):
        # One page with five words, each with a single-point polygon.
        words = [
            mock.Mock(content=text, polygon=[mock.Mock(x=offset, y=offset)])
            for text, offset in (
                ("This", 0),
                ("is", 10),
                ("a", 20),
                ("test", 30),
                ("document.", 40),
            )
        ]
        result = mock.Mock()
        result.content = "This is a test document."
        result.pages = [mock.Mock(width=100, height=100, words=words)]

        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
            result
        )

        with override_settings(
            REMOTE_OCR_ENGINE="azureaivision",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("boto3.client")
    def test_get_text_with_awstextract(self, mock_aws_client):
        # Textract answers with a single LINE block.
        mock_aws_client.return_value.analyze_document.return_value = {
            "Blocks": [
                {
                    "BlockType": "LINE",
                    "Text": "This is a test document.",
                },
            ],
        }

        with override_settings(
            REMOTE_OCR_ENGINE="awstextract",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_API_KEY_ID="somekeyid",
            REMOTE_OCR_REGION="us-west-2",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("google.cloud.vision.ImageAnnotatorClient")
    @mock.patch("google.cloud.storage.Client")
    @mock.patch("google.oauth2.service_account.Credentials.from_service_account_file")
    def test_get_text_with_googlecloudvision(
        self,
        mock_credentials_from_file,
        mock_gcs_client,
        mock_gcv_client,
    ):
        credentials = mock.Mock()
        credentials.project_id = "someproject"
        mock_credentials_from_file.return_value = credentials

        # The bucket listing holds the uploaded document and one JSON result.
        # "name" is assigned after construction because Mock(name=...) would
        # name the mock itself instead of setting the attribute.
        uploaded_blob = mock.Mock()
        uploaded_blob.name = "somefile.pdf"
        result_blob = mock.Mock()
        result_blob.name = "somefile.json"
        result_blob.download_as_bytes.return_value.decode.return_value = json.dumps(
            {
                "responses": [
                    {
                        "fullTextAnnotation": {
                            "text": "This is a test document.",
                        },
                    },
                ],
            },
        )
        mock_gcs_client.return_value.lookup_bucket.return_value.list_blobs.return_value = [
            uploaded_blob,
            result_blob,
        ]

        # The long-running operation only needs a callable result().
        operation = mock.Mock()
        operation.result = mock.Mock()
        mock_gcv_client.return_value.async_batch_annotate_files.return_value = (
            operation
        )

        with override_settings(
            REMOTE_OCR_ENGINE="googlecloudvision",
            REMOTE_OCR_CREDENTIALS_FILE="somefile.json",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )
Author	SHA1	Message	Date
shamoon	0e9d2f6831	Messing with conversion of azure output to hocr	2024-03-08 22:28:14 -08:00
shamoon	ec505e41fa	Working GCV	2024-03-08 21:19:15 -08:00
shamoon	24c40bbc5e	Rename to remote ocr	2024-03-08 21:19:15 -08:00
shamoon	fba4ce9147	Try deps update again	2024-03-08 21:19:15 -08:00
shamoon	a0c6d25d9a	Add aws textract, remove chatgpt	2024-03-08 21:18:02 -08:00
shamoon	6e7e40e7a2	Add (non-working) Google cloud vision	2024-03-08 21:17:27 -08:00
shamoon	eacafbcb36	Oh wow this works for azure, not chatgpt	2024-03-08 21:17:27 -08:00