138 lines
4.4 KiB
Python
138 lines
4.4 KiB
Python
import json
import sys
import uuid
from pathlib import Path
from unittest import mock

import pytest
from django.test import TestCase
from django.test import override_settings

from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
|
|
|
|
|
|
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """
    Exercise RemoteDocumentParser with each supported REMOTE_OCR_ENGINE value.

    Every test patches the third-party client class/function of one engine,
    parses the same sample PDF and checks the text exposed on the parser.
    """

    # Directory holding the sample documents, located next to this module.
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """
        Assert that all strings appear in content, in the given order.

        The position used for each string is its first occurrence in
        ``content``; the test fails immediately on the first string that is
        missing.
        """
        indices = []
        for s in strings:
            if s not in content:
                # self.fail() raises, so nothing below runs for a missing string.
                self.fail(f"'{s}' is not in '{content}'")
            indices.append(content.index(s))
        # Already-sorted positions mean the strings occur in the requested order.
        self.assertListEqual(indices, sorted(indices))

    # sys.version_info is a 5-tuple, so comparing it with "> (3, 10)" is also
    # true on every 3.10.x release; ">= (3, 11)" skips only on 3.11 and newer,
    # which is what the reason below describes.
    @pytest.mark.skipif(
        sys.version_info >= (3, 11),
        reason="Fails on 3.11 only on CI, for some reason",
    )  # TODO: investigate
    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
    def test_get_text_with_azure(self, mock_azure_client):
        """
        GIVEN:
            - REMOTE_OCR_ENGINE set to "azureaivision"
            - A patched DocumentAnalysisClient whose
              begin_analyze_document(...).result() exposes the expected
              text as ``content``
        WHEN:
            - The sample PDF is parsed
        THEN:
            - The parser text contains the expected sentence
        """
        result = mock.Mock()
        result.content = "This is a test document."

        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
            result
        )

        with override_settings(
            REMOTE_OCR_ENGINE="azureaivision",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("boto3.client")
    def test_get_text_with_awstextract(self, mock_aws_client):
        """
        GIVEN:
            - REMOTE_OCR_ENGINE set to "awstextract"
            - A patched boto3.client whose analyze_document() returns a
              single LINE block carrying the expected text
        WHEN:
            - The sample PDF is parsed
        THEN:
            - The parser text contains the expected sentence
        """
        mock_aws_client.return_value.analyze_document.return_value = {
            "Blocks": [
                {
                    "BlockType": "LINE",
                    "Text": "This is a test document.",
                },
            ],
        }

        with override_settings(
            REMOTE_OCR_ENGINE="awstextract",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_API_KEY_ID="somekeyid",
            REMOTE_OCR_REGION="us-west-2",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    # mock.patch decorators are applied bottom-up, so the mock arguments are
    # listed in the reverse order of the decorators.
    @mock.patch("google.cloud.vision.ImageAnnotatorClient")
    @mock.patch("google.cloud.storage.Client")
    @mock.patch("google.oauth2.service_account.Credentials.from_service_account_file")
    def test_get_text_with_googlecloudvision(
        self,
        mock_credentials_from_file,
        mock_gcs_client,
        mock_gcv_client,
    ):
        """
        GIVEN:
            - REMOTE_OCR_ENGINE set to "googlecloudvision"
            - Patched service-account credentials, storage client and
              vision client
            - A bucket listing of two blobs, the second of which yields a
              JSON payload containing the expected text
        WHEN:
            - The sample PDF is parsed
        THEN:
            - The parser text contains the expected sentence
        """
        credentials = mock.Mock()
        credentials.project_id = "someproject"
        mock_credentials_from_file.return_value = credentials

        blob_mock0 = mock.Mock()
        blob_mock0.name = "somefile.pdf"
        blob_mock1 = mock.Mock()
        blob_mock1.name = "somefile.json"

        # Requires the module-level "import json".
        blob_mock1.download_as_bytes.return_value.decode.return_value = json.dumps(
            {
                "responses": [
                    {
                        "fullTextAnnotation": {
                            "text": "This is a test document.",
                        },
                    },
                ],
            },
        )

        mock_gcs_client.return_value.lookup_bucket.return_value.list_blobs.return_value = [
            blob_mock0,
            blob_mock1,
        ]

        result = mock.Mock()
        result.result = mock.Mock()
        mock_gcv_client.return_value.async_batch_annotate_files.return_value = result

        with override_settings(
            REMOTE_OCR_ENGINE="googlecloudvision",
            REMOTE_OCR_CREDENTIALS_FILE="somefile.json",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )
|