Compare commits
7 Commits
dev
...
feature-re
Author | SHA1 | Date | |
---|---|---|---|
|
0e9d2f6831 | ||
|
ec505e41fa | ||
|
24c40bbc5e | ||
|
fba4ce9147 | ||
|
a0c6d25d9a | ||
|
6e7e40e7a2 | ||
|
eacafbcb36 |
4
Pipfile
4
Pipfile
@ -4,6 +4,8 @@ verify_ssl = true
|
|||||||
name = "pypi"
|
name = "pypi"
|
||||||
|
|
||||||
[packages]
|
[packages]
|
||||||
|
azure-ai-formrecognizer = "*"
|
||||||
|
boto3 = "*"
|
||||||
dateparser = "~=1.2"
|
dateparser = "~=1.2"
|
||||||
# WARNING: django does not use semver.
|
# WARNING: django does not use semver.
|
||||||
# Only patch versions are guaranteed to not introduce breaking changes.
|
# Only patch versions are guaranteed to not introduce breaking changes.
|
||||||
@ -27,6 +29,8 @@ channels-redis = "*"
|
|||||||
concurrent-log-handler = "*"
|
concurrent-log-handler = "*"
|
||||||
filelock = "*"
|
filelock = "*"
|
||||||
flower = "*"
|
flower = "*"
|
||||||
|
google-cloud-vision = "*"
|
||||||
|
google-cloud-storage = "*"
|
||||||
gotenberg-client = "*"
|
gotenberg-client = "*"
|
||||||
gunicorn = "*"
|
gunicorn = "*"
|
||||||
imap-tools = "*"
|
imap-tools = "*"
|
||||||
|
814
Pipfile.lock
generated
814
Pipfile.lock
generated
File diff suppressed because it is too large
Load Diff
@ -297,6 +297,7 @@ INSTALLED_APPS = [
|
|||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
|
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
@ -1149,3 +1150,14 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
|
|||||||
if DEBUG: # pragma: no cover
|
if DEBUG: # pragma: no cover
|
||||||
EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
|
EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
|
||||||
EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
|
EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
|
||||||
|
|
||||||
|
###############################################################################
# Remote Parser                                                               #
###############################################################################

# Each value is read straight from the environment and is None when the
# variable is not set.

# Name of the remote OCR engine to use
REMOTE_OCR_ENGINE = os.environ.get("PAPERLESS_REMOTE_OCR_ENGINE")
# API key / secret handed to the remote engine
REMOTE_OCR_API_KEY = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY")
# Service endpoint URL of the remote engine
REMOTE_OCR_ENDPOINT = os.environ.get("PAPERLESS_REMOTE_OCR_ENDPOINT")
# Identifier that accompanies the API key (e.g. an access key ID)
REMOTE_OCR_API_KEY_ID = os.environ.get("PAPERLESS_REMOTE_OCR_API_KEY_ID")
# Cloud region of the remote engine
REMOTE_OCR_REGION = os.environ.get("PAPERLESS_REMOTE_OCR_REGION")
# Path to a credentials file for the remote engine
REMOTE_OCR_CREDENTIALS_FILE = os.environ.get("PAPERLESS_REMOTE_OCR_CREDENTIALS_FILE")
|
||||||
|
4
src/paperless_remote/__init__.py
Normal file
4
src/paperless_remote/__init__.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# this is here so that django finds the checks.
|
||||||
|
from paperless_remote.checks import check_remote_parser_configured
|
||||||
|
|
||||||
|
__all__ = ["check_remote_parser_configured"]
|
14
src/paperless_remote/apps.py
Normal file
14
src/paperless_remote/apps.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
from paperless_remote.signals import remote_consumer_declaration
|
||||||
|
|
||||||
|
|
||||||
|
class PaperlessRemoteParserConfig(AppConfig):
    """Django application config for the ``paperless_remote`` app."""

    name = "paperless_remote"

    def ready(self):
        """
        Connect the remote parser declaration to the document consumer
        declaration signal, then run the base class hook.
        """
        # Kept as a function-level import.
        # NOTE(review): presumably deferred until the app registry is ready - confirm.
        from documents import signals as document_signals

        document_signals.document_consumer_declaration.connect(
            remote_consumer_declaration,
        )

        super().ready()
|
39
src/paperless_remote/checks.py
Normal file
39
src/paperless_remote/checks.py
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
from django.core.checks import Error
|
||||||
|
from django.core.checks import register
|
||||||
|
|
||||||
|
|
||||||
|
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    Django system check validating the remote OCR parser settings.

    Only the engine named by ``settings.REMOTE_OCR_ENGINE`` is inspected;
    an unset or unrecognised engine yields no messages.

    Every problem found for the selected engine is reported, including a
    missing API key for the Azure and AWS engines.

    Args:
        app_configs: Supplied by the Django check framework; unused.
        **kwargs: Supplied by the Django check framework; unused.

    Returns:
        A list of ``Error`` instances, empty when nothing is wrong.
    """
    engine = settings.REMOTE_OCR_ENGINE
    errors = []

    if engine == "azureaivision":
        if not settings.REMOTE_OCR_ENDPOINT:
            errors.append(
                Error(
                    "Azure AI Vision remote parser requires endpoint to be configured.",
                ),
            )
        if not settings.REMOTE_OCR_API_KEY:
            errors.append(
                Error(
                    "Azure AI Vision remote parser requires an API key to be configured.",
                ),
            )
    elif engine == "awstextract":
        if not settings.REMOTE_OCR_API_KEY_ID or not settings.REMOTE_OCR_REGION:
            errors.append(
                Error(
                    "AWS Textract remote parser requires access key ID and region to be configured.",
                ),
            )
        if not settings.REMOTE_OCR_API_KEY:
            errors.append(
                Error(
                    "AWS Textract remote parser requires a secret access key to be configured.",
                ),
            )
    elif engine == "googlecloudvision":
        credentials_file = settings.REMOTE_OCR_CREDENTIALS_FILE
        # The path must be set and must point at something that exists on disk
        if not credentials_file or not Path(credentials_file).exists():
            errors.append(
                Error(
                    "Google Cloud Vision remote parser requires a valid credentials file to be configured.",
                ),
            )

    return errors
|
239
src/paperless_remote/parsers.py
Normal file
239
src/paperless_remote/parsers.py
Normal file
@ -0,0 +1,239 @@
|
|||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from django.conf import settings
|
||||||
|
|
||||||
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteEngineConfig:
    """
    Holds the connection settings for a remote OCR engine.

    Which values are required depends on the engine, see
    ``engine_is_valid``.
    """

    def __init__(
        self,
        engine: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_key_id: Optional[str] = None,
        region: Optional[str] = None,
        credentials_file: Optional[str] = None,
    ):
        """
        Args:
            engine: Engine identifier, one of "azureaivision",
                "awstextract" or "googlecloudvision".
            api_key: Secret key (required for Azure and AWS).
            endpoint: Service endpoint (required for Azure).
            api_key_id: Key identifier (required for AWS).
            region: Service region (required for AWS).
            credentials_file: Path to a credentials file (required for
                Google Cloud Vision).
        """
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint
        self.api_key_id = api_key_id
        self.region = region
        self.credentials_file = credentials_file

    def engine_is_valid(self) -> bool:
        """
        Return True when the engine is known and every value it requires
        is set.

        Values are tested for truthiness, so an empty string (for example
        from an environment variable set to nothing) counts as missing,
        just like ``None``.
        """
        if self.engine == "azureaivision":
            return bool(self.api_key) and bool(self.endpoint)
        if self.engine == "awstextract":
            return (
                bool(self.api_key) and bool(self.api_key_id) and bool(self.region)
            )
        if self.engine == "googlecloudvision":
            # This engine authenticates with the credentials file only,
            # no API key is required
            return bool(self.credentials_file)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote ocr engine to parse documents

    The engine is chosen by ``settings.REMOTE_OCR_ENGINE`` and is one of
    "azureaivision", "awstextract" or "googlecloudvision".
    """

    # NOTE(review): self.settings and self.log are not defined in this class;
    # presumably the base class provides them (settings via get_settings()) - confirm.

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the REMOTE_OCR_* settings
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
            api_key_id=settings.REMOTE_OCR_API_KEY_ID,
            region=settings.REMOTE_OCR_REGION,
            credentials_file=settings.REMOTE_OCR_CREDENTIALS_FILE,
        )

    def supported_mime_types(self):
        """
        Return the mime types handled by the configured engine, or an empty
        list when no valid engine is configured
        """
        if not self.settings.engine_is_valid():
            return []

        if self.settings.engine == "googlecloudvision":
            return [
                "application/pdf",
                "image/tiff",
            ]

        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def aws_textract_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Send the file to AWS Textract and return the text of all LINE blocks,
        joined by newlines
        """
        import boto3

        client = boto3.client(
            "textract",
            region_name=self.settings.region,
            aws_access_key_id=self.settings.api_key_id,
            aws_secret_access_key=self.settings.api_key,
        )

        with open(file, "rb") as f:
            file_bytes = f.read()

        self.log.info("Analyzing document with AWS Textract...")
        # The bytes are passed as read; no extra bytearray copy is needed
        response = client.analyze_document(
            Document={"Bytes": file_bytes},
            FeatureTypes=["TABLES"],
        )

        return "\n".join(
            block["Text"]
            for block in response["Blocks"]
            if block["BlockType"] == "LINE"
        )

    def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
        """
        Return "min_x min_y max_x max_y" for the given points, or "0 0 0 0"
        when the polygon is empty or None
        """
        if not polygon:
            return "0 0 0 0"
        x_coordinates = [point.x for point in polygon]
        y_coordinates = [point.y for point in polygon]
        return f"{min(x_coordinates)} {min(y_coordinates)} {max(x_coordinates)} {max(y_coordinates)}"

    def _build_hocr(self, result) -> str:
        """
        Render the pages and words of an Azure analysis result as HOCR markup
        """
        import html

        parts = ["<html><body>"]
        for page_number, page in enumerate(result.pages, start=1):
            parts.append(
                f'<div class="ocr_page" id="page_{page_number}" title="bbox 0 0 {page.width} {page.height}">',
            )
            for idx, word in enumerate(page.words):
                bbox = self.get_bbox_from_polygon(word.polygon)
                # OCR text may contain <, > or &, escape it to keep the markup valid
                parts.append(
                    f'<span class="ocr_word" id="line_{page_number}_{idx}" title="bbox {bbox}">{html.escape(word.content)}</span>',
                )
            parts.append("</div>")
        parts.append("</body></html>")
        return "".join(parts)

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Analyze the file with the Azure "prebuilt-layout" model and return
        the content of the result
        """
        from azure.ai.formrecognizer import DocumentAnalysisClient
        from azure.core.credentials import AzureKeyCredential

        credential = AzureKeyCredential(self.settings.api_key)
        document_analysis_client = DocumentAnalysisClient(
            endpoint=self.settings.endpoint,
            credential=credential,
        )

        with open(file, "rb") as f:
            self.log.info("Analyzing document with Azure Vision AI...")
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout",
                document=f,
            )
            result = poller.result()

        # The HOCR is currently only logged. It contains the whole document
        # text, so it is logged at debug level rather than info.
        self.log.debug("HOCR output: %s", self._build_hocr(result))

        return result.content

    def google_cloud_vision_parse(
        self,
        file: Path,
        mime_type: str,
    ) -> Optional[str]:
        """
        Upload the file to Google Cloud Storage, run asynchronous document
        text detection on it and return the concatenated text of all output
        files. Returns an empty string when no output was produced.
        """
        # https://cloud.google.com/vision/docs/pdf
        import re

        from django.utils import timezone
        from google.cloud import storage
        from google.cloud import vision
        from google.oauth2 import service_account

        credentials = service_account.Credentials.from_service_account_file(
            self.settings.credentials_file,
        )

        client = vision.ImageAnnotatorClient(credentials=credentials)
        storage_client = storage.Client(credentials=credentials)

        self.log.info("Uploading document to Google Cloud Storage...")
        bucket_name = f"pngx_{credentials.project_id}_ocrstorage"
        bucket = storage_client.lookup_bucket(bucket_name)
        if bucket is None:
            bucket = storage_client.create_bucket(bucket_name)

        # Converted to str once, so the blob names and the listing prefix
        # below are the same string
        prefix = str(timezone.now().timestamp())
        source_blob_name = f"{prefix}/{file.name}"
        blob = bucket.blob(source_blob_name)
        blob.upload_from_filename(str(file))
        gcs_source_uri = f"gs://{bucket_name}/{source_blob_name}"
        gcs_destination_uri = f"{gcs_source_uri}.json"

        gcs_source = vision.GcsSource(uri=gcs_source_uri)
        input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

        gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
        output_config = vision.OutputConfig(
            gcs_destination=gcs_destination,
        )

        self.log.info("Analyzing document with Google Cloud Vision...")
        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config,
        )

        operation = client.async_batch_annotate_files(requests=[async_request])

        self.log.info("Waiting for Google cloud operation to complete...")
        operation.result(timeout=180)

        # Everything under the prefix that is a json file, except the
        # uploaded source itself, is an output file. Longer documents may be
        # split over several output files.
        output_blobs = [
            candidate
            for candidate in bucket.list_blobs(prefix=prefix)
            if candidate.name.endswith(".json") and candidate.name != source_blob_name
        ]

        def first_page(output_blob) -> int:
            # NOTE(review): assumes output names end in "output-<first>-to-<last>.json"
            # as shown in the Cloud Vision docs - confirm. Names that do not
            # match keep their listing order (the sort is stable).
            match = re.search(r"output-(\d+)-to-\d+\.json$", output_blob.name)
            return int(match.group(1)) if match else 0

        output_blobs.sort(key=first_page)

        if not output_blobs:
            self.log.warning(
                "Google Cloud Vision did not produce any output, content will be empty.",
            )
            return ""

        # NOTE(review): the uploaded source and the output files are left in
        # the bucket; consider deleting them once the text has been read.
        text_parts = []
        for output_blob in output_blobs:
            json_string = output_blob.download_as_bytes().decode("utf-8")
            payload = json.loads(json_string)
            for page_response in payload.get("responses", []):
                # A response may lack fullTextAnnotation (e.g. no text found)
                annotation = page_response.get("fullTextAnnotation")
                if annotation:
                    text_parts.append(annotation.get("text", ""))

        return "".join(text_parts)

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Run the configured remote engine on the document and store the
        result in self.text. Without a valid engine the text is set to an
        empty string and a warning is logged.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return
        elif self.settings.engine == "azureaivision":
            self.text = self.azure_ai_vision_parse(document_path)
        elif self.settings.engine == "awstextract":
            self.text = self.aws_textract_parse(document_path)
        elif self.settings.engine == "googlecloudvision":
            self.text = self.google_cloud_vision_parse(document_path, mime_type)
|
18
src/paperless_remote/signals.py
Normal file
18
src/paperless_remote/signals.py
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
def get_parser(*args, **kwargs):
    """Create a RemoteDocumentParser, passing all arguments on to its constructor."""
    # Imported inside the function, not at module level
    from paperless_remote import parsers

    return parsers.RemoteDocumentParser(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def get_supported_mime_types():
    """Return the mime types reported by a RemoteDocumentParser built with None."""
    # Imported inside the function, not at module level
    from paperless_remote import parsers

    probe = parsers.RemoteDocumentParser(None)
    return probe.supported_mime_types()
|
||||||
|
|
||||||
|
|
||||||
|
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver declaring the remote parser: its factory, its weight and
    the mime types it currently supports.
    """
    declaration = {
        "parser": get_parser,
        "weight": 5,
    }
    # Evaluated on every call
    declaration["mime_types"] = get_supported_mime_types()
    return declaration
|
0
src/paperless_remote/tests/__init__.py
Normal file
0
src/paperless_remote/tests/__init__.py
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
Binary file not shown.
53
src/paperless_remote/tests/test_checks.py
Normal file
53
src/paperless_remote/tests/test_checks.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
from django.test import TestCase
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
|
from paperless_remote import check_remote_parser_configured
|
||||||
|
|
||||||
|
|
||||||
|
class TestChecks(TestCase):
    """Tests for the remote parser system check."""

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        # Without an engine nothing is reported
        self.assertEqual(len(check_remote_parser_configured(None)), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="azureaivision",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        expected = "Azure AI Vision remote parser requires endpoint to be configured."
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 1)
        self.assertTrue(found[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="awstextract",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_API_KEY_ID=None,
        REMOTE_OCR_REGION=None,
    )
    def test_aws_no_id_or_region(self):
        expected = "AWS Textract remote parser requires access key ID and region to be configured."
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 1)
        self.assertTrue(found[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="googlecloudvision",
        REMOTE_OCR_CREDENTIALS_FILE=None,
    )
    def test_gcv_no_creds_file(self):
        expected = "Google Cloud Vision remote parser requires a valid credentials file to be configured."
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 1)
        self.assertTrue(found[0].msg.startswith(expected))

    @override_settings(
        REMOTE_OCR_ENGINE="something",
        REMOTE_OCR_API_KEY="somekey",
    )
    def test_valid_configuration(self):
        # An engine name the check does not know produces no messages
        self.assertEqual(len(check_remote_parser_configured(None)), 0)
|
176
src/paperless_remote/tests/test_parser.py
Normal file
176
src/paperless_remote/tests/test_parser.py
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from django.test import TestCase
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
|
from documents.tests.utils import DirectoriesMixin
|
||||||
|
from documents.tests.utils import FileSystemAssertsMixin
|
||||||
|
from paperless_remote.parsers import RemoteDocumentParser
|
||||||
|
|
||||||
|
|
||||||
|
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests RemoteDocumentParser against mocked Azure, AWS and Google clients."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    # sys.version_info is a 5-tuple, so "> (3, 10)" is also true on every
    # 3.10.x release. Compare against (3, 11) so the test still runs on 3.10.
    @pytest.mark.skipif(
        sys.version_info >= (3, 11),
        reason="Fails on 3.11 only on CI, for some reason",
    )  # TODO: investigate
    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
    def test_get_text_with_azure(self, mock_azure_client):
        # One page of single-point words, each at (10 * index, 10 * index)
        words = [
            mock.Mock(
                content=content,
                polygon=[
                    mock.Mock(x=offset, y=offset),
                ],
            )
            for offset, content in zip(
                range(0, 50, 10),
                ["This", "is", "a", "test", "document."],
            )
        ]
        result = mock.Mock()
        result.content = "This is a test document."
        result.pages = [
            mock.Mock(
                width=100,
                height=100,
                words=words,
            ),
        ]

        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
            result
        )

        with override_settings(
            REMOTE_OCR_ENGINE="azureaivision",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("boto3.client")
    def test_get_text_with_awstextract(self, mock_aws_client):
        mock_aws_client.return_value.analyze_document.return_value = {
            "Blocks": [
                {
                    "BlockType": "LINE",
                    "Text": "This is a test document.",
                },
            ],
        }

        with override_settings(
            REMOTE_OCR_ENGINE="awstextract",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_API_KEY_ID="somekeyid",
            REMOTE_OCR_REGION="us-west-2",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("google.cloud.vision.ImageAnnotatorClient")
    @mock.patch("google.cloud.storage.Client")
    @mock.patch("google.oauth2.service_account.Credentials.from_service_account_file")
    def test_get_text_with_googlecloudvision(
        self,
        mock_credentials_from_file,
        mock_gcs_client,
        mock_gcv_client,
    ):
        credentials = mock.Mock()
        credentials.project_id = "someproject"
        mock_credentials_from_file.return_value = credentials

        # The bucket listing holds the uploaded source and one json output
        blob_mock0 = mock.Mock()
        blob_mock0.name = "somefile.pdf"
        blob_mock1 = mock.Mock()
        blob_mock1.name = "somefile.json"

        blob_mock1.download_as_bytes.return_value.decode.return_value = json.dumps(
            {
                "responses": [
                    {
                        "fullTextAnnotation": {
                            "text": "This is a test document.",
                        },
                    },
                ],
            },
        )

        mock_gcs_client.return_value.lookup_bucket.return_value.list_blobs.return_value = [
            blob_mock0,
            blob_mock1,
        ]

        result = mock.Mock()
        result.result = mock.Mock()
        mock_gcv_client.return_value.async_batch_annotate_files.return_value = result

        with override_settings(
            REMOTE_OCR_ENGINE="googlecloudvision",
            REMOTE_OCR_CREDENTIALS_FILE="somefile.json",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )
|
Loading…
x
Reference in New Issue
Block a user