Compare commits

..

11 Commits

Author SHA1 Message Date
shamoon
0e9d2f6831 Messing with conversion of azure output to hocr 2024-03-08 22:28:14 -08:00
shamoon
ec505e41fa Working GCV 2024-03-08 21:19:15 -08:00
shamoon
24c40bbc5e Rename to remote ocr 2024-03-08 21:19:15 -08:00
shamoon
fba4ce9147 Try deps update again 2024-03-08 21:19:15 -08:00
shamoon
a0c6d25d9a Add aws textract, remove chatgpt 2024-03-08 21:18:02 -08:00
shamoon
6e7e40e7a2 Add (non-working) Google cloud vision 2024-03-08 21:17:27 -08:00
shamoon
eacafbcb36 Oh wow this works for azure, not chatgpt 2024-03-08 21:17:27 -08:00
shamoon
3aeb45bf34 Merge branch 'main' into dev 2024-03-08 17:05:33 -08:00
shamoon
b91da77a8a Reset dev version string 2024-03-08 17:04:09 -08:00
github-actions[bot]
33357a3fc2 Documentation: Add v2.6.2 changelog (#6049) 2024-03-08 12:14:25 -08:00
Dimitri
025001499d Fix: missing translation string (#6054)
---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
2024-03-08 07:55:56 -08:00
16 changed files with 1174 additions and 261 deletions

View File

@@ -4,6 +4,8 @@ verify_ssl = true
name = "pypi"
[packages]
azure-ai-formrecognizer = "*"
boto3 = "*"
dateparser = "~=1.2"
# WARNING: django does not use semver.
# Only patch versions are guaranteed to not introduce breaking changes.
@@ -27,6 +29,8 @@ channels-redis = "*"
concurrent-log-handler = "*"
filelock = "*"
flower = "*"
google-cloud-vision = "*"
google-cloud-storage = "*"
gotenberg-client = "*"
gunicorn = "*"
imap-tools = "*"

814
Pipfile.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,26 @@
# Changelog
## paperless-ngx 2.6.2
### Features
- Enhancement: move and rename files when storage paths deleted, update file handling docs [@shamoon](https://github.com/shamoon) ([#6033](https://github.com/paperless-ngx/paperless-ngx/pull/6033))
- Enhancement: better detection of default currency code [@shamoon](https://github.com/shamoon) ([#6020](https://github.com/paperless-ngx/paperless-ngx/pull/6020))
### Bug Fixes
- Fix: make document counts in object lists permissions-aware [@shamoon](https://github.com/shamoon) ([#6019](https://github.com/paperless-ngx/paperless-ngx/pull/6019))
### All App Changes
<details>
<summary>3 changes</summary>
- Enhancement: move and rename files when storage paths deleted, update file handling docs [@shamoon](https://github.com/shamoon) ([#6033](https://github.com/paperless-ngx/paperless-ngx/pull/6033))
- Fix: make document counts in object lists permissions-aware [@shamoon](https://github.com/shamoon) ([#6019](https://github.com/paperless-ngx/paperless-ngx/pull/6019))
- Enhancement: better detection of default currency code [@shamoon](https://github.com/shamoon) ([#6020](https://github.com/paperless-ngx/paperless-ngx/pull/6020))
</details>
## paperless-ngx 2.6.1
### All App Changes

View File

@@ -1912,7 +1912,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">55</context>
<context context-type="linenumber">53</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -4049,7 +4049,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">60</context>
<context context-type="linenumber">58</context>
</context-group>
</trans-unit>
<trans-unit id="2722549756198502062" datatype="html">
@@ -4585,6 +4585,13 @@
<context context-type="linenumber">56</context>
</context-group>
</trans-unit>
<trans-unit id="7489316373554112115" datatype="html">
<source>Up to date</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/system-status-dialog/system-status-dialog.component.html</context>
<context context-type="linenumber">59</context>
</context-group>
</trans-unit>
<trans-unit id="7881311375431899727" datatype="html">
<source>Latest Migration</source>
<context-group purpose="location">
@@ -4767,7 +4774,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">66</context>
<context context-type="linenumber">64</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -5000,7 +5007,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">50</context>
</context-group>
</trans-unit>
<trans-unit id="7819314041543176992" datatype="html">
@@ -5639,7 +5646,7 @@
<source>Filter by correspondent</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">21</context>
<context context-type="linenumber">20</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
@@ -5650,7 +5657,7 @@
<source>Filter by tag</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">29</context>
<context context-type="linenumber">28</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
@@ -5661,21 +5668,21 @@
<source>View notes</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">73</context>
<context context-type="linenumber">70</context>
</context-group>
</trans-unit>
<trans-unit id="8778002102373462277" datatype="html">
<source><x id="INTERPOLATION" equiv-text="ocument.notes.length}}"/> Notes</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">74</context>
<context context-type="linenumber">71</context>
</context-group>
</trans-unit>
<trans-unit id="78870852467682010" datatype="html">
<source>Filter by document type</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">78</context>
<context context-type="linenumber">75</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
@@ -5686,7 +5693,7 @@
<source>Filter by storage path</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">84</context>
<context context-type="linenumber">81</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-list.component.html</context>
@@ -5697,7 +5704,7 @@
<source>Created: <x id="INTERPOLATION" equiv-text="{{ document.created | customDate }}"/></source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">96,97</context>
<context context-type="linenumber">93,94</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -5708,7 +5715,7 @@
<source>Added: <x id="INTERPOLATION" equiv-text="{{ document.added | customDate }}"/></source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">97,98</context>
<context context-type="linenumber">94,95</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -5719,7 +5726,7 @@
<source>Modified: <x id="INTERPOLATION" equiv-text="{{ document.modified | customDate }}"/></source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">98,99</context>
<context context-type="linenumber">95,96</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -5730,7 +5737,7 @@
<source>Shared</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">111</context>
<context context-type="linenumber">108</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-small/document-card-small.component.html</context>
@@ -5745,7 +5752,7 @@
<source>Score:</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/document-list/document-card-large/document-card-large.component.html</context>
<context context-type="linenumber">116</context>
<context context-type="linenumber">113</context>
</context-group>
</trans-unit>
<trans-unit id="3661756380991326939" datatype="html">

View File

@@ -56,7 +56,7 @@
<dt i18n>Migration Status</dt>
<dd class="d-flex align-items-center">
@if (status.database.migration_status.unapplied_migrations.length === 0) {
<ng-container>Up to date</ng-container><i-bs name="check-circle-fill" class="text-primary ms-2 lh-1" [ngbPopover]="migrationStatus" triggers="mouseenter:mouseleave"></i-bs>
<ng-container i18n>Up to date</ng-container><i-bs name="check-circle-fill" class="text-primary ms-2 lh-1" [ngbPopover]="migrationStatus" triggers="mouseenter:mouseleave"></i-bs>
} @else {
<ng-container>{{status.database.migration_status.unapplied_migrations.length}} Pending</ng-container><i-bs name="exclamation-triangle-fill" class="text-warning ms-2 lh-1" [ngbPopover]="migrationStatus" triggers="mouseenter:mouseleave"></i-bs>
}

View File

@@ -5,7 +5,7 @@ export const environment = {
apiBaseUrl: document.baseURI + 'api/',
apiVersion: '5',
appTitle: 'Paperless-ngx',
version: '2.6.2',
version: '2.6.2-dev',
webSocketHost: window.location.host,
webSocketProtocol: window.location.protocol == 'https:' ? 'wss:' : 'ws:',
webSocketBaseUrl: base_url.pathname + 'ws/',

View File

@@ -297,6 +297,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -1149,3 +1150,14 @@ EMAIL_SUBJECT_PREFIX: Final[str] = "[Paperless-ngx] "
if DEBUG: # pragma: no cover
EMAIL_BACKEND = "django.core.mail.backends.filebased.EmailBackend"
EMAIL_FILE_PATH = BASE_DIR / "sent_emails"
###############################################################################
# Remote Parser #
###############################################################################
# Name of the remote OCR engine. The remote parser and its system check
# compare this against "azureaivision", "awstextract" and "googlecloudvision".
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
# Secret key: used as the Azure key credential and as the AWS secret access key.
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
# Service endpoint URL; the system check requires it for the Azure engine.
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
# Key identifier: passed to boto3 as the AWS access key ID.
REMOTE_OCR_API_KEY_ID = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY_ID")
# Region name: passed to boto3 when creating the Textract client.
REMOTE_OCR_REGION = os.getenv("PAPERLESS_REMOTE_OCR_REGION")
# Path to a service account credentials file, used by the Google engine.
REMOTE_OCR_CREDENTIALS_FILE = os.getenv("PAPERLESS_REMOTE_OCR_CREDENTIALS_FILE")

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
    """Django application configuration for the remote OCR parser app."""

    name = "paperless_remote"

    def ready(self):
        """Connect the remote parser declaration to the consumer signal."""
        # Imported inside the method, as in the original.
        # NOTE(review): presumably deferred until the app registry is ready - confirm.
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        declaration_signal.connect(remote_consumer_declaration)
        super().ready()

View File

@@ -0,0 +1,39 @@
from pathlib import Path
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    Django system check for the remote OCR parser settings.

    Returns a list holding a single ``Error`` when the selected engine lacks
    the settings it needs, and an empty list otherwise (including when no
    engine, or an unrecognised engine, is selected).
    """
    selected_engine = settings.REMOTE_OCR_ENGINE

    if selected_engine == "azureaivision":
        if not settings.REMOTE_OCR_ENDPOINT:
            message = (
                "Azure AI Vision remote parser requires endpoint to be configured."
            )
            return [Error(message)]

    if selected_engine == "awstextract":
        if not settings.REMOTE_OCR_API_KEY_ID or not settings.REMOTE_OCR_REGION:
            message = "AWS Textract remote parser requires access key ID and region to be configured."
            return [Error(message)]

    if selected_engine == "googlecloudvision":
        credentials_path = settings.REMOTE_OCR_CREDENTIALS_FILE
        # The file must be both configured and present on disk.
        if not credentials_path or not Path(credentials_path).exists():
            message = "Google Cloud Vision remote parser requires a valid credentials file to be configured."
            return [Error(message)]

    return []

View File

@@ -0,0 +1,239 @@
import json
from pathlib import Path
from typing import Optional
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
    """Plain container for the remote OCR engine name and its credentials."""

    def __init__(
        self,
        engine: str,
        api_key: Optional[str] = None,
        endpoint: Optional[str] = None,
        api_key_id: Optional[str] = None,
        region: Optional[str] = None,
        credentials_file: Optional[str] = None,
    ):
        # Engine selector, followed by the per-engine connection details.
        self.engine = engine
        self.credentials_file = credentials_file
        self.region = region
        self.api_key_id = api_key_id
        self.endpoint = endpoint
        self.api_key = api_key

    def engine_is_valid(self):
        """
        Return True when the engine is known and its required values are set.

        - "googlecloudvision" only needs a credentials file.
        - "azureaivision" needs an API key and an endpoint.
        - "awstextract" needs an API key, a key ID and a region.
        - Any other engine name is invalid.
        """
        if self.engine == "googlecloudvision":
            # The API key is deliberately not required for this engine.
            return self.credentials_file is not None

        if self.api_key is None:
            return False

        if self.engine == "azureaivision":
            return self.endpoint is not None

        if self.engine == "awstextract":
            return self.region is not None and self.api_key_id is not None

        return False
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote ocr engine to parse documents.

    The engine is chosen by the REMOTE_OCR_ENGINE setting; recognised values
    are "azureaivision", "awstextract" and "googlecloudvision". The extracted
    text is stored on ``self.text`` by :meth:`parse`.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the engine configuration from the REMOTE_OCR_* Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
            api_key_id=settings.REMOTE_OCR_API_KEY_ID,
            region=settings.REMOTE_OCR_REGION,
            credentials_file=settings.REMOTE_OCR_CREDENTIALS_FILE,
        )

    def supported_mime_types(self):
        """
        Return the MIME types handled for the configured engine.

        An empty list is returned when the engine configuration is not valid,
        so an unconfigured parser never claims any document.
        """
        # NOTE(review): self.settings is not assigned in this class; presumably
        # the base class populates it from get_settings() - confirm.
        if not self.settings.engine_is_valid():
            return []
        if self.settings.engine == "googlecloudvision":
            return [
                "application/pdf",
                "image/tiff",
            ]
        return [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]

    def aws_textract_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Send the file to AWS Textract and return its LINE blocks joined by
        newlines.
        """
        import boto3

        client = boto3.client(
            "textract",
            region_name=self.settings.region,
            aws_access_key_id=self.settings.api_key_id,
            aws_secret_access_key=self.settings.api_key,
        )
        # The raw bytes are passed as-is; no intermediate bytearray copy.
        file_bytes = Path(file).read_bytes()

        self.log.info("Analyzing document with AWS Textract...")
        response = client.analyze_document(
            Document={"Bytes": file_bytes},
            FeatureTypes=["TABLES"],
        )
        lines = [
            block["Text"]
            for block in response["Blocks"]
            if block["BlockType"] == "LINE"
        ]
        return "\n".join(lines)

    def get_bbox_from_polygon(self, polygon) -> str:  # Sequence[Point]
        """
        Return "min_x min_y max_x max_y" for the given points, or "0 0 0 0"
        when the polygon is empty or missing.
        """
        if not polygon:
            return "0 0 0 0"
        x_coordinates = [point.x for point in polygon]
        y_coordinates = [point.y for point in polygon]
        return f"{min(x_coordinates)} {min(y_coordinates)} {max(x_coordinates)} {max(y_coordinates)}"

    def _build_hocr(self, pages) -> str:
        """Render analysed pages as a minimal hOCR document, one span per word."""
        from html import escape

        parts = ["<html><body>"]
        for page_number, page in enumerate(pages, start=1):
            parts.append(
                f'<div class="ocr_page" id="page_{page_number}" '
                f'title="bbox 0 0 {page.width} {page.height}">',
            )
            for idx, word in enumerate(page.words):
                bbox = self.get_bbox_from_polygon(word.polygon)
                # Recognised text may contain "<", ">" or "&"; escape it so the
                # generated markup stays well-formed.
                parts.append(
                    f'<span class="ocr_word" id="word_{page_number}_{idx}" '
                    f'title="bbox {bbox}">{escape(str(word.content))}</span>',
                )
            parts.append("</div>")
        parts.append("</body></html>")
        return "".join(parts)

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> Optional[str]:
        """
        Analyse the file with the Azure "prebuilt-layout" model and return the
        recognised text content.

        An hOCR rendering of the word boxes is built as well, but it is
        currently only written to the debug log.
        """
        from azure.ai.formrecognizer import DocumentAnalysisClient
        from azure.core.credentials import AzureKeyCredential

        credential = AzureKeyCredential(self.settings.api_key)
        document_analysis_client = DocumentAnalysisClient(
            endpoint=self.settings.endpoint,
            credential=credential,
        )

        with open(file, "rb") as f:
            self.log.info("Analyzing document with Azure Vision AI...")
            poller = document_analysis_client.begin_analyze_document(
                "prebuilt-layout",
                document=f,
            )
            result = poller.result()

        hocr = self._build_hocr(result.pages)
        # Debug level only: this dumps the entire document into the log.
        self.log.debug(f"HOCR output: {hocr}")

        return result.content

    def google_cloud_vision_parse(
        self,
        file: Path,
        mime_type: str,
    ) -> Optional[str]:
        """
        Upload the file to Google Cloud Storage, run asynchronous document
        text detection on it and return the concatenated text of all pages.

        Returns an empty string when the operation produced no JSON output.
        """
        # https://cloud.google.com/vision/docs/pdf
        from django.utils import timezone
        from google.cloud import storage
        from google.cloud import vision
        from google.oauth2 import service_account

        credentials = service_account.Credentials.from_service_account_file(
            self.settings.credentials_file,
        )

        client = vision.ImageAnnotatorClient(credentials=credentials)
        storage_client = storage.Client(credentials=credentials)

        self.log.info("Uploading document to Google Cloud Storage...")
        bucket_name = f"pngx_{credentials.project_id}_ocrstorage"
        bucket = storage_client.lookup_bucket(bucket_name)
        if bucket is None:
            bucket = storage_client.create_bucket(bucket_name)

        # One storage "folder" per run; kept as a string so the same value is
        # used for the object names and for the listing below.
        prefix = str(timezone.now().timestamp())
        source_blob_name = f"{prefix}/{file.name}"
        bucket.blob(source_blob_name).upload_from_filename(str(file))

        gcs_source_uri = f"gs://{bucket_name}/{source_blob_name}"
        gcs_destination_uri = f"{gcs_source_uri}.json"

        gcs_source = vision.GcsSource(uri=gcs_source_uri)
        input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

        gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
        output_config = vision.OutputConfig(
            gcs_destination=gcs_destination,
        )

        self.log.info("Analyzing document with Google Cloud Vision...")
        feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)
        async_request = vision.AsyncAnnotateFileRequest(
            features=[feature],
            input_config=input_config,
            output_config=output_config,
        )

        operation = client.async_batch_annotate_files(requests=[async_request])

        self.log.info("Waiting for Google cloud operation to complete...")
        operation.result(timeout=180)

        # Select the result files by suffix instead of by list position, and
        # read all of them: the output may be split over several JSON files.
        # NOTE(review): outputs are concatenated in listing order; confirm this
        # matches page order for documents that span many output files.
        output_blobs = [
            candidate
            for candidate in bucket.list_blobs(prefix=prefix)
            if candidate.name.endswith(".json")
        ]
        if not output_blobs:
            self.log.warning(
                "Google Cloud Vision produced no output, content will be empty.",
            )
            return ""

        text_parts = []
        for output_blob in output_blobs:
            payload = json.loads(output_blob.download_as_bytes().decode("utf-8"))
            for page_response in payload.get("responses", []):
                # A response without detected text has no fullTextAnnotation.
                annotation = page_response.get("fullTextAnnotation") or {}
                text_parts.append(annotation.get("text", ""))

        return "".join(text_parts)

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Run the configured remote engine on the document and store the
        extracted text in ``self.text``.

        With an invalid configuration a warning is logged and the text is set
        to an empty string.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        engine = self.settings.engine
        if engine == "azureaivision":
            self.text = self.azure_ai_vision_parse(document_path)
        elif engine == "awstextract":
            self.text = self.aws_textract_parse(document_path)
        elif engine == "googlecloudvision":
            self.text = self.google_cloud_vision_parse(document_path, mime_type)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
    """Create a RemoteDocumentParser, forwarding all arguments unchanged."""
    # The parser module is imported on demand rather than at module import.
    from paperless_remote.parsers import RemoteDocumentParser as parser_class

    parser = parser_class(*args, **kwargs)
    return parser
def get_supported_mime_types():
    """Return the MIME types supported by a parser constructed with ``None``."""
    from paperless_remote.parsers import RemoteDocumentParser as parser_class

    # A throwaway parser instance is used only to query the supported types.
    probe = parser_class(None)
    return probe.supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver that describes the remote parser: its factory function,
    its weight and the MIME types it currently supports.
    """
    declaration = {}
    declaration["parser"] = get_parser
    declaration["weight"] = 5
    # Evaluated on every call, so the result follows the current settings.
    declaration["mime_types"] = get_supported_mime_types()
    return declaration

View File

Binary file not shown.

View File

@@ -0,0 +1,53 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
    """Tests for the remote parser system check."""

    def _assert_no_errors(self):
        """Run the check and expect an empty result."""
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 0)

    def _assert_single_error(self, expected_prefix):
        """Run the check and expect one message starting with the prefix."""
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 1)
        self.assertTrue(found[0].msg.startswith(expected_prefix))

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        self._assert_no_errors()

    @override_settings(
        REMOTE_OCR_ENGINE="azureaivision",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        self._assert_single_error(
            "Azure AI Vision remote parser requires endpoint to be configured.",
        )

    @override_settings(
        REMOTE_OCR_ENGINE="awstextract",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_API_KEY_ID=None,
        REMOTE_OCR_REGION=None,
    )
    def test_aws_no_id_or_region(self):
        self._assert_single_error(
            "AWS Textract remote parser requires access key ID and region to be configured.",
        )

    @override_settings(
        REMOTE_OCR_ENGINE="googlecloudvision",
        REMOTE_OCR_CREDENTIALS_FILE=None,
    )
    def test_gcv_no_creds_file(self):
        self._assert_single_error(
            "Google Cloud Vision remote parser requires a valid credentials file to be configured.",
        )

    @override_settings(
        REMOTE_OCR_ENGINE="something",
        REMOTE_OCR_API_KEY="somekey",
    )
    def test_valid_configuration(self):
        self._assert_no_errors()

View File

@@ -0,0 +1,176 @@
import json
import sys
import uuid
from pathlib import Path
from unittest import mock
import pytest
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for RemoteDocumentParser with each cloud client mocked out."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """Assert that all strings appear in content, in the given order."""
        indices = []
        for s in strings:
            if s not in content:
                self.fail(f"'{s}' is not in '{content}'")
            indices.append(content.index(s))
        self.assertListEqual(indices, sorted(indices))

    # Compare with ">= (3, 11)": "> (3, 10)" is also true for every 3.10.x
    # release, which skipped this test on 3.10 as well.
    @pytest.mark.skipif(
        sys.version_info >= (3, 11),
        reason="Fails on 3.11 only on CI, for some reason",
    )  # TODO: investigate
    @mock.patch("azure.ai.formrecognizer.DocumentAnalysisClient")
    def test_get_text_with_azure(self, mock_azure_client):
        """The Azure engine returns the analysed document's content."""
        # One single-point polygon per word, stepping 10 units diagonally.
        words = [
            mock.Mock(content=text, polygon=[mock.Mock(x=offset, y=offset)])
            for text, offset in (
                ("This", 0),
                ("is", 10),
                ("a", 20),
                ("test", 30),
                ("document.", 40),
            )
        ]
        result = mock.Mock()
        result.content = "This is a test document."
        result.pages = [
            mock.Mock(
                width=100,
                height=100,
                words=words,
            ),
        ]

        mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = (
            result
        )

        with override_settings(
            REMOTE_OCR_ENGINE="azureaivision",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("boto3.client")
    def test_get_text_with_awstextract(self, mock_aws_client):
        """The AWS engine returns the text of the LINE blocks."""
        mock_aws_client.return_value.analyze_document.return_value = {
            "Blocks": [
                {
                    "BlockType": "LINE",
                    "Text": "This is a test document.",
                },
            ],
        }

        with override_settings(
            REMOTE_OCR_ENGINE="awstextract",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_API_KEY_ID="somekeyid",
            REMOTE_OCR_REGION="us-west-2",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("google.cloud.vision.ImageAnnotatorClient")
    @mock.patch("google.cloud.storage.Client")
    @mock.patch("google.oauth2.service_account.Credentials.from_service_account_file")
    def test_get_text_with_googlecloudvision(
        self,
        mock_credentials_from_file,
        mock_gcs_client,
        mock_gcv_client,
    ):
        """The Google engine returns the text found in the JSON output blob."""
        credentials = mock.Mock()
        credentials.project_id = "someproject"
        mock_credentials_from_file.return_value = credentials

        # The bucket listing holds the uploaded source and one JSON result.
        blob_mock0 = mock.Mock()
        blob_mock0.name = "somefile.pdf"
        blob_mock1 = mock.Mock()
        blob_mock1.name = "somefile.json"
        blob_mock1.download_as_bytes.return_value.decode.return_value = json.dumps(
            {
                "responses": [
                    {
                        "fullTextAnnotation": {
                            "text": "This is a test document.",
                        },
                    },
                ],
            },
        )

        mock_gcs_client.return_value.lookup_bucket.return_value.list_blobs.return_value = [
            blob_mock0,
            blob_mock1,
        ]

        result = mock.Mock()
        result.result = mock.Mock()
        mock_gcv_client.return_value.async_batch_annotate_files.return_value = result

        with override_settings(
            REMOTE_OCR_ENGINE="googlecloudvision",
            REMOTE_OCR_CREDENTIALS_FILE="somefile.json",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )