From 892b033cdd2ed3ed61b94d83444d40e10d9f6c15 Mon Sep 17 00:00:00 2001
From: Martin Tan
Date: Mon, 25 Sep 2023 22:54:54 +0800
Subject: [PATCH] feat: add get file from gcs

gcs - google cloud storage

Document files are uploaded to and read back from a Google Cloud Storage
bucket when settings.GOOGLE_CLOUD_STORAGE is enabled. The service account
credentials are never written to logs or stdout.
---
 .../document-detail.component.ts              |  2 +-
 src/documents/consumer.py                     | 28 +++-------
 src/documents/models.py                       | 11 +++++
 src/google_cloud_storage/storage.py           | 56 +++++++++++++++++++
 4 files changed, 74 insertions(+), 23 deletions(-)
 create mode 100644 src/google_cloud_storage/storage.py

diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts
index 34e159a1a..7c14ec428 100644
--- a/src-ui/src/app/components/document-detail/document-detail.component.ts
+++ b/src-ui/src/app/components/document-detail/document-detail.component.ts
@@ -172,7 +172,7 @@ export class DocumentDetailComponent
   }
 
   get folderPath(): string {
-    return this.storagePaths.find(s => s.id === this.document.storage_path)?.path ?? '';
+    return this.storagePaths?.find(s => s.id === this.document?.storage_path)?.path ?? '';
   }
 
   getContentType() {
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index d111a0a0d..c7d1a1c39 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -37,7 +37,7 @@ from .parsers import ParseError
 from .signals import document_consumption_finished
 from .signals import document_consumption_started
 
-from google.cloud import storage
+from google_cloud_storage.storage import upload_file, is_gcs_enabled
 
 
 class ConsumerError(Exception):
@@ -433,16 +433,6 @@ class Consumer(LoggingMixin):
 
         classifier = load_classifier()
 
-        try:
-            self.log("debug", "Initializing Google Cloud Storage: " + str(settings.GCP_SERVICE_ACCOUNT_JSON))
-            # Prepare Google Cloud Storage client
-            # client = storage.Client()
-            client = storage.Client.from_service_account_info(settings.GCP_SERVICE_ACCOUNT_JSON)
-            self.log("debug", "Getting bucket: " + settings.GCP_BUCKET_NAME)
-            self.bucket = client.bucket(settings.GCP_BUCKET_NAME)
-        except Exception as e:
-            self.log("warning", 'Failed to initialize GCP: ' + str(e))
-
         self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
         # now that everything is done, we can start to store the document
         # in the system. This will be a transaction and reasonably fast.
@@ -635,18 +625,12 @@ class Consumer(LoggingMixin):
 
 
     def _write(self, storage_type, source, target):
-        with open(source, "rb") as read_file, open(target, "wb") as write_file:
-            write_file.write(read_file.read())
+        if is_gcs_enabled:
+            upload_file(source, target)
+        else:
+            with open(source, "rb") as read_file, open(target, "wb") as write_file:
+                write_file.write(read_file.read())
 
-        with open(source, "rb") as read_file_2:
-            self.log("debug", "GOOGLE_CLOUD_STORAGE:" + str(settings.GOOGLE_CLOUD_STORAGE))
-            # Reference: https://github.com/GoogleCloudPlatform/getting-started-python/blob/main/bookshelf/storage.py#L59
-            if settings.GOOGLE_CLOUD_STORAGE:
-                self.log("debug", "Uploading to Google Cloud Storage")
-                # GCP was initialized earlier
-                blob = self.bucket.blob(str(target))
-                # Reference: https://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#google_cloud_storage_blob_Blob_upload_from_file
-                blob.upload_from_file(read_file_2)
 
     def _log_script_outputs(self, completed_process: CompletedProcess):
         """
diff --git a/src/documents/models.py b/src/documents/models.py
index b6525618f..59301dd7e 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -19,6 +19,8 @@ from django.utils import timezone
 from django.utils.translation import gettext_lazy as _
 from documents.parsers import get_default_file_extension
 
+from google_cloud_storage.storage import is_gcs_enabled, get_file_from_gcs
+
 
 ALL_STATES = sorted(states.ALL_STATES)
 TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES))
@@ -312,6 +314,9 @@ class Document(ModelWithOwner):
 
     @property
     def source_file(self):
+        # print(f"is_gcs_enabled: {is_gcs_enabled}")
+        if is_gcs_enabled:
+            return get_file_from_gcs(self.source_path)
         return open(self.source_path, "rb")
 
     @property
@@ -327,6 +332,9 @@ class Document(ModelWithOwner):
 
     @property
     def archive_file(self):
+        # print(f"is_gcs_enabled: {is_gcs_enabled}")
+        if is_gcs_enabled:
+            return get_file_from_gcs(self.archive_path)
         return open(self.archive_path, "rb")
 
     def get_public_filename(self, archive=False, counter=0, suffix=None) -> str:
@@ -364,6 +372,9 @@ class Document(ModelWithOwner):
 
     @property
     def thumbnail_file(self):
+        # print(f"is_gcs_enabled: {is_gcs_enabled}")
+        if is_gcs_enabled:
+            return get_file_from_gcs(self.thumbnail_path)
         return open(self.thumbnail_path, "rb")
 
     @property
diff --git a/src/google_cloud_storage/storage.py b/src/google_cloud_storage/storage.py
new file mode 100644
index 000000000..af1db1267
--- /dev/null
+++ b/src/google_cloud_storage/storage.py
@@ -0,0 +1,56 @@
+"""Google Cloud Storage helpers used to store and fetch document files."""
+import logging
+from io import BytesIO
+
+from django.conf import settings
+from google.cloud import storage
+
+logger = logging.getLogger(__name__)
+
+is_gcs_enabled = settings.GOOGLE_CLOUD_STORAGE
+client = None
+bucket = None
+
+if is_gcs_enabled:
+    # Never log GCP_SERVICE_ACCOUNT_JSON itself: it contains the private key.
+    logger.debug("Initializing Google Cloud Storage client")
+    client = storage.Client.from_service_account_info(
+        settings.GCP_SERVICE_ACCOUNT_JSON,
+    )
+    logger.debug("Getting bucket: %s", settings.GCP_BUCKET_NAME)
+    bucket = client.bucket(settings.GCP_BUCKET_NAME)
+
+
+def upload_file(source, target):
+    """Upload the local file at ``source`` to the bucket as blob ``target``.
+
+    Does nothing (apart from logging a warning) when Google Cloud Storage
+    has not been initialized.
+    """
+    if (not client) or (not bucket):
+        logger.warning("Google Cloud Storage is not initialized; skipping upload")
+        return
+
+    logger.debug("Uploading to Google Cloud Storage: %s", target)
+    blob = bucket.blob(str(target))
+    with open(source, "rb") as source_file:
+        blob.upload_from_file(source_file)
+
+
+def get_file_from_gcs(bucket_path):
+    """Download blob ``bucket_path`` and return it as an in-memory stream.
+
+    The returned ``BytesIO`` is positioned at offset 0 so callers can read
+    it like a file opened with ``open(path, "rb")``.
+
+    Raises:
+        Exception: if Google Cloud Storage is not initialized.
+    """
+    if (not client) or (not bucket):
+        raise Exception("Google Cloud Storage is not initialized.")
+
+    byte_stream = BytesIO()
+    bucket.blob(str(bucket_path)).download_to_file(byte_stream)
+    # Rewind so the caller reads from the beginning of the downloaded data.
+    byte_stream.seek(0)
+    return byte_stream