From 81ce4e4597930e4cc7e99defb3dfa25e16fb71ac Mon Sep 17 00:00:00 2001 From: Martin Tan Date: Sun, 24 Sep 2023 01:45:27 +0800 Subject: [PATCH] feat: adding google cloud storage uploading --- Dockerfile | 2 + Pipfile | 205 +++++++++++++++++++------------------- src/documents/consumer.py | 24 ++++- src/paperless/settings.py | 6 ++ 4 files changed, 135 insertions(+), 102 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1d3def67a..23d792dd4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -198,6 +198,8 @@ RUN set -eux \ && echo "Installing psycopg2" \ && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \ && python3 -m pip list \ + && echo "Installing google cloud storage" \ + && python3 -m pip install google-cloud-storage \ && echo "Cleaning up image layer" \ && cd ../ \ && rm -rf paperless-ngx \ diff --git a/Pipfile b/Pipfile index 1308dc2a3..9b2e283d4 100644 --- a/Pipfile +++ b/Pipfile @@ -1,101 +1,104 @@ -[[source]] -url = "https://pypi.python.org/simple" -verify_ssl = true -name = "pypi" - -[[source]] -url = "https://www.piwheels.org/simple" -verify_ssl = true -name = "piwheels" - -[packages] -dateparser = "~=1.1" -django = "~=4.1" -django-cors-headers = "*" -django-celery-results = "*" -django-compression-middleware = "*" -django-guardian = "*" -django-extensions = "*" -django-filter = "~=22.1" -djangorestframework = "~=3.14" -djangorestframework-guardian = "*" -django-ipware = "*" -filelock = "*" -gunicorn = "*" -imap-tools = "*" -langdetect = "*" -pathvalidate = "*" -pillow = "~=9.4" -pikepdf = "*" -python-gnupg = "*" -python-dotenv = "*" -python-dateutil = "*" -python-magic = "*" -psycopg2 = "*" -rapidfuzz = "*" -redis = {extras = ["hiredis"], version = "*"} -scikit-learn = "~=1.2" -numpy = "*" -whitenoise = "~=6.3" -watchdog = "~=2.2" -whoosh="~=2.7" -inotifyrecursive = "~=0.3" -ocrmypdf = "~=14.0" -tqdm = "*" -tika = "*" -# TODO: This will sadly also install daphne+dependencies, -# which an ASGI server we don't need. Adds about 15MB image size. -channels = "~=3.0" -channels-redis = "*" -uvicorn = {extras = ["standard"], version = "*"} -concurrent-log-handler = "*" -"pdfminer.six" = "*" -pyzbar = "*" -mysqlclient = "*" -celery = {extras = ["redis"], version = "*"} -setproctitle = "*" -nltk = "*" -pdf2image = "*" -flower = "*" -bleach = "*" -zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} -# -# Packages locked due to issues (try to check if these are fixed in a release every so often) -# -# Pin this until piwheels is building 1.9 (see https://www.piwheels.org/project/scipy/) -scipy = "==1.8.1" - -[dev-packages] -coveralls = "*" -factory-boy = "*" -pytest = "*" -pytest-cov = "*" -pytest-django = "*" -pytest-env = "*" -pytest-sugar = "*" -pytest-xdist = "*" -black = "*" -pre-commit = "*" -imagehash = "*" -mkdocs-material = "*" -ruff = "*" - -[typing-dev] -mypy = "*" -types-Pillow = "*" -django-filter-stubs = "*" -types-python-dateutil = "*" -djangorestframework-stubs = {extras= ["compatible-mypy"], version="*"} -celery-types = "*" -django-stubs = {extras= ["compatible-mypy"], version="*"} -types-dateparser = "*" -types-bleach = "*" -types-humanfriendly = "*" -types-redis = "*" -types-tqdm = "*" -types-Markdown = "*" -types-Pygments = "*" -types-backports = "*" -types-colorama = "*" -types-psycopg2 = "*" -types-setuptools = "*" +[[source]] +url = "https://pypi.python.org/simple" +verify_ssl = true +name = "pypi" + +[[source]] +url = "https://www.piwheels.org/simple" +verify_ssl = true +name = "piwheels" + +[packages] +dateparser = "~=1.1" +django = "~=4.1" +django-cors-headers = "*" +django-celery-results = "*" +django-compression-middleware = "*" +django-guardian = "*" +django-extensions = "*" +django-filter = "~=22.1" +djangorestframework = "~=3.14" +djangorestframework-guardian = "*" +django-ipware = "*" +filelock = "*" +gunicorn = "*" +imap-tools = "*" +langdetect = "*" +pathvalidate = "*" +pillow = "~=9.4" +pikepdf = "*" +python-gnupg = "*" +python-dotenv = "*" +python-dateutil = "*" +python-magic = "*" +psycopg2 = "*" +rapidfuzz = "*" +redis = {extras = ["hiredis"], version = "*"} +scikit-learn = "~=1.2" +numpy = "*" +whitenoise = "~=6.3" +watchdog = "~=2.2" +whoosh="~=2.7" +inotifyrecursive = "~=0.3" +ocrmypdf = "~=14.0" +tqdm = "*" +tika = "*" +# TODO: This will sadly also install daphne+dependencies, +# which an ASGI server we don't need. Adds about 15MB image size. +channels = "~=3.0" +channels-redis = "*" +uvicorn = {extras = ["standard"], version = "*"} +concurrent-log-handler = "*" +"pdfminer.six" = "*" +pyzbar = "*" +celery = {extras = ["redis"], version = "*"} +setproctitle = "*" +nltk = "*" +pdf2image = "*" +flower = "*" +bleach = "*" +zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} +# +# Packages locked due to issues (try to check if these are fixed in a release every so often) +# +# Pin this until piwheels is building 1.9 (see https://www.piwheels.org/project/scipy/) +scipy = "==1.8.1" +google-cloud-storage = "*" +google = "*" +google-api-core = "*" +google-cloud-core = "*" + +[dev-packages] +coveralls = "*" +factory-boy = "*" +pytest = "*" +pytest-cov = "*" +pytest-django = "*" +pytest-env = "*" +pytest-sugar = "*" +pytest-xdist = "*" +black = "*" +pre-commit = "*" +imagehash = "*" +mkdocs-material = "*" +ruff = "*" + +[typing-dev] +mypy = "*" +types-Pillow = "*" +django-filter-stubs = "*" +types-python-dateutil = "*" +djangorestframework-stubs = {extras= ["compatible-mypy"], version="*"} +celery-types = "*" +django-stubs = {extras= ["compatible-mypy"], version="*"} +types-dateparser = "*" +types-bleach = "*" +types-humanfriendly = "*" +types-redis = "*" +types-tqdm = "*" +types-Markdown = "*" +types-Pygments = "*" +types-backports = "*" +types-colorama = "*" +types-psycopg2 = "*" +types-setuptools = "*" diff --git a/src/documents/consumer.py b/src/documents/consumer.py index ab4c5b958..d111a0a0d 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -37,6 +37,8 @@ from .parsers import ParseError from .signals import document_consumption_finished from .signals import document_consumption_started +from google.cloud import storage + class ConsumerError(Exception): pass @@ -431,6 +433,16 @@ class Consumer(LoggingMixin): classifier = load_classifier() + try: + self.log("debug", "Initializing Google Cloud Storage: " + str(settings.GCP_SERVICE_ACCOUNT_JSON)) + # Prepare Google Cloud Storage client + # client = storage.Client() + client = storage.Client.from_service_account_info(settings.GCP_SERVICE_ACCOUNT_JSON) + self.log("debug", "Getting bucket: " + settings.GCP_BUCKET_NAME) + self.bucket = client.bucket(settings.GCP_BUCKET_NAME) + except Exception as e: + self.log("warning", 'Failed to initialize GCP: ' + str(e)) + self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT) # now that everything is done, we can start to store the document # in the system. This will be a transaction and reasonably fast. @@ -487,7 +499,7 @@ class Consumer(LoggingMixin): document.save() # Delete the file only if it was successfully consumed - self.log("debug", f"Deleting file {self.path}") + self.log("debug", f"Deleting file 123 {self.path}") os.unlink(self.path) self.original_path.unlink() @@ -625,6 +637,16 @@ class Consumer(LoggingMixin): def _write(self, storage_type, source, target): with open(source, "rb") as read_file, open(target, "wb") as write_file: write_file.write(read_file.read()) + + with open(source, "rb") as read_file_2: + self.log("debug", "GOOGLE_CLOUD_STORAGE:" + str(settings.GOOGLE_CLOUD_STORAGE)) + # Reference: https://github.com/GoogleCloudPlatform/getting-started-python/blob/main/bookshelf/storage.py#L59 + if settings.GOOGLE_CLOUD_STORAGE: + self.log("debug", "Uploading to Google Cloud Storage") + # GCP was initialized earlier + blob = self.bucket.blob(str(target)) + # Reference: https://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#google_cloud_storage_blob_Blob_upload_from_file + blob.upload_from_file(read_file_2) def _log_script_outputs(self, completed_process: CompletedProcess): """ diff --git a/src/paperless/settings.py b/src/paperless/settings.py index b8abea5ff..a6947d13d 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -217,6 +217,12 @@ def _parse_beat_schedule() -> Dict: # NEVER RUN WITH DEBUG IN PRODUCTION. DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") +# Google Cloud Storage +GOOGLE_CLOUD_STORAGE = __get_boolean("GOOGLE_CLOUD_STORAGE", "NO") +GCP_BUCKET_NAME = os.getenv('GCP_BUCKET_NAME', 'dms_files_local') +GCP_SERVICE_ACCOUNT_JSON = os.getenv('GCP_SERVICE_ACCOUNT_JSON', '') +if GCP_SERVICE_ACCOUNT_JSON != '': + GCP_SERVICE_ACCOUNT_JSON = json.loads(GCP_SERVICE_ACCOUNT_JSON) ############################################################################### # Directories #