feat: adding google cloud storage uploading

This commit is contained in:
Martin Tan 2023-09-24 01:45:27 +08:00
parent e8a849e6c1
commit 81ce4e4597
4 changed files with 135 additions and 102 deletions

View File

@ -198,6 +198,8 @@ RUN set -eux \
&& echo "Installing psycopg2" \ && echo "Installing psycopg2" \
&& python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \ && python3 -m pip install --no-cache-dir ./psycopg2/${PSYCOPG2_VERSION}/${TARGETARCH}${TARGETVARIANT}/psycopg2*.whl \
&& python3 -m pip list \ && python3 -m pip list \
&& echo "Installing google cloud storage" \
&& python3 -m pip install google-cloud-storage \
&& echo "Cleaning up image layer" \ && echo "Cleaning up image layer" \
&& cd ../ \ && cd ../ \
&& rm -rf paperless-ngx \ && rm -rf paperless-ngx \

205
Pipfile
View File

@ -1,101 +1,104 @@
[[source]] [[source]]
url = "https://pypi.python.org/simple" url = "https://pypi.python.org/simple"
verify_ssl = true verify_ssl = true
name = "pypi" name = "pypi"
[[source]] [[source]]
url = "https://www.piwheels.org/simple" url = "https://www.piwheels.org/simple"
verify_ssl = true verify_ssl = true
name = "piwheels" name = "piwheels"
[packages] [packages]
dateparser = "~=1.1" dateparser = "~=1.1"
django = "~=4.1" django = "~=4.1"
django-cors-headers = "*" django-cors-headers = "*"
django-celery-results = "*" django-celery-results = "*"
django-compression-middleware = "*" django-compression-middleware = "*"
django-guardian = "*" django-guardian = "*"
django-extensions = "*" django-extensions = "*"
django-filter = "~=22.1" django-filter = "~=22.1"
djangorestframework = "~=3.14" djangorestframework = "~=3.14"
djangorestframework-guardian = "*" djangorestframework-guardian = "*"
django-ipware = "*" django-ipware = "*"
filelock = "*" filelock = "*"
gunicorn = "*" gunicorn = "*"
imap-tools = "*" imap-tools = "*"
langdetect = "*" langdetect = "*"
pathvalidate = "*" pathvalidate = "*"
pillow = "~=9.4" pillow = "~=9.4"
pikepdf = "*" pikepdf = "*"
python-gnupg = "*" python-gnupg = "*"
python-dotenv = "*" python-dotenv = "*"
python-dateutil = "*" python-dateutil = "*"
python-magic = "*" python-magic = "*"
psycopg2 = "*" psycopg2 = "*"
rapidfuzz = "*" rapidfuzz = "*"
redis = {extras = ["hiredis"], version = "*"} redis = {extras = ["hiredis"], version = "*"}
scikit-learn = "~=1.2" scikit-learn = "~=1.2"
numpy = "*" numpy = "*"
whitenoise = "~=6.3" whitenoise = "~=6.3"
watchdog = "~=2.2" watchdog = "~=2.2"
whoosh="~=2.7" whoosh="~=2.7"
inotifyrecursive = "~=0.3" inotifyrecursive = "~=0.3"
ocrmypdf = "~=14.0" ocrmypdf = "~=14.0"
tqdm = "*" tqdm = "*"
tika = "*" tika = "*"
# TODO: This will sadly also install daphne+dependencies, # TODO: This will sadly also install daphne+dependencies,
# which is an ASGI server we don't need. Adds about 15MB image size. # which is an ASGI server we don't need. Adds about 15MB image size.
channels = "~=3.0" channels = "~=3.0"
channels-redis = "*" channels-redis = "*"
uvicorn = {extras = ["standard"], version = "*"} uvicorn = {extras = ["standard"], version = "*"}
concurrent-log-handler = "*" concurrent-log-handler = "*"
"pdfminer.six" = "*" "pdfminer.six" = "*"
pyzbar = "*" pyzbar = "*"
mysqlclient = "*" celery = {extras = ["redis"], version = "*"}
celery = {extras = ["redis"], version = "*"} setproctitle = "*"
setproctitle = "*" nltk = "*"
nltk = "*" pdf2image = "*"
pdf2image = "*" flower = "*"
flower = "*" bleach = "*"
bleach = "*" zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"}
zxing-cpp = {version = "*", platform_machine = "== 'x86_64'"} #
# # Packages locked due to issues (try to check if these are fixed in a release every so often)
# Packages locked due to issues (try to check if these are fixed in a release every so often) #
# # Pin this until piwheels is building 1.9 (see https://www.piwheels.org/project/scipy/)
# Pin this until piwheels is building 1.9 (see https://www.piwheels.org/project/scipy/) scipy = "==1.8.1"
scipy = "==1.8.1" google-cloud-storage = "*"
google = "*"
[dev-packages] google-api-core = "*"
coveralls = "*" google-cloud-core = "*"
factory-boy = "*"
pytest = "*" [dev-packages]
pytest-cov = "*" coveralls = "*"
pytest-django = "*" factory-boy = "*"
pytest-env = "*" pytest = "*"
pytest-sugar = "*" pytest-cov = "*"
pytest-xdist = "*" pytest-django = "*"
black = "*" pytest-env = "*"
pre-commit = "*" pytest-sugar = "*"
imagehash = "*" pytest-xdist = "*"
mkdocs-material = "*" black = "*"
ruff = "*" pre-commit = "*"
imagehash = "*"
[typing-dev] mkdocs-material = "*"
mypy = "*" ruff = "*"
types-Pillow = "*"
django-filter-stubs = "*" [typing-dev]
types-python-dateutil = "*" mypy = "*"
djangorestframework-stubs = {extras= ["compatible-mypy"], version="*"} types-Pillow = "*"
celery-types = "*" django-filter-stubs = "*"
django-stubs = {extras= ["compatible-mypy"], version="*"} types-python-dateutil = "*"
types-dateparser = "*" djangorestframework-stubs = {extras= ["compatible-mypy"], version="*"}
types-bleach = "*" celery-types = "*"
types-humanfriendly = "*" django-stubs = {extras= ["compatible-mypy"], version="*"}
types-redis = "*" types-dateparser = "*"
types-tqdm = "*" types-bleach = "*"
types-Markdown = "*" types-humanfriendly = "*"
types-Pygments = "*" types-redis = "*"
types-backports = "*" types-tqdm = "*"
types-colorama = "*" types-Markdown = "*"
types-psycopg2 = "*" types-Pygments = "*"
types-setuptools = "*" types-backports = "*"
types-colorama = "*"
types-psycopg2 = "*"
types-setuptools = "*"

View File

@ -37,6 +37,8 @@ from .parsers import ParseError
from .signals import document_consumption_finished from .signals import document_consumption_finished
from .signals import document_consumption_started from .signals import document_consumption_started
from google.cloud import storage
class ConsumerError(Exception):
    """
    Exception type declared by the consumer module.

    It adds nothing on top of ``Exception``; the places that raise or
    catch it are outside this view.
    """
@ -431,6 +433,16 @@ class Consumer(LoggingMixin):
classifier = load_classifier() classifier = load_classifier()
try:
self.log("debug", "Initializing Google Cloud Storage: " + str(settings.GCP_SERVICE_ACCOUNT_JSON))
# Prepare Google Cloud Storage client
# client = storage.Client()
client = storage.Client.from_service_account_info(settings.GCP_SERVICE_ACCOUNT_JSON)
self.log("debug", "Getting bucket: " + settings.GCP_BUCKET_NAME)
self.bucket = client.bucket(settings.GCP_BUCKET_NAME)
except Exception as e:
self.log("warning", 'Failed to initialize GCP: ' + str(e))
self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT) self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
# now that everything is done, we can start to store the document # now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast. # in the system. This will be a transaction and reasonably fast.
@ -487,7 +499,7 @@ class Consumer(LoggingMixin):
document.save() document.save()
# Delete the file only if it was successfully consumed # Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}") self.log("debug", f"Deleting file 123 {self.path}")
os.unlink(self.path) os.unlink(self.path)
self.original_path.unlink() self.original_path.unlink()
@ -625,6 +637,16 @@ class Consumer(LoggingMixin):
def _write(self, storage_type, source, target):
    """
    Copy ``source`` to ``target`` on the local filesystem and, when
    ``settings.GOOGLE_CLOUD_STORAGE`` is truthy, upload the same bytes to
    the configured Google Cloud Storage bucket.

    :param storage_type: not used by this method; kept for the callers'
        existing signature.
    :param source: path of the file to read.
    :param target: path of the local file to write. Its string form is
        also used as the blob name for the upload.
    :raises OSError: if either local file cannot be opened, read or written.

    Errors raised by the upload itself are not caught here.
    """
    with open(source, "rb") as read_file, open(target, "wb") as write_file:
        write_file.write(read_file.read())

    self.log("debug", "GOOGLE_CLOUD_STORAGE:" + str(settings.GOOGLE_CLOUD_STORAGE))
    if not settings.GOOGLE_CLOUD_STORAGE:
        # Nothing to upload, so do not reopen the source file at all.
        return

    # ``self.bucket`` is assigned during consumption, and that setup only
    # logs a warning when it fails, so the attribute may be missing here.
    # Skip the upload with a warning instead of raising AttributeError.
    bucket = getattr(self, "bucket", None)
    if bucket is None:
        self.log(
            "warning",
            "Google Cloud Storage is enabled but no bucket was initialized; "
            "skipping upload of " + str(target),
        )
        return

    self.log("debug", "Uploading to Google Cloud Storage")
    # Reference: https://github.com/GoogleCloudPlatform/getting-started-python/blob/main/bookshelf/storage.py#L59
    blob = bucket.blob(str(target))
    # Reference: https://cloud.google.com/python/docs/reference/storage/latest/google.cloud.storage.blob.Blob#google_cloud_storage_blob_Blob_upload_from_file
    with open(source, "rb") as upload_file:
        blob.upload_from_file(upload_file)
def _log_script_outputs(self, completed_process: CompletedProcess): def _log_script_outputs(self, completed_process: CompletedProcess):
""" """

View File

@ -217,6 +217,12 @@ def _parse_beat_schedule() -> Dict:
# NEVER RUN WITH DEBUG IN PRODUCTION. # NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO") DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
# Google Cloud Storage
# GOOGLE_CLOUD_STORAGE toggles the upload; it defaults to "NO".
GOOGLE_CLOUD_STORAGE = __get_boolean("GOOGLE_CLOUD_STORAGE", "NO")
# Name of the target bucket.
GCP_BUCKET_NAME = os.environ.get("GCP_BUCKET_NAME", "dms_files_local")
# Service account credentials, supplied as a JSON document in the
# environment. The setting stays an empty string when the variable is unset
# or empty; otherwise it holds the parsed JSON value.
GCP_SERVICE_ACCOUNT_JSON = os.environ.get("GCP_SERVICE_ACCOUNT_JSON", "")
if GCP_SERVICE_ACCOUNT_JSON:
    GCP_SERVICE_ACCOUNT_JSON = json.loads(GCP_SERVICE_ACCOUNT_JSON)
############################################################################### ###############################################################################
# Directories # # Directories #