diff --git a/src/documents/caching.py b/src/documents/caching.py new file mode 100644 index 000000000..83ec0ba25 --- /dev/null +++ b/src/documents/caching.py @@ -0,0 +1,11 @@ +from typing import Final + +CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version" +CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash" +CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified" + +CACHE_1_MINUTE: Final[int] = 60 +CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE + +DOC_SUGGESTIONS_BASE: Final[str] = "doc_{}_suggest" +DOC_METADATA_BASE: Final[str] = "doc_{}_metadata" diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 5833e373e..298d28349 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -10,8 +10,13 @@ from pathlib import Path from typing import Optional from django.conf import settings +from django.core.cache import cache from sklearn.exceptions import InconsistentVersionWarning +from documents.caching import CACHE_5_MINUTES +from documents.caching import CLASSIFIER_HASH_KEY +from documents.caching import CLASSIFIER_MODIFIED_KEY +from documents.caching import CLASSIFIER_VERSION_KEY from documents.models import Document from documents.models import MatchingModel @@ -322,6 +327,10 @@ class DocumentClassifier: self.last_doc_change_time = latest_doc_change self.last_auto_type_hash = hasher.digest() + cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_5_MINUTES) + cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_5_MINUTES) + cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_5_MINUTES) + return True def preprocess_content(self, content: str) -> str: # pragma: no cover diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py index 07e6850fb..a5cf85c2d 100644 --- a/src/documents/conditionals.py +++ b/src/documents/conditionals.py @@ -1,9 +1,13 @@ -import pickle from datetime import datetime from typing import Optional from django.conf import settings +from django.core.cache import cache +from documents.caching import CACHE_5_MINUTES +from documents.caching import CLASSIFIER_HASH_KEY +from documents.caching import CLASSIFIER_MODIFIED_KEY +from documents.caching import CLASSIFIER_VERSION_KEY from documents.classifier import DocumentClassifier from documents.models import Document @@ -19,13 +23,18 @@ def suggestions_etag(request, pk: int) -> Optional[str]: """ if not settings.MODEL_FILE.exists(): return None - with open(settings.MODEL_FILE, "rb") as f: - schema_version = pickle.load(f) - if schema_version != DocumentClassifier.FORMAT_VERSION: - return None - _ = pickle.load(f) - last_auto_type_hash: bytes = pickle.load(f) - return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}" + cache_hits = cache.get_many( + [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY], + ) + if ( + CLASSIFIER_VERSION_KEY in cache_hits + and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION + ): + return None + elif CLASSIFIER_HASH_KEY in cache_hits: + cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES) + return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}" + return None def suggestions_last_modified(request, pk: int) -> Optional[datetime]: @@ -36,12 +45,18 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]: """ if not settings.MODEL_FILE.exists(): return None - with open(settings.MODEL_FILE, "rb") as f: - schema_version = pickle.load(f) - if schema_version != DocumentClassifier.FORMAT_VERSION: - return None - last_doc_change_time = pickle.load(f) - return last_doc_change_time + cache_hits = cache.get_many( + [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY], + ) + if ( + CLASSIFIER_VERSION_KEY in cache_hits + and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION + ): + return None + elif CLASSIFIER_MODIFIED_KEY in cache_hits: + cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES) + return cache_hits[CLASSIFIER_MODIFIED_KEY] + return None def metadata_etag(request, pk: int) -> Optional[str]: diff --git a/src/documents/views.py b/src/documents/views.py index 11fb5b1f2..6dedf8bf1 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -15,6 +15,7 @@ from urllib.parse import quote import pathvalidate from django.conf import settings from django.contrib.auth.models import User +from django.core.cache import cache from django.db.models import Case from django.db.models import Count from django.db.models import IntegerField @@ -62,6 +63,9 @@ from documents import bulk_edit from documents.bulk_download import ArchiveOnlyStrategy from documents.bulk_download import OriginalAndArchiveStrategy from documents.bulk_download import OriginalsOnlyStrategy +from documents.caching import CACHE_5_MINUTES +from documents.caching import DOC_METADATA_BASE +from documents.caching import DOC_SUGGESTIONS_BASE from documents.classifier import load_classifier from documents.conditionals import metadata_etag from documents.conditionals import metadata_last_modified @@ -407,6 +411,14 @@ class DocumentViewSet( except Document.DoesNotExist: raise Http404 + doc_key = DOC_METADATA_BASE.format(doc.pk) + + cache_hit = cache.get(doc_key) + + if cache_hit is not None: + cache.touch(doc_key, CACHE_5_MINUTES) + return Response(cache_hit) + meta = { "original_checksum": doc.checksum, "original_size": self.get_filesize(doc.source_path), @@ -436,6 +448,8 @@ class DocumentViewSet( meta["archive_size"] = None meta["archive_metadata"] = None + cache.set(doc_key, meta, CACHE_5_MINUTES) + return Response(meta) @action(methods=["get"], detail=True) @@ -454,6 +468,14 @@ class DocumentViewSet( ): return HttpResponseForbidden("Insufficient permissions") + doc_key = DOC_SUGGESTIONS_BASE.format(doc.pk) + + cache_hit = cache.get(doc_key) + + if cache_hit is not None: + cache.touch(doc_key, CACHE_5_MINUTES) + return Response(cache_hit) + classifier = load_classifier() dates = [] @@ -463,23 +485,23 @@ class DocumentViewSet( {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, ) - return Response( - { - "correspondents": [ - c.id for c in match_correspondents(doc, classifier, request.user) - ], - "tags": [t.id for t in match_tags(doc, classifier, request.user)], - "document_types": [ - dt.id for dt in match_document_types(doc, classifier, request.user) - ], - "storage_paths": [ - dt.id for dt in match_storage_paths(doc, classifier, request.user) - ], - "dates": [ - date.strftime("%Y-%m-%d") for date in dates if date is not None - ], - }, - ) + resp_data = { + "correspondents": [ + c.id for c in match_correspondents(doc, classifier, request.user) + ], + "tags": [t.id for t in match_tags(doc, classifier, request.user)], + "document_types": [ + dt.id for dt in match_document_types(doc, classifier, request.user) + ], + "storage_paths": [ + dt.id for dt in match_storage_paths(doc, classifier, request.user) + ], + "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None], + } + + cache.set(doc_key, resp_data, CACHE_5_MINUTES) + + return Response(resp_data) @action(methods=["get"], detail=True) @method_decorator(cache_control(public=False, max_age=5 * 60)) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 17ec2765d..86b2b2524 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -762,7 +762,10 @@ CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db") # django setting. CACHES = { "default": { - "BACKEND": "django.core.cache.backends.redis.RedisCache", + "BACKEND": os.environ.get( + "PAPERLESS_CACHE_BACKEND", + "django.core.cache.backends.redis.RedisCache", + ), "LOCATION": _CHANNELS_REDIS_URL, }, } diff --git a/src/setup.cfg b/src/setup.cfg index dc5e9e33a..8fbf73d66 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -3,6 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 env = PAPERLESS_DISABLE_DBHANDLER=true + PAPERLESS_CACHE_BACKEND=django.core.cache.backends.dummy.DummyCache [coverage:run] source =