Playing around with the Redis caching

Trenton H 2024-01-16 12:24:25 -08:00
parent 45e2b7f814
commit f90248de09
6 changed files with 93 additions and 32 deletions

src/documents/caching.py (new file, 11 additions)

@@ -0,0 +1,11 @@
+from typing import Final
+
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
+CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
+
+CACHE_1_MINUTE: Final[int] = 60
+CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
+
+DOC_SUGGESTIONS_BASE: Final[str] = "doc_{}_suggest"
+DOC_METADATA_BASE: Final[str] = "doc_{}_metadata"
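
The new module only centralizes cache key names and TTL constants; nothing here talks to Redis yet. As a minimal sketch (not part of the commit), this is how a per-document key is meant to be built and used with Django's low-level cache API; the document id 123 is made up.

# Sketch only: combining the shared constants with Django's cache API.
from django.core.cache import cache

from documents.caching import CACHE_5_MINUTES
from documents.caching import DOC_METADATA_BASE

doc_key = DOC_METADATA_BASE.format(123)  # -> "doc_123_metadata"
cache.set(doc_key, {"original_checksum": "abc123"}, CACHE_5_MINUTES)
assert cache.get(doc_key)["original_checksum"] == "abc123"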

View File

@@ -10,8 +10,13 @@ from pathlib import Path
 from typing import Optional

 from django.conf import settings
+from django.core.cache import cache
 from sklearn.exceptions import InconsistentVersionWarning

+from documents.caching import CACHE_5_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
 from documents.models import Document
 from documents.models import MatchingModel
@@ -322,6 +327,10 @@ class DocumentClassifier:
         self.last_doc_change_time = latest_doc_change
         self.last_auto_type_hash = hasher.digest()

+        cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_5_MINUTES)
+        cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_5_MINUTES)
+        cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_5_MINUTES)
+
         return True

     def preprocess_content(self, content: str) -> str:  # pragma: no cover
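
Training now publishes the classifier's format version, content hash, and last-modified timestamp to the cache, so the HTTP conditional helpers further down can answer without unpickling the model file. As a hedged alternative sketch (not what the commit does), the same three writes can be batched into one round trip with Django's cache.set_many; the hypothetical helper below stands in for the code inside the classifier's train path.

# Sketch only: batching the three cache.set calls from the diff into one call.
# `classifier` and `hasher` are assumed to be the objects available in train().
from django.core.cache import cache

from documents.caching import CACHE_5_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY


def publish_classifier_state(classifier, hasher) -> None:
    cache.set_many(
        {
            CLASSIFIER_MODIFIED_KEY: classifier.last_doc_change_time,
            CLASSIFIER_HASH_KEY: hasher.hexdigest(),
            CLASSIFIER_VERSION_KEY: classifier.FORMAT_VERSION,
        },
        CACHE_5_MINUTES,
    )

Because the TTL is only five minutes, this cached state quietly expires between trainings; the conditional helpers below are written to fall back to returning None in that case.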

View File

@@ -1,9 +1,13 @@
-import pickle
 from datetime import datetime
 from typing import Optional

 from django.conf import settings
+from django.core.cache import cache

+from documents.caching import CACHE_5_MINUTES
+from documents.caching import CLASSIFIER_HASH_KEY
+from documents.caching import CLASSIFIER_MODIFIED_KEY
+from documents.caching import CLASSIFIER_VERSION_KEY
 from documents.classifier import DocumentClassifier
 from documents.models import Document
@@ -19,13 +23,18 @@ def suggestions_etag(request, pk: int) -> Optional[str]:
     """
     if not settings.MODEL_FILE.exists():
         return None
-    with open(settings.MODEL_FILE, "rb") as f:
-        schema_version = pickle.load(f)
-        if schema_version != DocumentClassifier.FORMAT_VERSION:
-            return None
-        _ = pickle.load(f)
-        last_auto_type_hash: bytes = pickle.load(f)
-        return f"{last_auto_type_hash}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+    cache_hits = cache.get_many(
+        [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
+    )
+    if (
+        CLASSIFIER_VERSION_KEY in cache_hits
+        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+    ):
+        return None
+    elif CLASSIFIER_HASH_KEY in cache_hits:
+        cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
+        return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
+    return None


 def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
@@ -36,12 +45,18 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
     """
     if not settings.MODEL_FILE.exists():
         return None
-    with open(settings.MODEL_FILE, "rb") as f:
-        schema_version = pickle.load(f)
-        if schema_version != DocumentClassifier.FORMAT_VERSION:
-            return None
-        last_doc_change_time = pickle.load(f)
-        return last_doc_change_time
+    cache_hits = cache.get_many(
+        [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
+    )
+    if (
+        CLASSIFIER_VERSION_KEY in cache_hits
+        and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
+    ):
+        return None
+    elif CLASSIFIER_MODIFIED_KEY in cache_hits:
+        cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
+        return cache_hits[CLASSIFIER_MODIFIED_KEY]
+    return None


 def metadata_etag(request, pk: int) -> Optional[str]:
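
Both helpers are ETag and Last-Modified callbacks: on a version mismatch they return None (no conditional answer possible), and on a hit they refresh the key's TTL and answer from the cache instead of re-reading the pickled model. Callbacks with this (request, pk) signature are the shape Django's condition decorator expects; a minimal sketch, assuming a plain function-based view rather than the project's viewset action:

# Sketch only: wiring ETag/Last-Modified callbacks into Django's conditional-view
# machinery. The view body is invented for illustration.
from django.http import JsonResponse
from django.views.decorators.http import condition

from documents.conditionals import suggestions_etag
from documents.conditionals import suggestions_last_modified


@condition(etag_func=suggestions_etag, last_modified_func=suggestions_last_modified)
def suggestions_view(request, pk: int):
    # Only reached when If-None-Match / If-Modified-Since do not already satisfy
    # the request; otherwise Django answers 304 before the body runs.
    return JsonResponse({"detail": "expensive suggestion computation goes here"})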

View File

@@ -15,6 +15,7 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
+from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -62,6 +63,9 @@ from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
+from documents.caching import CACHE_5_MINUTES
+from documents.caching import DOC_METADATA_BASE
+from documents.caching import DOC_SUGGESTIONS_BASE
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -407,6 +411,14 @@ class DocumentViewSet(
         except Document.DoesNotExist:
             raise Http404

+        doc_key = DOC_METADATA_BASE.format(doc.pk)
+
+        cache_hit = cache.get(doc_key)
+        if cache_hit is not None:
+            cache.touch(doc_key, CACHE_5_MINUTES)
+            return Response(cache_hit)
+
         meta = {
             "original_checksum": doc.checksum,
             "original_size": self.get_filesize(doc.source_path),
@@ -436,6 +448,8 @@ class DocumentViewSet(
             meta["archive_size"] = None
             meta["archive_metadata"] = None

+        cache.set(doc_key, meta, CACHE_5_MINUTES)
+
         return Response(meta)

     @action(methods=["get"], detail=True)
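
The metadata endpoint now follows a cache-aside pattern: look the key up, refresh its TTL and return on a hit, otherwise build the payload and store it for five minutes. A sketch of the same pattern factored into a hypothetical helper (not in the commit); `build` stands for whatever callable produces the response payload.

# Sketch only: the cache-aside pattern used above, as a reusable helper.
from typing import Any
from typing import Callable

from django.core.cache import cache


def get_or_build(key: str, ttl: int, build: Callable[[], Any]) -> Any:
    hit = cache.get(key)
    if hit is not None:
        cache.touch(key, ttl)  # keep hot entries alive a little longer
        return hit
    value = build()
    cache.set(key, value, ttl)
    return value

Django's built-in cache.get_or_set is the closest one-liner, but it does not refresh the TTL on a hit, which is the behaviour the diff wants.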
@@ -454,6 +468,14 @@ class DocumentViewSet(
         ):
             return HttpResponseForbidden("Insufficient permissions")

+        doc_key = DOC_SUGGESTIONS_BASE.format(doc.pk)
+
+        cache_hit = cache.get(doc_key)
+        if cache_hit is not None:
+            cache.touch(doc_key, CACHE_5_MINUTES)
+            return Response(cache_hit)
+
         classifier = load_classifier()

         dates = []
@@ -463,23 +485,23 @@
                 {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
             )

-        return Response(
-            {
-                "correspondents": [
-                    c.id for c in match_correspondents(doc, classifier, request.user)
-                ],
-                "tags": [t.id for t in match_tags(doc, classifier, request.user)],
-                "document_types": [
-                    dt.id for dt in match_document_types(doc, classifier, request.user)
-                ],
-                "storage_paths": [
-                    dt.id for dt in match_storage_paths(doc, classifier, request.user)
-                ],
-                "dates": [
-                    date.strftime("%Y-%m-%d") for date in dates if date is not None
-                ],
-            },
-        )
+        resp_data = {
+            "correspondents": [
+                c.id for c in match_correspondents(doc, classifier, request.user)
+            ],
+            "tags": [t.id for t in match_tags(doc, classifier, request.user)],
+            "document_types": [
+                dt.id for dt in match_document_types(doc, classifier, request.user)
+            ],
+            "storage_paths": [
+                dt.id for dt in match_storage_paths(doc, classifier, request.user)
+            ],
+            "dates": [date.strftime("%Y-%m-%d") for date in dates if date is not None],
+        }
+
+        cache.set(doc_key, resp_data, CACHE_5_MINUTES)
+
+        return Response(resp_data)

     @action(methods=["get"], detail=True)
     @method_decorator(cache_control(public=False, max_age=5 * 60))
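
On a hit both endpoints call cache.touch rather than cache.set: touch only pushes the expiry forward (a PEXPIRE on the Redis backend) without serializing the payload again, so frequently requested documents stay cached while idle ones drop out after five minutes. A small sketch of the semantics, with a made-up document id:

# Sketch only: touch() refreshes a key's expiry without rewriting its value.
from django.core.cache import cache

cache.set("doc_123_suggest", {"tags": [1, 2]}, 10)  # would expire in 10 seconds
# ...later, on a cache hit...
cache.touch("doc_123_suggest", 300)  # now expires in 5 minutes; returns False if already gone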

View File

@@ -762,7 +762,10 @@ CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")
 # django setting.
 CACHES = {
     "default": {
-        "BACKEND": "django.core.cache.backends.redis.RedisCache",
+        "BACKEND": os.environ.get(
+            "PAPERLESS_CACHE_BACKEND",
+            "django.core.cache.backends.redis.RedisCache",
+        ),
         "LOCATION": _CHANNELS_REDIS_URL,
     },
 }
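
The backend is now read from the environment, with the Redis backend as the default, so existing deployments are unchanged while tests (and, presumably, local setups) can swap in another backend; LOCATION stays the shared Redis URL, which non-Redis backends treat as an opaque name or ignore. A sketch of how the new line resolves; the LocMemCache value is just an example of a valid override:

# Sketch only: resolution of the new BACKEND setting.
import os

backend = os.environ.get(
    "PAPERLESS_CACHE_BACKEND",
    "django.core.cache.backends.redis.RedisCache",
)
# Unset       -> Redis, as before.
# Exported as "django.core.cache.backends.locmem.LocMemCache" -> per-process memory cache.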

View File

@@ -3,6 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings
 addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
 env =
     PAPERLESS_DISABLE_DBHANDLER=true
+    PAPERLESS_CACHE_BACKEND=django.core.cache.backends.dummy.DummyCache

 [coverage:run]
 source =
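
Finally, the test environment pins the cache to Django's DummyCache, so the new code paths are exercised but nothing is ever stored: every lookup is a miss and the endpoints always recompute their responses. A sketch of that behaviour, assuming the pytest settings above are active:

# Sketch only: under DummyCache, writes are accepted but never stored.
from django.core.cache import cache

cache.set("doc_123_metadata", {"original_checksum": "abc123"}, 300)
print(cache.get("doc_123_metadata"))         # -> None
print(cache.touch("doc_123_metadata", 300))  # -> False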