Use Redis for more caching

This commit is contained in:
Trenton H 2024-01-18 15:21:07 -08:00
parent f90248de09
commit 8a4059b9bf
6 changed files with 114 additions and 38 deletions

View File

@ -6,6 +6,9 @@ CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified"
CACHE_1_MINUTE: Final[int] = 60 CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
DOC_SUGGESTIONS_BASE: Final[str] = "doc_{}_suggest" DOC_SUGGESTIONS_BASE: Final[str] = "doc_{}_suggest"
DOC_METADATA_BASE: Final[str] = "doc_{}_metadata" DOC_METADATA_BASE: Final[str] = "doc_{}_metadata"
DOC_THUMBNAIL_ETAG_BASE: Final[str] = "doc_{}_thumbnail_etag"
DOC_THUMBNAIL_MODIFIED_BASE: Final[str] = "doc_{}_thumbnail_modified"

View File

@ -13,7 +13,7 @@ from django.conf import settings
from django.core.cache import cache from django.core.cache import cache
from sklearn.exceptions import InconsistentVersionWarning from sklearn.exceptions import InconsistentVersionWarning
from documents.caching import CACHE_5_MINUTES from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY from documents.caching import CLASSIFIER_VERSION_KEY
@ -327,9 +327,11 @@ class DocumentClassifier:
self.last_doc_change_time = latest_doc_change self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest() self.last_auto_type_hash = hasher.digest()
cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_5_MINUTES) # Set the classifier information into the cache
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_5_MINUTES) # Caching for 50 minutes, so slightly less than the normal retrain time
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_5_MINUTES) cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES)
cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES)
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES)
return True return True

View File

@ -1,13 +1,18 @@
from datetime import datetime from datetime import datetime
from datetime import timezone
from hashlib import sha256
from typing import Optional from typing import Optional
from django.conf import settings from django.conf import settings
from django.core.cache import cache from django.core.cache import cache
from documents.caching import CACHE_5_MINUTES from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import DOC_THUMBNAIL_ETAG_BASE
from documents.caching import DOC_THUMBNAIL_MODIFIED_BASE
from documents.classifier import DocumentClassifier from documents.classifier import DocumentClassifier
from documents.models import Document from documents.models import Document
@ -18,20 +23,22 @@ def suggestions_etag(request, pk: int) -> Optional[str]:
suggestions if the classifier has not been changed and the suggested dates suggestions if the classifier has not been changed and the suggested dates
setting is also unchanged setting is also unchanged
TODO: It would be nice to not duplicate the partial loading and the loading
between here and the actual classifier
""" """
# If no model file, no etag at all
if not settings.MODEL_FILE.exists(): if not settings.MODEL_FILE.exists():
return None return None
# Check cache information
cache_hits = cache.get_many( cache_hits = cache.get_many(
[CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY], [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY],
) )
# If the version differs somehow, no etag
if ( if (
CLASSIFIER_VERSION_KEY in cache_hits CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
): ):
return None return None
elif CLASSIFIER_HASH_KEY in cache_hits: elif CLASSIFIER_HASH_KEY in cache_hits:
# Refresh the cache and return the hash digest and the dates setting
cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES) cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES)
return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}" return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}"
return None return None
@ -43,17 +50,20 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
as there is no way to track the suggested date setting modification, but it seems as there is no way to track the suggested date setting modification, but it seems
unlikely that changes too often unlikely that changes too often
""" """
# No file, no last modified
if not settings.MODEL_FILE.exists(): if not settings.MODEL_FILE.exists():
return None return None
cache_hits = cache.get_many( cache_hits = cache.get_many(
[CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY], [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY],
) )
# If the version differs somehow, no last modified
if ( if (
CLASSIFIER_VERSION_KEY in cache_hits CLASSIFIER_VERSION_KEY in cache_hits
and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION
): ):
return None return None
elif CLASSIFIER_MODIFIED_KEY in cache_hits: elif CLASSIFIER_MODIFIED_KEY in cache_hits:
# Refresh the cache and return the last modified
cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES) cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES)
return cache_hits[CLASSIFIER_MODIFIED_KEY] return cache_hits[CLASSIFIER_MODIFIED_KEY]
return None return None
@ -67,7 +77,7 @@ def metadata_etag(request, pk: int) -> Optional[str]:
try: try:
doc = Document.objects.get(pk=pk) doc = Document.objects.get(pk=pk)
return doc.checksum return doc.checksum
except Document.DoesNotExist: except Document.DoesNotExist: # pragma: no cover
return None return None
return None return None
@ -81,7 +91,7 @@ def metadata_last_modified(request, pk: int) -> Optional[datetime]:
try: try:
doc = Document.objects.get(pk=pk) doc = Document.objects.get(pk=pk)
return doc.modified return doc.modified
except Document.DoesNotExist: except Document.DoesNotExist: # pragma: no cover
return None return None
return None return None
@ -97,6 +107,58 @@ def preview_etag(request, pk: int) -> Optional[str]:
and request.query_params["original"] == "true" and request.query_params["original"] == "true"
) )
return doc.checksum if use_original else doc.archive_checksum return doc.checksum if use_original else doc.archive_checksum
except Document.DoesNotExist: except Document.DoesNotExist: # pragma: no cover
return None return None
return None return None
def preview_last_modified(request, pk: int) -> Optional[datetime]:
    """
    Return the last modified time of a document for the preview endpoint,
    used by Django's condition decorator to set the Last-Modified header.

    Returns None if the document does not exist.
    """
    try:
        doc = Document.objects.get(pk=pk)
        # The preview is derived from the document itself, so the document's
        # own modified timestamp is the correct Last-Modified value.
        # Fixed annotation: this returns a datetime, not a str.
        return doc.modified
    except Document.DoesNotExist:  # pragma: no cover
        return None
def thumbnail_etag(request, pk: int) -> Optional[str]:
    """
    Returns the SHA256 of a thumbnail, either from cache or calculated.

    Returns None if the document does not exist or has no thumbnail file
    on disk.
    """
    try:
        doc = Document.objects.get(pk=pk)
        # No thumbnail file on disk -> no ETag to offer
        if not doc.thumbnail_path.exists():
            return None
        doc_key = DOC_THUMBNAIL_ETAG_BASE.format(pk)
        cache_hit = cache.get(doc_key)
        if cache_hit is not None:
            return cache_hit
        # Cache miss: hash the thumbnail bytes and cache the digest for
        # slightly less than the thumb endpoint's cache-control max-age
        hasher = sha256()
        hasher.update(doc.thumbnail_path.read_bytes())
        thumb_checksum = hasher.hexdigest()
        cache.set(doc_key, thumb_checksum, CACHE_50_MINUTES)
        # Bug fix: the freshly computed checksum was never returned, so the
        # ETag was silently absent on every cache miss
        return thumb_checksum
    except Document.DoesNotExist:  # pragma: no cover
        return None
def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
    """
    Returns the filesystem last modified time of the thumbnail, either from
    cache or read from the filesystem.

    Returns None if the document does not exist or has no thumbnail file
    on disk.
    """
    try:
        doc = Document.objects.get(pk=pk)
        # No thumbnail file on disk -> no Last-Modified to offer
        if not doc.thumbnail_path.exists():
            return None
        doc_key = DOC_THUMBNAIL_MODIFIED_BASE.format(pk)
        cache_hit = cache.get(doc_key)
        if cache_hit is not None:
            return cache_hit
        # Cache miss: read the mtime from the filesystem as a timezone-aware
        # datetime (UTC) and cache it, consistent with thumbnail_etag
        last_modified = datetime.fromtimestamp(
            doc.thumbnail_path.stat().st_mtime,
            tz=timezone.utc,
        )
        cache.set(doc_key, last_modified, CACHE_50_MINUTES)
        # Fixed annotation: this returns a datetime, not an int
        return last_modified
    except Document.DoesNotExist:  # pragma: no cover
        return None

View File

@ -13,12 +13,17 @@ from dateutil import parser
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import Permission from django.contrib.auth.models import Permission
from django.contrib.auth.models import User from django.contrib.auth.models import User
from django.core.cache import cache
from django.test import override_settings from django.test import override_settings
from django.utils import timezone from django.utils import timezone
from guardian.shortcuts import assign_perm from guardian.shortcuts import assign_perm
from rest_framework import status from rest_framework import status
from rest_framework.test import APITestCase from rest_framework.test import APITestCase
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.models import Correspondent from documents.models import Correspondent
from documents.models import CustomField from documents.models import CustomField
from documents.models import CustomFieldInstance from documents.models import CustomFieldInstance
@ -40,6 +45,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
self.user = User.objects.create_superuser(username="temp_admin") self.user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=self.user) self.client.force_authenticate(user=self.user)
cache.clear()
def testDocuments(self): def testDocuments(self):
response = self.client.get("/api/documents/").data response = self.client.get("/api/documents/").data
@ -1266,7 +1272,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
}, },
) )
@mock.patch("documents.conditionals.pickle.load") @mock.patch("documents.views.load_classifier")
@mock.patch("documents.views.match_storage_paths") @mock.patch("documents.views.match_storage_paths")
@mock.patch("documents.views.match_document_types") @mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags") @mock.patch("documents.views.match_tags")
@ -1278,7 +1284,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
match_tags, match_tags,
match_document_types, match_document_types,
match_storage_paths, match_storage_paths,
mocked_pickle_load, mocked_load,
): ):
""" """
GIVEN: GIVEN:
@ -1287,23 +1293,28 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
- Classifier has not been modified - Classifier has not been modified
THEN: THEN:
- Subsequent requests are returned alright - Subsequent requests are returned alright
- ETag and last modified are called - ETag and last modified headers are set
""" """
settings.MODEL_FILE.touch()
# setup the cache how the classifier does it
from documents.classifier import DocumentClassifier from documents.classifier import DocumentClassifier
last_modified = timezone.now() settings.MODEL_FILE.touch()
# ETag first, then modified last_modified = timezone.now()
mock_effect = [ cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES)
cache.set(CLASSIFIER_HASH_KEY, "thisisachecksum", CACHE_50_MINUTES)
cache.set(
CLASSIFIER_VERSION_KEY,
DocumentClassifier.FORMAT_VERSION, DocumentClassifier.FORMAT_VERSION,
"dont care", CACHE_50_MINUTES,
b"thisisachecksum", )
DocumentClassifier.FORMAT_VERSION,
last_modified, # Mock the matching
] match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
mocked_pickle_load.side_effect = mock_effect match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
doc = Document.objects.create( doc = Document.objects.create(
title="test", title="test",
@ -1311,12 +1322,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
content="this is an invoice from 12.04.2022!", content="this is an invoice from 12.04.2022!",
) )
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
match_tags.return_value = [Tag(id=56), Tag(id=123)]
match_document_types.return_value = [DocumentType(id=23)]
match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)]
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual( self.assertEqual(
response.data, response.data,
{ {
@ -1327,7 +1334,6 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
"dates": ["2022-04-12"], "dates": ["2022-04-12"],
}, },
) )
mocked_pickle_load.assert_called()
self.assertIn("Last-Modified", response.headers) self.assertIn("Last-Modified", response.headers)
self.assertEqual( self.assertEqual(
response.headers["Last-Modified"], response.headers["Last-Modified"],
@ -1336,15 +1342,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
self.assertIn("ETag", response.headers) self.assertIn("ETag", response.headers)
self.assertEqual( self.assertEqual(
response.headers["ETag"], response.headers["ETag"],
f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"", f'"thisisachecksum:{settings.NUMBER_OF_SUGGESTED_DATES}"',
) )
mocked_pickle_load.rest_mock()
mocked_pickle_load.side_effect = mock_effect
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual(response.status_code, status.HTTP_200_OK)
mocked_pickle_load.assert_called()
@mock.patch("documents.parsers.parse_date_generator") @mock.patch("documents.parsers.parse_date_generator")
@override_settings(NUMBER_OF_SUGGESTED_DATES=0) @override_settings(NUMBER_OF_SUGGESTED_DATES=0)

View File

@ -64,14 +64,18 @@ from documents.bulk_download import ArchiveOnlyStrategy
from documents.bulk_download import OriginalAndArchiveStrategy from documents.bulk_download import OriginalAndArchiveStrategy
from documents.bulk_download import OriginalsOnlyStrategy from documents.bulk_download import OriginalsOnlyStrategy
from documents.caching import CACHE_5_MINUTES from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import DOC_METADATA_BASE from documents.caching import DOC_METADATA_BASE
from documents.caching import DOC_SUGGESTIONS_BASE from documents.caching import DOC_SUGGESTIONS_BASE
from documents.classifier import load_classifier from documents.classifier import load_classifier
from documents.conditionals import metadata_etag from documents.conditionals import metadata_etag
from documents.conditionals import metadata_last_modified from documents.conditionals import metadata_last_modified
from documents.conditionals import preview_etag from documents.conditionals import preview_etag
from documents.conditionals import preview_last_modified
from documents.conditionals import suggestions_etag from documents.conditionals import suggestions_etag
from documents.conditionals import suggestions_last_modified from documents.conditionals import suggestions_last_modified
from documents.conditionals import thumbnail_etag
from documents.conditionals import thumbnail_last_modified
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource from documents.data_models import DocumentSource
@ -505,7 +509,9 @@ class DocumentViewSet(
@action(methods=["get"], detail=True) @action(methods=["get"], detail=True)
@method_decorator(cache_control(public=False, max_age=5 * 60)) @method_decorator(cache_control(public=False, max_age=5 * 60))
@method_decorator(condition(etag_func=preview_etag)) @method_decorator(
condition(etag_func=preview_etag, last_modified_func=preview_last_modified),
)
def preview(self, request, pk=None): def preview(self, request, pk=None):
try: try:
response = self.file_response(pk, request, "inline") response = self.file_response(pk, request, "inline")
@ -514,7 +520,10 @@ class DocumentViewSet(
raise Http404 raise Http404
@action(methods=["get"], detail=True) @action(methods=["get"], detail=True)
@method_decorator(cache_control(public=False, max_age=315360000)) @method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES))
@method_decorator(
condition(etag_func=thumbnail_etag, last_modified_func=thumbnail_last_modified),
)
def thumb(self, request, pk=None): def thumb(self, request, pk=None):
try: try:
doc = Document.objects.get(id=pk) doc = Document.objects.get(id=pk)
@ -528,8 +537,6 @@ class DocumentViewSet(
handle = GnuPG.decrypted(doc.thumbnail_file) handle = GnuPG.decrypted(doc.thumbnail_file)
else: else:
handle = doc.thumbnail_file handle = doc.thumbnail_file
# TODO: Send ETag information and use that to send new thumbnails
# if available
return HttpResponse(handle, content_type="image/webp") return HttpResponse(handle, content_type="image/webp")
except (FileNotFoundError, Document.DoesNotExist): except (FileNotFoundError, Document.DoesNotExist):

View File

@ -3,7 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings
addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50
env = env =
PAPERLESS_DISABLE_DBHANDLER=true PAPERLESS_DISABLE_DBHANDLER=true
PAPERLESS_CACHE_BACKEND=django.core.cache.backends.dummy.DummyCache PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache
[coverage:run] [coverage:run]
source = source =