From 8a4059b9bf2592476e6b7bff991228d97cb34a90 Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Thu, 18 Jan 2024 15:21:07 -0800 Subject: [PATCH] Use Redis for more caching --- src/documents/caching.py | 3 + src/documents/classifier.py | 10 ++-- src/documents/conditionals.py | 72 +++++++++++++++++++++-- src/documents/tests/test_api_documents.py | 50 ++++++++-------- src/documents/views.py | 15 +++-- src/setup.cfg | 2 +- 6 files changed, 114 insertions(+), 38 deletions(-) diff --git a/src/documents/caching.py b/src/documents/caching.py index 83ec0ba25..fd11a0825 100644 --- a/src/documents/caching.py +++ b/src/documents/caching.py @@ -6,6 +6,9 @@ CLASSIFIER_MODIFIED_KEY: Final[str] = "classifier_modified" CACHE_1_MINUTE: Final[int] = 60 CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE +CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE DOC_SUGGESTIONS_BASE: Final[str] = "doc_{}_suggest" DOC_METADATA_BASE: Final[str] = "doc_{}_metadata" +DOC_THUMBNAIL_ETAG_BASE: Final[str] = "doc_{}_thumbnail_etag" +DOC_THUMBNAIL_MODIFIED_BASE: Final[str] = "doc_{}_thumbnail_modified" diff --git a/src/documents/classifier.py b/src/documents/classifier.py index 298d28349..7daca71b8 100644 --- a/src/documents/classifier.py +++ b/src/documents/classifier.py @@ -13,7 +13,7 @@ from django.conf import settings from django.core.cache import cache from sklearn.exceptions import InconsistentVersionWarning -from documents.caching import CACHE_5_MINUTES +from documents.caching import CACHE_50_MINUTES from documents.caching import CLASSIFIER_HASH_KEY from documents.caching import CLASSIFIER_MODIFIED_KEY from documents.caching import CLASSIFIER_VERSION_KEY @@ -327,9 +327,11 @@ class DocumentClassifier: self.last_doc_change_time = latest_doc_change self.last_auto_type_hash = hasher.digest() - cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_5_MINUTES) - cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_5_MINUTES) - 
cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_5_MINUTES) + # Set the classifier information into the cache + # Caching for 50 minutes, so slightly less than the normal retrain time + cache.set(CLASSIFIER_MODIFIED_KEY, self.last_doc_change_time, CACHE_50_MINUTES) + cache.set(CLASSIFIER_HASH_KEY, hasher.hexdigest(), CACHE_50_MINUTES) + cache.set(CLASSIFIER_VERSION_KEY, self.FORMAT_VERSION, CACHE_50_MINUTES) return True diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py index a5cf85c2d..06d764543 100644 --- a/src/documents/conditionals.py +++ b/src/documents/conditionals.py @@ -1,13 +1,18 @@ from datetime import datetime +from datetime import timezone +from hashlib import sha256 from typing import Optional from django.conf import settings from django.core.cache import cache from documents.caching import CACHE_5_MINUTES +from documents.caching import CACHE_50_MINUTES from documents.caching import CLASSIFIER_HASH_KEY from documents.caching import CLASSIFIER_MODIFIED_KEY from documents.caching import CLASSIFIER_VERSION_KEY +from documents.caching import DOC_THUMBNAIL_ETAG_BASE +from documents.caching import DOC_THUMBNAIL_MODIFIED_BASE from documents.classifier import DocumentClassifier from documents.models import Document @@ -18,20 +23,22 @@ def suggestions_etag(request, pk: int) -> Optional[str]: suggestions if the classifier has not been changed and the suggested dates setting is also unchanged - TODO: It would be nice to not duplicate the partial loading and the loading - between here and the actual classifier """ + # If no model file, no etag at all if not settings.MODEL_FILE.exists(): return None + # Check cache information cache_hits = cache.get_many( [CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY], ) + # If the version differs somehow, no etag if ( CLASSIFIER_VERSION_KEY in cache_hits and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION ): return None elif CLASSIFIER_HASH_KEY in cache_hits: + # Refresh 
the cache and return the hash digest and the dates setting cache.touch(CLASSIFIER_HASH_KEY, CACHE_5_MINUTES) return f"{cache_hits[CLASSIFIER_HASH_KEY]}:{settings.NUMBER_OF_SUGGESTED_DATES}" return None @@ -43,17 +50,20 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]: as there is not way to track the suggested date setting modification, but it seems unlikely that changes too often """ + # No file, no last modified if not settings.MODEL_FILE.exists(): return None cache_hits = cache.get_many( [CLASSIFIER_VERSION_KEY, CLASSIFIER_MODIFIED_KEY], ) + # If the version differs somehow, no last modified if ( CLASSIFIER_VERSION_KEY in cache_hits and cache_hits[CLASSIFIER_VERSION_KEY] != DocumentClassifier.FORMAT_VERSION ): return None elif CLASSIFIER_MODIFIED_KEY in cache_hits: + # Refresh the cache and return the last modified cache.touch(CLASSIFIER_MODIFIED_KEY, CACHE_5_MINUTES) return cache_hits[CLASSIFIER_MODIFIED_KEY] return None @@ -67,7 +77,7 @@ def metadata_etag(request, pk: int) -> Optional[str]: try: doc = Document.objects.get(pk=pk) return doc.checksum - except Document.DoesNotExist: + except Document.DoesNotExist: # pragma: no cover return None return None @@ -81,7 +91,7 @@ def metadata_last_modified(request, pk: int) -> Optional[datetime]: try: doc = Document.objects.get(pk=pk) return doc.modified - except Document.DoesNotExist: + except Document.DoesNotExist: # pragma: no cover return None return None @@ -97,6 +107,59 @@ def preview_etag(request, pk: int) -> Optional[str]: and request.query_params["original"] == "true" ) return doc.checksum if use_original else doc.archive_checksum - except Document.DoesNotExist: + except Document.DoesNotExist: # pragma: no cover return None return None + + +def preview_last_modified(request, pk: int) -> Optional[datetime]: + """Returns the document modified time, for the preview last modified""" + try: + doc = Document.objects.get(pk=pk) + return doc.modified + except Document.DoesNotExist: # pragma: no cover + return None + return None + + +def thumbnail_etag(request, 
pk: int) -> Optional[str]: + """ + Returns the SHA256 of a thumbnail, either from cache or calculated + """ + try: + doc = Document.objects.get(pk=pk) + if not doc.thumbnail_path.exists(): + return None + doc_key = DOC_THUMBNAIL_ETAG_BASE.format(pk) + cache_hit = cache.get(doc_key) + if cache_hit is not None: + return cache_hit + hasher = sha256() + hasher.update(doc.thumbnail_path.read_bytes()) + thumb_checksum = hasher.hexdigest() + cache.set(doc_key, thumb_checksum, CACHE_50_MINUTES) + return thumb_checksum + except Document.DoesNotExist: # pragma: no cover + return None + + +def thumbnail_last_modified(request, pk: int) -> Optional[datetime]: + """ + Returns the filesystem last modified either from cache or from filesystem + """ + try: + doc = Document.objects.get(pk=pk) + if not doc.thumbnail_path.exists(): + return None + doc_key = DOC_THUMBNAIL_MODIFIED_BASE.format(pk) + cache_hit = cache.get(doc_key) + if cache_hit is not None: + return cache_hit + last_modified = datetime.fromtimestamp( + doc.thumbnail_path.stat().st_mtime, + tz=timezone.utc, + ) + cache.set(doc_key, last_modified, CACHE_50_MINUTES) + return last_modified + except Document.DoesNotExist: # pragma: no cover + return None diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py index 20dd64d82..4fe47f297 100644 --- a/src/documents/tests/test_api_documents.py +++ b/src/documents/tests/test_api_documents.py @@ -13,12 +13,17 @@ from dateutil import parser from django.conf import settings from django.contrib.auth.models import Permission from django.contrib.auth.models import User +from django.core.cache import cache from django.test import override_settings from django.utils import timezone from guardian.shortcuts import assign_perm from rest_framework import status from rest_framework.test import APITestCase +from documents.caching import CACHE_50_MINUTES +from documents.caching import CLASSIFIER_HASH_KEY +from documents.caching import CLASSIFIER_MODIFIED_KEY +from 
documents.caching import CLASSIFIER_VERSION_KEY from documents.models import Correspondent from documents.models import CustomField from documents.models import CustomFieldInstance @@ -40,6 +45,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): self.user = User.objects.create_superuser(username="temp_admin") self.client.force_authenticate(user=self.user) + cache.clear() def testDocuments(self): response = self.client.get("/api/documents/").data @@ -1266,7 +1272,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): }, ) - @mock.patch("documents.conditionals.pickle.load") + @mock.patch("documents.views.load_classifier") @mock.patch("documents.views.match_storage_paths") @mock.patch("documents.views.match_document_types") @mock.patch("documents.views.match_tags") @@ -1278,7 +1284,7 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): match_tags, match_document_types, match_storage_paths, - mocked_pickle_load, + mocked_load, ): """ GIVEN: @@ -1287,23 +1293,28 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): - Classifier has not been modified THEN: - Subsequent requests are returned alright - - ETag and last modified are called + - ETag and last modified headers are set """ - settings.MODEL_FILE.touch() + # setup the cache how the classifier does it from documents.classifier import DocumentClassifier - last_modified = timezone.now() + settings.MODEL_FILE.touch() - # ETag first, then modified - mock_effect = [ + last_modified = timezone.now() + cache.set(CLASSIFIER_MODIFIED_KEY, last_modified, CACHE_50_MINUTES) + cache.set(CLASSIFIER_HASH_KEY, "thisisachecksum", CACHE_50_MINUTES) + cache.set( + CLASSIFIER_VERSION_KEY, DocumentClassifier.FORMAT_VERSION, - "dont care", - b"thisisachecksum", - DocumentClassifier.FORMAT_VERSION, - last_modified, - ] - mocked_pickle_load.side_effect = mock_effect + CACHE_50_MINUTES, + ) + + # Mock the matching + 
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)] + match_tags.return_value = [Tag(id=56), Tag(id=123)] + match_document_types.return_value = [DocumentType(id=23)] + match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)] doc = Document.objects.create( title="test", @@ -1311,12 +1322,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): content="this is an invoice from 12.04.2022!", ) - match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)] - match_tags.return_value = [Tag(id=56), Tag(id=123)] - match_document_types.return_value = [DocumentType(id=23)] - match_storage_paths.return_value = [StoragePath(id=99), StoragePath(id=77)] - response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") + self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual( response.data, { @@ -1327,7 +1334,6 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): "dates": ["2022-04-12"], }, ) - mocked_pickle_load.assert_called() self.assertIn("Last-Modified", response.headers) self.assertEqual( response.headers["Last-Modified"], @@ -1336,15 +1342,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): self.assertIn("ETag", response.headers) self.assertEqual( response.headers["ETag"], - f"\"b'thisisachecksum':{settings.NUMBER_OF_SUGGESTED_DATES}\"", + f'"thisisachecksum:{settings.NUMBER_OF_SUGGESTED_DATES}"', ) - mocked_pickle_load.rest_mock() - mocked_pickle_load.side_effect = mock_effect - response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") self.assertEqual(response.status_code, status.HTTP_200_OK) - mocked_pickle_load.assert_called() @mock.patch("documents.parsers.parse_date_generator") @override_settings(NUMBER_OF_SUGGESTED_DATES=0) diff --git a/src/documents/views.py b/src/documents/views.py index 6dedf8bf1..7e50ffa1c 100644 --- a/src/documents/views.py +++ 
b/src/documents/views.py @@ -64,14 +64,18 @@ from documents.bulk_download import ArchiveOnlyStrategy from documents.bulk_download import OriginalAndArchiveStrategy from documents.bulk_download import OriginalsOnlyStrategy from documents.caching import CACHE_5_MINUTES +from documents.caching import CACHE_50_MINUTES from documents.caching import DOC_METADATA_BASE from documents.caching import DOC_SUGGESTIONS_BASE from documents.classifier import load_classifier from documents.conditionals import metadata_etag from documents.conditionals import metadata_last_modified from documents.conditionals import preview_etag +from documents.conditionals import preview_last_modified from documents.conditionals import suggestions_etag from documents.conditionals import suggestions_last_modified +from documents.conditionals import thumbnail_etag +from documents.conditionals import thumbnail_last_modified from documents.data_models import ConsumableDocument from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentSource @@ -505,7 +509,9 @@ class DocumentViewSet( @action(methods=["get"], detail=True) @method_decorator(cache_control(public=False, max_age=5 * 60)) - @method_decorator(condition(etag_func=preview_etag)) + @method_decorator( + condition(etag_func=preview_etag, last_modified_func=preview_last_modified), + ) def preview(self, request, pk=None): try: response = self.file_response(pk, request, "inline") @@ -514,7 +520,10 @@ class DocumentViewSet( raise Http404 @action(methods=["get"], detail=True) - @method_decorator(cache_control(public=False, max_age=315360000)) + @method_decorator(cache_control(public=False, max_age=CACHE_50_MINUTES)) + @method_decorator( + condition(etag_func=thumbnail_etag, last_modified_func=thumbnail_last_modified), + ) def thumb(self, request, pk=None): try: doc = Document.objects.get(id=pk) @@ -528,8 +537,6 @@ class DocumentViewSet( handle = GnuPG.decrypted(doc.thumbnail_file) else: handle = 
doc.thumbnail_file - # TODO: Send ETag information and use that to send new thumbnails - # if available return HttpResponse(handle, content_type="image/webp") except (FileNotFoundError, Document.DoesNotExist): diff --git a/src/setup.cfg b/src/setup.cfg index 8fbf73d66..1877cb16e 100644 --- a/src/setup.cfg +++ b/src/setup.cfg @@ -3,7 +3,7 @@ DJANGO_SETTINGS_MODULE = paperless.settings addopts = --pythonwarnings=all --cov --cov-report=html --cov-report=xml --numprocesses auto --maxprocesses=16 --quiet --durations=50 env = PAPERLESS_DISABLE_DBHANDLER=true - PAPERLESS_CACHE_BACKEND=django.core.cache.backends.dummy.DummyCache + PAPERLESS_CACHE_BACKEND=django.core.cache.backends.locmem.LocMemCache [coverage:run] source =