diff --git a/src/documents/caching.py b/src/documents/caching.py
index 420ff177a..a1f38a9fd 100644
--- a/src/documents/caching.py
+++ b/src/documents/caching.py
@@ -1,4 +1,33 @@
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: dict
+    archive_checksum: str | None
+    archive_metadata: dict | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
 CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
 CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
@@ -9,24 +38,157 @@ CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
 
-def get_suggestion_key(document_id: int) -> str:
+def get_suggestion_cache_key(document_id: int) -> str:
     """
-    Builds the key to store a document's suggestion data in the cache
+    Returns the cache key for a document's suggestions
     """
     return f"doc_{document_id}_suggest"
 
 
-def get_metadata_key(document_id: int, is_archive: bool) -> str:
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
     """
-    Builds the key to store a document's metadata data in the cache
+    If possible, returns the cached suggestions for the given document ID.
+    The classifier format version and hash must match, and the suggestions
+    must have been cached at least once.
     """
-    return (
-        f"doc_{document_id}_archive_metadata"
-        if is_archive
-        else f"doc_{document_id}_original_metadata"
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier.
+    If there is no classifier, this function is a no-op (no suggestions would
+    have been generated in that case)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                classifier.last_auto_type_hash.hex(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the cache key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as
+    the metadata was previously cached and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If it has an archive version, those checksums match too
+            # Then, we can use the metadata
+            if doc_metadata.original_checksum == doc.checksum and (
+                not doc.has_archive_version
+                or (
+                    doc_metadata.archive_checksum is not None
+                    and doc_metadata.archive_checksum == doc.archive_checksum
+                )
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # Unlikely, but the key existed while the Document did not
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: dict,
+    archive_metadata: dict | None,
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
     )
 
 
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
 def get_thumbnail_modified_key(document_id: int) -> str:
     """
     Builds the key to store a thumbnail's timestamp
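For reference, a minimal round-trip sketch of the new suggestions cache API (assuming a configured Django cache backend and a trained classifier; the document ID and suggestion payload are illustrative, not taken from this diff):

    from documents.caching import get_suggestion_cache, set_suggestions_cache
    from documents.classifier import load_classifier

    classifier = load_classifier()
    suggestions = {"correspondents": [], "document_types": [], "tags": []}

    # No-op when classifier is None; otherwise stores the suggestions together
    # with the classifier's FORMAT_VERSION and training hash
    set_suggestions_cache(42, suggestions, classifier)

    # get_suggestion_cache() also consults the shared CLASSIFIER_VERSION_KEY
    # and CLASSIFIER_HASH_KEY entries, which the classifier load path is
    # expected to publish; any mismatch deletes the stale per-document key
    # and returns None
    cached = get_suggestion_cache(42)
    if cached is not None:
        assert cached.suggestions == suggestions

Note that set_suggestions_cache() stores last_auto_type_hash.hex(), so SuggestionCacheData carries a plain string rather than bytes.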
diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py
index 7f77ae125..1b53dfe2b 100644
--- a/src/documents/conditionals.py
+++ b/src/documents/conditionals.py
@@ -125,7 +125,8 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
 
 def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
     """
-    Returns the filesystem last modified either from cache or from filesystem
+    Returns the filesystem last modified either from cache or from filesystem.
+    Reading from the cache should be slightly faster than the filesystem
     """
     try:
         doc = Document.objects.get(pk=pk)
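These conditionals are consumed by Django's condition() decorator, which answers 304 Not Modified when the reported timestamp has not advanced, skipping the view body entirely. A minimal sketch of that wiring (the view function here is illustrative, not part of this diff):

    from django.http import HttpResponse
    from django.views.decorators.http import condition

    from documents.conditionals import thumbnail_last_modified

    @condition(last_modified_func=thumbnail_last_modified)
    def thumbnail(request, pk: int) -> HttpResponse:
        # Only reached when the client has no fresh If-Modified-Since;
        # the decorator calls thumbnail_last_modified(request, pk) first
        return HttpResponse(b"", content_type="image/webp")  # placeholder body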
diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py
index 0e33a201e..bd36d4c11 100644
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -1303,9 +1303,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
 
         classifier_checksum = b"thisisachecksum"
 
+        # Two loads, so two side effects
         mocked_load.side_effect = [
-            mock.Mock(last_auto_type_hash=classifier_checksum),
-            mock.Mock(last_auto_type_hash=classifier_checksum),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
 
         last_modified = timezone.now()
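The FORMAT_VERSION pin on these mocks matters: get_suggestion_cache() validates the cached classifier version against the real DocumentClassifier.FORMAT_VERSION, and a bare Mock fabricates a new Mock object for any attribute it is asked for, which never compares equal to that integer, so cached suggestions would never validate. A small illustration of the difference:

    from unittest import mock

    from documents.classifier import DocumentClassifier

    bare = mock.Mock(last_auto_type_hash=b"thisisachecksum")
    # Auto-created attribute: a Mock instance, never equal to an int
    assert bare.FORMAT_VERSION != DocumentClassifier.FORMAT_VERSION

    pinned = mock.Mock(
        last_auto_type_hash=b"thisisachecksum",
        FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
    )
    assert pinned.FORMAT_VERSION == DocumentClassifier.FORMAT_VERSION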
diff --git a/src/documents/views.py b/src/documents/views.py
index 5c1b9b24c..0faedc989 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -15,7 +15,6 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
-from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -64,11 +63,13 @@ from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
-from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import get_metadata_key
-from documents.caching import get_suggestion_key
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -389,9 +390,11 @@
             try:
                 return parser.extract_metadata(file, mime_type)
             except Exception:
+                logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
                 return []
         else:
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -416,33 +419,27 @@
         except Document.DoesNotExist:
             raise Http404
 
-        doc_original_key = get_metadata_key(doc.pk, is_archive=False)
-        doc_archive_key = get_metadata_key(doc.pk, is_archive=True)
+        document_cached_metadata = get_metadata_cache(doc.pk)
 
-        cache_hits = cache.get_many([doc_original_key, doc_archive_key])
-
-        # use cached original file metadata if possible, else gather then cache
-        if doc_original_key in cache_hits:
-            cache.touch(doc_original_key, CACHE_5_MINUTES)
-            original_metadata = cache_hits[doc_original_key]
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            archive_filesize = (
+                self.get_filesize(doc.archive_path)
+                if doc.has_archive_version
+                else None
+            )
+            refresh_metadata_cache(doc.pk)
         else:
             original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
-            cache.set(doc_original_key, original_metadata, CACHE_5_MINUTES)
-
-        # use cached archive file metadata, if applicable, then cache if it wasn't
-        archive_metadata = None
-        archive_filesize = None
-        if doc.has_archive_version:
-            if doc_archive_key in cache_hits:
-                archive_metadata = cache_hits[doc_archive_key]
-                archive_filesize = self.get_filesize(doc.archive_path)
-            else:
+            archive_metadata = None
+            archive_filesize = None
+            if doc.has_archive_version:
                 archive_filesize = self.get_filesize(doc.archive_path)
                 archive_metadata = self.get_metadata(
                     doc.archive_path,
                     "application/pdf",
                 )
-                cache.set(doc_archive_key, archive_metadata, CACHE_5_MINUTES)
+            set_metadata_cache(doc, original_metadata, archive_metadata)
 
         meta = {
             "original_checksum": doc.checksum,
@@ -483,20 +476,11 @@
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
-        doc_key = get_suggestion_key(doc.pk)
+        document_suggestions = get_suggestion_cache(doc.pk)
 
-        cache_hits = cache.get_many([doc_key, CLASSIFIER_HASH_KEY])
-
-        # Check if we can use the cache
-        # Needs to exist, and have the same classifier hash
-        if doc_key in cache_hits:
-            classifier_version, suggestions = cache_hits[doc_key]
-            if (
-                CLASSIFIER_HASH_KEY in cache_hits
-                and classifier_version == cache_hits[CLASSIFIER_HASH_KEY]
-            ):
-                cache.touch(doc_key, CACHE_5_MINUTES)
-                return Response(suggestions)
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
 
         classifier = load_classifier()
@@ -522,12 +506,7 @@
         }
 
         # Cache the suggestions and the classifier hash for later
-        if classifier is not None:
-            cache.set(
-                doc_key,
-                (classifier.last_auto_type_hash, resp_data),
-                CACHE_5_MINUTES,
-            )
+        set_suggestions_cache(doc.pk, resp_data, classifier)
 
         return Response(resp_data)
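With this change, the metadata endpoint reduces to a single get-or-compute-then-set cycle over one cache entry instead of two. For reference, a round-trip sketch of the metadata cache on its own (assuming a configured Django project with an existing document; the ID and metadata payload are illustrative):

    from documents.caching import get_metadata_cache, set_metadata_cache
    from documents.models import Document

    doc = Document.objects.get(pk=42)
    original_metadata = {"Author": "example"}
    set_metadata_cache(doc, original_metadata, archive_metadata=None)

    # Valid only while doc.checksum (and the archive checksum, when an archive
    # version exists) still match what was stored; a mismatch deletes the entry
    # and returns None
    cached = get_metadata_cache(doc.pk)
    if cached is not None:
        assert cached.original_metadata == original_metadata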