Simplifies the interface for caching suggestions and metadata

Trenton H 2024-02-03 13:47:03 -08:00
parent 2424f2e85e
commit 01e422d466
4 changed files with 202 additions and 54 deletions
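In short, the diff below replaces the views' ad-hoc cache.get_many / cache.set calls with a small set of typed helpers in the documents.caching module. A minimal sketch of the resulting call pattern, assuming a Document `doc`, a suggestions dict `resp_data`, and a loaded `classifier` as in the views further down (the wiring here is illustrative, not a line of the commit):

    # Read side: version/hash validity checks now live inside the helper
    cached = get_suggestion_cache(doc.pk)  # SuggestionCacheData or None
    if cached is not None:
        refresh_suggestions_cache(doc.pk)  # push the expiry out again
        return Response(cached.suggestions)

    # Write side: a no-op when classifier is None
    set_suggestions_cache(doc.pk, resp_data, classifier)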

View File

@@ -1,4 +1,33 @@
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
 
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: dict
+    archive_checksum: str | None
+    archive_metadata: dict | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
 CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
 CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
@@ -9,24 +38,156 @@ CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
 
-def get_suggestion_key(document_id: int) -> str:
+def get_suggestion_cache_key(document_id: int) -> str:
     """
-    Builds the key to store a document's suggestion data in the cache
+    Returns the basic key for a document's suggestions
     """
     return f"doc_{document_id}_suggest"
 
 
-def get_metadata_key(document_id: int, is_archive: bool) -> str:
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
     """
-    Builds the key to store a document's metadata data in the cache
+    If possible, return the cached suggestions for the given document ID.
+    The classifier needs to be matching in format and hash and the suggestions need to
+    have been cached once.
     """
-    return (
-        f"doc_{document_id}_archive_metadata"
-        if is_archive
-        else f"doc_{document_id}_original_metadata"
-    )
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
+    this function is a no-op (there won't be suggestions then anyway)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                classifier.last_auto_type_hash.hex(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as the metadata
+    was cached once and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If it has one, the archive checksums match
+            # Then, we can use the metadata
+            if (
+                doc_metadata.original_checksum == doc.checksum
+                and doc.has_archive_version
+                and doc_metadata.archive_checksum is not None
+                and doc_metadata.archive_checksum == doc.archive_checksum
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # Basically impossible, but the key existed, but the Document didn't
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: dict,
+    archive_metadata: dict | None,
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
+    )
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
 def get_thumbnail_modified_key(document_id: int) -> str:
     """
     Builds the key to store a thumbnail's timestamp
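For orientation, a sketch of the metadata helpers' round trip, using the names from the hunk above (the Document instance and metadata dicts are assumed, not part of the diff):

    # Illustrative only: cache a document's metadata, then read it back
    set_metadata_cache(doc, original_metadata, archive_metadata)

    cached = get_metadata_cache(doc.pk)  # MetadataCacheData or None
    if cached is not None:
        refresh_metadata_cache(doc.pk)  # extend the 50-minute expiry
        original_metadata = cached.original_metadata

Note that, as written above, get_metadata_cache only returns a hit when the document has an archive version whose checksum matches; a document without one falls through to cache.delete and returns None.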

View File

@@ -125,7 +125,8 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
 def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
     """
-    Returns the filesystem last modified either from cache or from filesystem
+    Returns the filesystem last modified either from cache or from filesystem.
+    Cache should be (slightly?) faster than filesystem
     """
     try:
         doc = Document.objects.get(pk=pk)

View File

@@ -1303,9 +1303,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         classifier_checksum = b"thisisachecksum"
 
+        # Two loads, so two side effects
         mocked_load.side_effect = [
-            mock.Mock(last_auto_type_hash=classifier_checksum),
-            mock.Mock(last_auto_type_hash=classifier_checksum),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
 
         last_modified = timezone.now()
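The added FORMAT_VERSION attribute is what lets the mocked classifier survive the new version check: attribute access on a bare Mock yields a fresh Mock object, which never compares equal to a real integer format version, so cached suggestions would always be rejected. A standalone sketch of that Mock behavior (not from the diff):

    from unittest import mock

    bare = mock.Mock()
    # The auto-created attribute is itself a Mock, not an int, so equality
    # with any real version number fails:
    assert bare.FORMAT_VERSION != 1

    configured = mock.Mock(FORMAT_VERSION=1)
    assert configured.FORMAT_VERSION == 1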

View File

@@ -15,7 +15,6 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
-from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -64,11 +63,13 @@
 from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
-from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import get_metadata_key
-from documents.caching import get_suggestion_key
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -389,9 +390,11 @@ class DocumentViewSet(
             try:
                 return parser.extract_metadata(file, mime_type)
             except Exception:
+                logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
                 return []
         else:
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -416,33 +419,23 @@
         except Document.DoesNotExist:
             raise Http404
 
-        doc_original_key = get_metadata_key(doc.pk, is_archive=False)
-        doc_archive_key = get_metadata_key(doc.pk, is_archive=True)
-
-        cache_hits = cache.get_many([doc_original_key, doc_archive_key])
-
-        # use cached original file metadata if possible, else gather then cache
-        if doc_original_key in cache_hits:
-            cache.touch(doc_original_key, CACHE_5_MINUTES)
-            original_metadata = cache_hits[doc_original_key]
+        document_cached_metadata = get_metadata_cache(doc.pk)
+
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            refresh_metadata_cache(doc.pk)
         else:
             original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
-            cache.set(doc_original_key, original_metadata, CACHE_5_MINUTES)
-
-        # use cached archive file metadata, if applicable, then cache if it wasn't
-        archive_metadata = None
-        archive_filesize = None
-        if doc.has_archive_version:
-            if doc_archive_key in cache_hits:
-                archive_metadata = cache_hits[doc_archive_key]
-                archive_filesize = self.get_filesize(doc.archive_path)
-            else:
+            archive_metadata = None
+            archive_filesize = None
+            if doc.has_archive_version:
                 archive_filesize = self.get_filesize(doc.archive_path)
                 archive_metadata = self.get_metadata(
                     doc.archive_path,
                     "application/pdf",
                 )
-                cache.set(doc_archive_key, archive_metadata, CACHE_5_MINUTES)
+            set_metadata_cache(doc, original_metadata, archive_metadata)
 
         meta = {
             "original_checksum": doc.checksum,
@@ -483,20 +476,11 @@
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
-        doc_key = get_suggestion_key(doc.pk)
-
-        cache_hits = cache.get_many([doc_key, CLASSIFIER_HASH_KEY])
-
-        # Check if we can use the cache
-        # Needs to exist, and have the same classifier hash
-        if doc_key in cache_hits:
-            classifier_version, suggestions = cache_hits[doc_key]
-
-            if (
-                CLASSIFIER_HASH_KEY in cache_hits
-                and classifier_version == cache_hits[CLASSIFIER_HASH_KEY]
-            ):
-                cache.touch(doc_key, CACHE_5_MINUTES)
-                return Response(suggestions)
+        document_suggestions = get_suggestion_cache(doc.pk)
+
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
 
         classifier = load_classifier()
@@ -522,12 +506,7 @@
         }
 
         # Cache the suggestions and the classifier hash for later
-        if classifier is not None:
-            cache.set(
-                doc_key,
-                (classifier.last_auto_type_hash, resp_data),
-                CACHE_5_MINUTES,
-            )
+        set_suggestions_cache(doc.pk, resp_data, classifier)
 
         return Response(resp_data)
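Taken together, the cache layout after this commit looks roughly as follows; the key names come from documents.caching, while document ID 42 and the value notes are illustrative:

    # "classifier_version" -> DocumentClassifier.FORMAT_VERSION (int)
    # "classifier_hash"    -> classifier.last_auto_type_hash.hex() (str)
    # "doc_42_suggest"     -> SuggestionCacheData(classifier_version, classifier_hash, suggestions)
    # "doc_42_metadata"    -> MetadataCacheData(original_checksum, original_metadata,
    #                                           archive_checksum, archive_metadata)

Suggestion entries are validated against the classifier version and hash on read, metadata entries against the document checksums; both are deleted on mismatch and re-touched for another 50 minutes on a hit.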