Simplifies the interface for caching suggestions and metadata
parent 2424f2e85e
commit 01e422d466

@@ -1,4 +1,33 @@
 import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
 
 logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: dict
+    archive_checksum: str | None
+    archive_metadata: dict | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
+
+CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
+CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
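
Both cache payloads are frozen dataclasses, so an entry cannot be mutated after it is stored. A minimal sketch of what a stored suggestions entry might look like (all values below are invented for illustration):

    from documents.caching import SuggestionCacheData

    # classifier_version pins the on-disk classifier format, classifier_hash
    # pins the trained model; both must match the live classifier for a hit.
    entry = SuggestionCacheData(
        classifier_version=9,
        classifier_hash="a1b2c3d4",
        suggestions={"tags": [1, 2], "correspondents": [3]},
    )
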
@@ -9,24 +38,156 @@ CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
 
-def get_suggestion_key(document_id: int) -> str:
+def get_suggestion_cache_key(document_id: int) -> str:
     """
-    Builds the key to store a document's suggestion data in the cache
+    Returns the basic key for a document's suggestions
     """
     return f"doc_{document_id}_suggest"
 
 
-def get_metadata_key(document_id: int, is_archive: bool) -> str:
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
     """
-    Builds the key to store a document's metadata data in the cache
+    If possible, return the cached suggestions for the given document ID.
+    The classifier needs to be matching in format and hash and the suggestions need to
+    have been cached once.
     """
-    return (
-        f"doc_{document_id}_archive_metadata"
-        if is_archive
-        else f"doc_{document_id}_original_metadata"
-    )
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier.  If there is no classifier,
+    this function is a no-op (there won't be suggestions then anyway)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                classifier.last_auto_type_hash.hex(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as the metadata
+    was cached once and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If it has one, the archive checksums match
+            # Then, we can use the metadata
+            if (
+                doc_metadata.original_checksum == doc.checksum
+                and doc.has_archive_version
+                and doc_metadata.archive_checksum is not None
+                and doc_metadata.archive_checksum == doc.archive_checksum
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # Basically impossible, but the key existed, but the Document didn't
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: dict,
+    archive_metadata: dict | None,
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
+    )
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
 def get_thumbnail_modified_key(document_id: int) -> str:
     """
     Builds the key to store a thumbnail's timestamp
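
Taken together, these helpers replace the ad-hoc cache.get_many / cache.set calls that previously lived in the view layer. A minimal sketch of the intended call pattern, assuming a configured Django cache backend and paperless' load_classifier (the suggestion-building step itself is elided):

    from documents.caching import get_suggestion_cache
    from documents.caching import refresh_suggestions_cache
    from documents.caching import set_suggestions_cache
    from documents.classifier import load_classifier

    def suggestions_for(document_id: int) -> dict:
        # Serve the cached entry while the classifier's format version and
        # hash still match; otherwise recompute and store a fresh entry.
        cached = get_suggestion_cache(document_id)
        if cached is not None:
            refresh_suggestions_cache(document_id)
            return cached.suggestions
        classifier = load_classifier()
        suggestions: dict = {}  # placeholder: build suggestions from the classifier
        set_suggestions_cache(document_id, suggestions, classifier)
        return suggestions
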
@@ -125,7 +125,8 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
 
 def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
     """
-    Returns the filesystem last modified either from cache or from filesystem
+    Returns the filesystem last modified either from cache or from filesystem.
+    Cache should be (slightly?) faster than filesystem
     """
     try:
         doc = Document.objects.get(pk=pk)
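
thumbnail_last_modified has the shape Django's conditional view processing expects from a last_modified callable. The wiring below is not part of this commit, only a sketch of how such a function is typically attached, and it assumes the helper lives in documents.conditionals as the import style elsewhere suggests:

    from django.views.decorators.http import condition

    from documents.conditionals import thumbnail_last_modified

    @condition(last_modified_func=thumbnail_last_modified)
    def thumbnail_view(request, pk: int):
        ...  # serve the thumbnail; Django answers 304 Not Modified from the date
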
@@ -1303,9 +1303,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
 
         classifier_checksum = b"thisisachecksum"
 
         # Two loads, so two side effects
         mocked_load.side_effect = [
-            mock.Mock(last_auto_type_hash=classifier_checksum),
-            mock.Mock(last_auto_type_hash=classifier_checksum),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
 
         last_modified = timezone.now()
@@ -15,7 +15,6 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
-from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -64,11 +63,13 @@ from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
-from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import get_metadata_key
-from documents.caching import get_suggestion_key
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -389,9 +390,11 @@ class DocumentViewSet(
             try:
                 return parser.extract_metadata(file, mime_type)
             except Exception:
+                logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
                 return []
         else:
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -416,33 +419,23 @@ class DocumentViewSet(
         except Document.DoesNotExist:
             raise Http404
 
-        doc_original_key = get_metadata_key(doc.pk, is_archive=False)
-        doc_archive_key = get_metadata_key(doc.pk, is_archive=True)
-
-        cache_hits = cache.get_many([doc_original_key, doc_archive_key])
-
-        # use cached original file metadata if possible, else gather then cache
-        if doc_original_key in cache_hits:
-            cache.touch(doc_original_key, CACHE_5_MINUTES)
-            original_metadata = cache_hits[doc_original_key]
+        document_cached_metadata = get_metadata_cache(doc.pk)
+
+        archive_metadata = None
+        archive_filesize = None
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            refresh_metadata_cache(doc.pk)
         else:
             original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
-            cache.set(doc_original_key, original_metadata, CACHE_5_MINUTES)
-
-        # use cached archive file metadata, if applicable, then cache if it wasn't
-        archive_metadata = None
-        archive_filesize = None
-        if doc.has_archive_version:
-            if doc_archive_key in cache_hits:
-                archive_metadata = cache_hits[doc_archive_key]
-                archive_filesize = self.get_filesize(doc.archive_path)
-            else:
+            if doc.has_archive_version:
                 archive_filesize = self.get_filesize(doc.archive_path)
                 archive_metadata = self.get_metadata(
                     doc.archive_path,
                     "application/pdf",
                 )
-                cache.set(doc_archive_key, archive_metadata, CACHE_5_MINUTES)
+            set_metadata_cache(doc, original_metadata, archive_metadata)
 
         meta = {
             "original_checksum": doc.checksum,
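
With the two per-file cache keys gone, staleness detection now rests entirely on the checksum comparison inside get_metadata_cache. A rough sketch of the resulting behaviour, where doc is an illustrative Document with an archive version:

    # After set_metadata_cache(doc, original_metadata, archive_metadata):
    assert get_metadata_cache(doc.pk) is not None  # checksums match, cache hit

    # If the archive file is regenerated, its checksum changes, so the next
    # lookup deletes the stale entry and reports a miss instead.
    doc.archive_checksum = "something-else"
    doc.save()
    assert get_metadata_cache(doc.pk) is None
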
@@ -483,20 +476,11 @@ class DocumentViewSet(
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
-        doc_key = get_suggestion_key(doc.pk)
-
-        cache_hits = cache.get_many([doc_key, CLASSIFIER_HASH_KEY])
-
-        # Check if we can use the cache
-        # Needs to exist, and have the same classifier hash
-        if doc_key in cache_hits:
-            classifier_version, suggestions = cache_hits[doc_key]
-            if (
-                CLASSIFIER_HASH_KEY in cache_hits
-                and classifier_version == cache_hits[CLASSIFIER_HASH_KEY]
-            ):
-                cache.touch(doc_key, CACHE_5_MINUTES)
-                return Response(suggestions)
+        document_suggestions = get_suggestion_cache(doc.pk)
+
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
 
         classifier = load_classifier()
 
@@ -522,12 +506,7 @@ class DocumentViewSet(
         }
 
         # Cache the suggestions and the classifier hash for later
-        if classifier is not None:
-            cache.set(
-                doc_key,
-                (classifier.last_auto_type_hash, resp_data),
-                CACHE_5_MINUTES,
-            )
+        set_suggestions_cache(doc.pk, resp_data, classifier)
 
         return Response(resp_data)
 
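
Centralising the write in set_suggestions_cache also centralises the no-classifier case: the helper is a no-op when classifier is None, so the view can call it unconditionally. A small sketch of that contract against an empty cache (the document ID is invented):

    from documents.caching import get_suggestion_cache
    from documents.caching import set_suggestions_cache

    set_suggestions_cache(42, {"tags": []}, classifier=None)  # no-op, nothing stored
    assert get_suggestion_cache(42) is None
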