Simplifies the interface for caching suggestions and metadata

Trenton H 2024-02-03 13:47:03 -08:00
parent 2424f2e85e
commit 01e422d466
4 changed files with 202 additions and 54 deletions
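In short, the diff below replaces the views' ad-hoc cache.get_many / cache.set calls with a small set of typed helpers in the documents.caching module. A minimal sketch of the resulting call pattern, assuming a Document `doc`, a suggestions dict `resp_data`, and a loaded `classifier` as in the views further down (the wiring here is illustrative, not a line of the commit):

    # Read side: version/hash validity checks now live inside the helper
    cached = get_suggestion_cache(doc.pk)  # SuggestionCacheData or None
    if cached is not None:
        refresh_suggestions_cache(doc.pk)  # push the expiry out again
        return Response(cached.suggestions)

    # Write side: a no-op when classifier is None
    set_suggestions_cache(doc.pk, resp_data, classifier)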

View File

@@ -1,4 +1,33 @@
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
 
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: dict
+    archive_checksum: str | None
+    archive_metadata: dict | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
 CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
 CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
@@ -9,24 +38,156 @@ CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
 
-def get_suggestion_key(document_id: int) -> str:
+def get_suggestion_cache_key(document_id: int) -> str:
     """
-    Builds the key to store a document's suggestion data in the cache
+    Returns the basic key for a document's suggestions
     """
     return f"doc_{document_id}_suggest"
 
 
-def get_metadata_key(document_id: int, is_archive: bool) -> str:
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
     """
-    Builds the key to store a document's metadata data in the cache
+    If possible, return the cached suggestions for the given document ID.
+    The classifier needs to be matching in format and hash and the suggestions need to
+    have been cached once.
     """
-    return (
-        f"doc_{document_id}_archive_metadata"
-        if is_archive
-        else f"doc_{document_id}_original_metadata"
-    )
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier. If there is no classifier,
+    this function is a no-op (there won't be suggestions then anyway)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                classifier.last_auto_type_hash.hex(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as the metadata
+    was cached once and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If it has one, the archive checksums match
+            # Then, we can use the metadata
+            if (
+                doc_metadata.original_checksum == doc.checksum
+                and doc.has_archive_version
+                and doc_metadata.archive_checksum is not None
+                and doc_metadata.archive_checksum == doc.archive_checksum
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # Basically impossible, but the key existed, but the Document didn't
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: dict,
+    archive_metadata: dict | None,
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
+    )
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
 def get_thumbnail_modified_key(document_id: int) -> str:
     """
     Builds the key to store a thumbnail's timestamp
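For orientation, a sketch of the metadata helpers' round trip, using the names from the hunk above (the Document instance and metadata dicts are assumed, not part of the diff):

    # Illustrative only: cache a document's metadata, then read it back
    set_metadata_cache(doc, original_metadata, archive_metadata)

    cached = get_metadata_cache(doc.pk)  # MetadataCacheData or None
    if cached is not None:
        refresh_metadata_cache(doc.pk)  # extend the 50-minute expiry
        original_metadata = cached.original_metadata

Note that, as written above, get_metadata_cache only returns a hit when the document has an archive version whose checksum matches; a document without one falls through to cache.delete and returns None.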

View File

@@ -125,7 +125,8 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
 def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
     """
-    Returns the filesystem last modified either from cache or from filesystem
+    Returns the filesystem last modified either from cache or from filesystem.
+    Cache should be (slightly?) faster than filesystem
     """
     try:
         doc = Document.objects.get(pk=pk)

View File

@@ -1303,9 +1303,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
         classifier_checksum = b"thisisachecksum"
 
+        # Two loads, so two side effects
         mocked_load.side_effect = [
-            mock.Mock(last_auto_type_hash=classifier_checksum),
-            mock.Mock(last_auto_type_hash=classifier_checksum),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
 
         last_modified = timezone.now()
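The added FORMAT_VERSION attribute is what lets the mocked classifier survive the new version check: attribute access on a bare Mock yields a fresh Mock object, which never compares equal to a real integer format version, so cached suggestions would always be rejected. A standalone sketch of that Mock behavior (not from the diff):

    from unittest import mock

    bare = mock.Mock()
    # The auto-created attribute is itself a Mock, not an int, so equality
    # with any real version number fails:
    assert bare.FORMAT_VERSION != 1

    configured = mock.Mock(FORMAT_VERSION=1)
    assert configured.FORMAT_VERSION == 1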

View File

@@ -15,7 +15,6 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
-from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -64,11 +63,13 @@
 from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
-from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import get_metadata_key
-from documents.caching import get_suggestion_key
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -389,9 +390,11 @@ class DocumentViewSet(
             try:
                 return parser.extract_metadata(file, mime_type)
             except Exception:
+                logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
                 return []
         else:
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -416,33 +419,23 @@
         except Document.DoesNotExist:
             raise Http404
 
-        doc_original_key = get_metadata_key(doc.pk, is_archive=False)
-        doc_archive_key = get_metadata_key(doc.pk, is_archive=True)
-
-        cache_hits = cache.get_many([doc_original_key, doc_archive_key])
-
-        # use cached original file metadata if possible, else gather then cache
-        if doc_original_key in cache_hits:
-            cache.touch(doc_original_key, CACHE_5_MINUTES)
-            original_metadata = cache_hits[doc_original_key]
+        document_cached_metadata = get_metadata_cache(doc.pk)
+
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            refresh_metadata_cache(doc.pk)
         else:
             original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
-            cache.set(doc_original_key, original_metadata, CACHE_5_MINUTES)
-
-        # use cached archive file metadata, if applicable, then cache if it wasn't
-        archive_metadata = None
-        archive_filesize = None
-        if doc.has_archive_version:
-            if doc_archive_key in cache_hits:
-                archive_metadata = cache_hits[doc_archive_key]
-                archive_filesize = self.get_filesize(doc.archive_path)
-            else:
+            archive_metadata = None
+            archive_filesize = None
+            if doc.has_archive_version:
                 archive_filesize = self.get_filesize(doc.archive_path)
                 archive_metadata = self.get_metadata(
                     doc.archive_path,
                     "application/pdf",
                 )
-                cache.set(doc_archive_key, archive_metadata, CACHE_5_MINUTES)
+            set_metadata_cache(doc, original_metadata, archive_metadata)
 
         meta = {
             "original_checksum": doc.checksum,
@@ -483,20 +476,11 @@
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
-        doc_key = get_suggestion_key(doc.pk)
-
-        cache_hits = cache.get_many([doc_key, CLASSIFIER_HASH_KEY])
-
-        # Check if we can use the cache
-        # Needs to exist, and have the same classifier hash
-        if doc_key in cache_hits:
-            classifier_version, suggestions = cache_hits[doc_key]
-
-            if (
-                CLASSIFIER_HASH_KEY in cache_hits
-                and classifier_version == cache_hits[CLASSIFIER_HASH_KEY]
-            ):
-                cache.touch(doc_key, CACHE_5_MINUTES)
-                return Response(suggestions)
+        document_suggestions = get_suggestion_cache(doc.pk)
+
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
 
         classifier = load_classifier()
@@ -522,12 +506,7 @@
         }
 
         # Cache the suggestions and the classifier hash for later
-        if classifier is not None:
-            cache.set(
-                doc_key,
-                (classifier.last_auto_type_hash, resp_data),
-                CACHE_5_MINUTES,
-            )
+        set_suggestions_cache(doc.pk, resp_data, classifier)
 
         return Response(resp_data)
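Taken together, the cache layout after this commit looks roughly as follows; the key names come from documents.caching, while document ID 42 and the value notes are illustrative:

    # "classifier_version" -> DocumentClassifier.FORMAT_VERSION (int)
    # "classifier_hash"    -> classifier.last_auto_type_hash.hex() (str)
    # "doc_42_suggest"     -> SuggestionCacheData(classifier_version, classifier_hash, suggestions)
    # "doc_42_metadata"    -> MetadataCacheData(original_checksum, original_metadata,
    #                                           archive_checksum, archive_metadata)

Suggestion entries are validated against the classifier version and hash on read, metadata entries against the document checksums; both are deleted on mismatch and re-touched for another 50 minutes on a hit.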