Simplifies the interface for caching suggestions and metadata

commit 01e422d466
parent 2424f2e85e
src/documents/caching.py
@@ -1,4 +1,33 @@
+import logging
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
 from typing import Final
+from typing import Optional
+
+from django.core.cache import cache
+
+from documents.models import Document
+
+if TYPE_CHECKING:
+    from documents.classifier import DocumentClassifier
+
+logger = logging.getLogger("paperless.caching")
+
+
+@dataclass(frozen=True)
+class MetadataCacheData:
+    original_checksum: str
+    original_metadata: dict
+    archive_checksum: str | None
+    archive_metadata: dict | None
+
+
+@dataclass(frozen=True)
+class SuggestionCacheData:
+    classifier_version: int
+    classifier_hash: str
+    suggestions: dict
+
 
 CLASSIFIER_VERSION_KEY: Final[str] = "classifier_version"
 CLASSIFIER_HASH_KEY: Final[str] = "classifier_hash"
@@ -9,24 +38,156 @@ CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
 CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
 
 
-def get_suggestion_key(document_id: int) -> str:
+def get_suggestion_cache_key(document_id: int) -> str:
     """
-    Builds the key to store a document's suggestion data in the cache
+    Returns the basic key for a document's suggestions
     """
     return f"doc_{document_id}_suggest"
 
 
-def get_metadata_key(document_id: int, is_archive: bool) -> str:
+def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
     """
-    Builds the key to store a document's metadata data in the cache
+    If possible, return the cached suggestions for the given document ID.
+    The classifier needs to be matching in format and hash and the suggestions need to
+    have been cached once.
     """
-    return (
-        f"doc_{document_id}_archive_metadata"
-        if is_archive
-        else f"doc_{document_id}_original_metadata"
+    from documents.classifier import DocumentClassifier
+
+    doc_key = get_suggestion_cache_key(document_id)
+    cache_hits = cache.get_many([CLASSIFIER_VERSION_KEY, CLASSIFIER_HASH_KEY, doc_key])
+    # The document suggestions are in the cache
+    if doc_key in cache_hits:
+        doc_suggestions: SuggestionCacheData = cache_hits[doc_key]
+        # The classifier format is the same
+        # The classifier hash is the same
+        # Then the suggestions can be used
+        if (
+            CLASSIFIER_VERSION_KEY in cache_hits
+            and cache_hits[CLASSIFIER_VERSION_KEY] == DocumentClassifier.FORMAT_VERSION
+            and cache_hits[CLASSIFIER_VERSION_KEY] == doc_suggestions.classifier_version
+        ) and (
+            CLASSIFIER_HASH_KEY in cache_hits
+            and cache_hits[CLASSIFIER_HASH_KEY] == doc_suggestions.classifier_hash
+        ):
+            return doc_suggestions
+        else:
+            # Remove the key because something didn't match
+            cache.delete(doc_key)
+    return None
+
+
+def set_suggestions_cache(
+    document_id: int,
+    suggestions: dict,
+    classifier: Optional["DocumentClassifier"],
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Caches the given suggestions, which were generated by the given classifier.  If there is no classifier,
+    this function is a no-op (there won't be suggestions then anyway)
+    """
+    if classifier is not None:
+        doc_key = get_suggestion_cache_key(document_id)
+        cache.set(
+            doc_key,
+            SuggestionCacheData(
+                classifier.FORMAT_VERSION,
+                classifier.last_auto_type_hash.hex(),
+                suggestions,
+            ),
+            timeout,
+        )
+
+
+def refresh_suggestions_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the suggestions for the given document ID
+    to the given timeout
+    """
+    doc_key = get_suggestion_cache_key(document_id)
+    cache.touch(doc_key, timeout)
+
+
+def get_metadata_cache_key(document_id: int) -> str:
+    """
+    Returns the basic key for a document's metadata
+    """
+    return f"doc_{document_id}_metadata"
+
+
+def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+    """
+    Returns the cached document metadata for the given document ID, as long as the metadata
+    was cached once and the checksums have not changed
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    doc_metadata: MetadataCacheData | None = cache.get(doc_key)
+    # The metadata exists in the cache
+    if doc_metadata is not None:
+        try:
+            doc = Document.objects.get(pk=document_id)
+            # The original checksums match
+            # If it has one, the archive checksums match
+            # Then, we can use the metadata
+            if (
+                doc_metadata.original_checksum == doc.checksum
+                and doc.has_archive_version
+                and doc_metadata.archive_checksum is not None
+                and doc_metadata.archive_checksum == doc.archive_checksum
+            ):
+                # Refresh cache
+                cache.touch(doc_key, CACHE_50_MINUTES)
+                return doc_metadata
+            else:
+                # Something didn't match, delete the key
+                cache.delete(doc_key)
+        except Document.DoesNotExist:  # pragma: no cover
+            # Basically impossible, but the key existed, but the Document didn't
+            cache.delete(doc_key)
+    return None
+
+
+def set_metadata_cache(
+    document: Document,
+    original_metadata: dict,
+    archive_metadata: dict | None,
+    *,
+    timeout=CACHE_50_MINUTES,
+) -> None:
+    """
+    Sets the metadata into cache for the given Document
+    """
+    doc_key = get_metadata_cache_key(document.pk)
+    cache.set(
+        doc_key,
+        MetadataCacheData(
+            document.checksum,
+            original_metadata,
+            document.archive_checksum,
+            archive_metadata,
+        ),
+        timeout,
     )
+
+
+def refresh_metadata_cache(
+    document_id: int,
+    *,
+    timeout: int = CACHE_50_MINUTES,
+) -> None:
+    """
+    Refreshes the expiration of the metadata for the given document ID
+    to the given timeout
+    """
+    doc_key = get_metadata_cache_key(document_id)
+    cache.touch(doc_key, timeout)
 
 
 def get_thumbnail_modified_key(document_id: int) -> str:
     """
     Builds the key to store a thumbnail's timestamp
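The full cache protocol now lives behind these helpers: callers never touch raw cache keys, and the validity checks (classifier format version plus model hash for suggestions, file checksums for metadata) happen inside this module. A minimal usage sketch, not part of the commit, assuming a configured Django cache backend; the document ID and suggestion payload are hypothetical:

    from documents.caching import get_suggestion_cache
    from documents.caching import refresh_suggestions_cache
    from documents.caching import set_suggestions_cache
    from documents.classifier import load_classifier

    classifier = load_classifier()
    suggestions = {"tags": [3, 9], "document_types": [1]}  # hypothetical payload

    # A no-op when classifier is None, so callers need no guard of their own
    set_suggestions_cache(42, suggestions, classifier)

    cached = get_suggestion_cache(42)
    if cached is not None:
        # Hit: the global classifier version/hash keys were present and matched
        refresh_suggestions_cache(42)  # extend the TTL instead of recomputing
        print(cached.suggestions)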
src/documents/conditionals.py
@@ -125,7 +125,8 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
 
 def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
     """
-    Returns the filesystem last modified either from cache or from filesystem
+    Returns the filesystem last modified either from cache or from filesystem.
+    Cache should be (slightly?) faster than filesystem
     """
     try:
         doc = Document.objects.get(pk=pk)
src/documents/tests/test_api.py
@@ -1303,9 +1303,16 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
 
         classifier_checksum = b"thisisachecksum"
 
+        # Two loads, so two side effects
         mocked_load.side_effect = [
-            mock.Mock(last_auto_type_hash=classifier_checksum),
-            mock.Mock(last_auto_type_hash=classifier_checksum),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
+            mock.Mock(
+                last_auto_type_hash=classifier_checksum,
+                FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
+            ),
         ]
 
         last_modified = timezone.now()
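A cached suggestion entry is only honored when both the classifier format version and the model hash match, so each mocked load now has to stub both attributes. A mock without FORMAT_VERSION would return an auto-generated Mock attribute, fail the version comparison, and force a cache miss. Roughly, as a hypothetical stand-alone stub:

    from unittest import mock

    from documents.classifier import DocumentClassifier

    # Both attributes must line up with what set_suggestions_cache stored,
    # or get_suggestion_cache reports a miss
    stub_classifier = mock.Mock(
        last_auto_type_hash=b"thisisachecksum",
        FORMAT_VERSION=DocumentClassifier.FORMAT_VERSION,
    )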
src/documents/views.py
@@ -15,7 +15,6 @@ from urllib.parse import quote
 import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
-from django.core.cache import cache
 from django.db.models import Case
 from django.db.models import Count
 from django.db.models import IntegerField
@@ -64,11 +63,13 @@ from documents import bulk_edit
 from documents.bulk_download import ArchiveOnlyStrategy
 from documents.bulk_download import OriginalAndArchiveStrategy
 from documents.bulk_download import OriginalsOnlyStrategy
-from documents.caching import CACHE_5_MINUTES
 from documents.caching import CACHE_50_MINUTES
-from documents.caching import CLASSIFIER_HASH_KEY
-from documents.caching import get_metadata_key
-from documents.caching import get_suggestion_key
+from documents.caching import get_metadata_cache
+from documents.caching import get_suggestion_cache
+from documents.caching import refresh_metadata_cache
+from documents.caching import refresh_suggestions_cache
+from documents.caching import set_metadata_cache
+from documents.caching import set_suggestions_cache
 from documents.classifier import load_classifier
 from documents.conditionals import metadata_etag
 from documents.conditionals import metadata_last_modified
@@ -389,9 +390,11 @@ class DocumentViewSet(
             try:
                 return parser.extract_metadata(file, mime_type)
             except Exception:
+                logger.exception(f"Issue getting metadata for {file}")
                 # TODO: cover GPG errors, remove later.
                 return []
         else:
+            logger.warning(f"No parser for {mime_type}")
             return []
 
     def get_filesize(self, filename):
@@ -416,33 +419,23 @@ class DocumentViewSet(
         except Document.DoesNotExist:
             raise Http404
 
-        doc_original_key = get_metadata_key(doc.pk, is_archive=False)
-        doc_archive_key = get_metadata_key(doc.pk, is_archive=True)
-
-        cache_hits = cache.get_many([doc_original_key, doc_archive_key])
-
-        # use cached original file metadata if possible, else gather then cache
-        if doc_original_key in cache_hits:
-            cache.touch(doc_original_key, CACHE_5_MINUTES)
-            original_metadata = cache_hits[doc_original_key]
+        document_cached_metadata = get_metadata_cache(doc.pk)
+
+        if document_cached_metadata is not None:
+            original_metadata = document_cached_metadata.original_metadata
+            archive_metadata = document_cached_metadata.archive_metadata
+            refresh_metadata_cache(doc.pk)
         else:
             original_metadata = self.get_metadata(doc.source_path, doc.mime_type)
-            cache.set(doc_original_key, original_metadata, CACHE_5_MINUTES)
-
-        # use cached archive file metadata, if applicable, then cache if it wasn't
-        archive_metadata = None
-        archive_filesize = None
-        if doc.has_archive_version:
-            if doc_archive_key in cache_hits:
-                archive_metadata = cache_hits[doc_archive_key]
-                archive_filesize = self.get_filesize(doc.archive_path)
-            else:
+            archive_metadata = None
+            archive_filesize = None
+            if doc.has_archive_version:
                 archive_filesize = self.get_filesize(doc.archive_path)
                 archive_metadata = self.get_metadata(
                     doc.archive_path,
                     "application/pdf",
                 )
-                cache.set(doc_archive_key, archive_metadata, CACHE_5_MINUTES)
+            set_metadata_cache(doc, original_metadata, archive_metadata)
 
         meta = {
             "original_checksum": doc.checksum,
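Because get_metadata_cache() re-validates the stored checksums against the Document row, a non-None result is safe to serve as-is. An illustrative round trip, not part of the commit, assuming doc is a saved Document with an archive version; the metadata dicts are hypothetical stand-ins for parser output:

    from documents.caching import get_metadata_cache
    from documents.caching import set_metadata_cache

    set_metadata_cache(doc, {"pages": 3}, {"pages": 3})

    cached = get_metadata_cache(doc.pk)  # checksum-validated hit
    assert cached is not None
    assert cached.original_metadata == {"pages": 3}

    doc.checksum = "deadbeef"  # simulate the original file being replaced
    doc.save()
    assert get_metadata_cache(doc.pk) is None  # stale entry was deleted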
@@ -483,20 +476,11 @@ class DocumentViewSet(
         ):
             return HttpResponseForbidden("Insufficient permissions")
 
-        doc_key = get_suggestion_key(doc.pk)
+        document_suggestions = get_suggestion_cache(doc.pk)
 
-        cache_hits = cache.get_many([doc_key, CLASSIFIER_HASH_KEY])
-
-        # Check if we can use the cache
-        # Needs to exist, and have the same classifier hash
-        if doc_key in cache_hits:
-            classifier_version, suggestions = cache_hits[doc_key]
-            if (
-                CLASSIFIER_HASH_KEY in cache_hits
-                and classifier_version == cache_hits[CLASSIFIER_HASH_KEY]
-            ):
-                cache.touch(doc_key, CACHE_5_MINUTES)
-                return Response(suggestions)
+        if document_suggestions is not None:
+            refresh_suggestions_cache(doc.pk)
+            return Response(document_suggestions.suggestions)
 
         classifier = load_classifier()
@@ -522,12 +506,7 @@ class DocumentViewSet(
         }
 
         # Cache the suggestions and the classifier hash for later
-        if classifier is not None:
-            cache.set(
-                doc_key,
-                (classifier.last_auto_type_hash, resp_data),
-                CACHE_5_MINUTES,
-            )
+        set_suggestions_cache(doc.pk, resp_data, classifier)
 
         return Response(resp_data)
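Taken together, the suggestions endpoint collapses to: consult the cache, serve and refresh on a hit, otherwise compute and store. A condensed, illustrative view of the resulting control flow (permission checks elided; build_suggestions is a hypothetical stand-in for the classifier/matching code between these two hunks):

    document_suggestions = get_suggestion_cache(doc.pk)
    if document_suggestions is not None:
        refresh_suggestions_cache(doc.pk)
        return Response(document_suggestions.suggestions)

    classifier = load_classifier()
    resp_data = build_suggestions(doc, classifier)  # hypothetical helper
    set_suggestions_cache(doc.pk, resp_data, classifier)  # no-op without classifier
    return Response(resp_data)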