Feature: show the number of pages of each document in the documents list
This commit is contained in:
@@ -586,6 +586,7 @@ class ConsumerPlugin(
|
||||
date = None
|
||||
thumbnail = None
|
||||
archive_path = None
|
||||
pages_count = None
|
||||
|
||||
try:
|
||||
self._send_progress(
|
||||
@@ -621,6 +622,7 @@ class ConsumerPlugin(
|
||||
)
|
||||
date = parse_date(self.filename, text)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
pages_count = document_parser.get_pages_count(self.working_copy, mime_type)
|
||||
|
||||
except ParseError as e:
|
||||
document_parser.cleanup()
|
||||
@@ -662,7 +664,12 @@ class ConsumerPlugin(
|
||||
try:
|
||||
with transaction.atomic():
|
||||
# store the document.
|
||||
document = self._store(text=text, date=date, mime_type=mime_type)
|
||||
document = self._store(
|
||||
text=text,
|
||||
date=date,
|
||||
pages_count=pages_count,
|
||||
mime_type=mime_type,
|
||||
)
|
||||
|
||||
# If we get here, it was successful. Proceed with post-consume
|
||||
# hooks. If they fail, nothing will get changed.
|
||||
@@ -790,6 +797,7 @@ class ConsumerPlugin(
|
||||
self,
|
||||
text: str,
|
||||
date: Optional[datetime.datetime],
|
||||
pages_count: int,
|
||||
mime_type: str,
|
||||
) -> Document:
|
||||
# If someone gave us the original filename, use it instead of doc.
|
||||
@@ -835,6 +843,7 @@ class ConsumerPlugin(
|
||||
created=create_date,
|
||||
modified=create_date,
|
||||
storage_type=storage_type,
|
||||
pages_count=pages_count,
|
||||
original_filename=self.filename,
|
||||
)
|
||||
|
||||
|
||||
@@ -80,6 +80,7 @@ def get_schema():
|
||||
has_owner=BOOLEAN(),
|
||||
viewer_id=KEYWORD(commas=True),
|
||||
checksum=TEXT(),
|
||||
pages_count=NUMERIC(sortable=True),
|
||||
original_filename=TEXT(sortable=True),
|
||||
is_shared=BOOLEAN(),
|
||||
)
|
||||
@@ -181,6 +182,7 @@ def update_document(writer: AsyncWriter, doc: Document):
|
||||
has_owner=doc.owner is not None,
|
||||
viewer_id=viewer_ids if viewer_ids else None,
|
||||
checksum=doc.checksum,
|
||||
pages_count=doc.pages_count,
|
||||
original_filename=doc.original_filename,
|
||||
is_shared=len(viewer_ids) > 0,
|
||||
)
|
||||
@@ -247,6 +249,7 @@ class DelayedQuery:
|
||||
"archive_serial_number": "asn",
|
||||
"num_notes": "num_notes",
|
||||
"owner": "owner",
|
||||
"pages_count": "pages_count",
|
||||
}
|
||||
|
||||
if field.startswith("-"):
|
||||
|
||||
109
src/documents/migrations/1053_document_pages_count.py
Normal file
109
src/documents/migrations/1053_document_pages_count.py
Normal file
@@ -0,0 +1,109 @@
|
||||
# Generated by Django 4.2.16 on 2024-09-21 15:44
|
||||
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pikepdf
|
||||
from django.conf import settings
|
||||
from django.db import migrations
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
from django.utils.termcolors import colorize as colourise
|
||||
|
||||
from documents.parsers import get_default_file_extension
|
||||
|
||||
|
||||
class Document:
    """
    Django's migrations restrict access to model methods, so this is a snapshot
    of the methods that existed at the time this migration was written, since
    we need to make use of a lot of these shortcuts here.
    """

    def __init__(self, doc):
        # Copy only the fields this migration actually reads from the
        # historical model row.
        self.pk = doc.pk
        self.correspondent = doc.correspondent
        self.title = doc.title
        self.mime_type = doc.mime_type
        self.filename = doc.filename
        self.created = doc.created

    def __str__(self) -> str:
        # Convert UTC database time to local time
        created = datetime.date.isoformat(timezone.localdate(self.created))

        res = f"{created}"

        if self.correspondent:
            res += f" {self.correspondent}"
        if self.title:
            res += f" {self.title}"
        return res

    @property
    def file_type(self):
        # File extension derived from the MIME type (e.g. ".pdf").
        return get_default_file_extension(self.mime_type)

    @property
    def source_path(self) -> Path:
        # Absolute path of the original file under ORIGINALS_DIR.
        # NOTE(review): implicitly returns None when self.filename is falsy —
        # callers appear to assume filename is always set; confirm.
        if self.filename:
            fname = str(self.filename)
            return (settings.ORIGINALS_DIR / Path(fname)).resolve()
|
||||
|
||||
|
||||
def add_number_of_pages_to_pages_count(apps, schema_editor):
    """
    Forward data migration: populate ``Document.pages_count`` for existing PDFs.

    Opens each stored PDF with pikepdf and saves its page count. Documents
    whose original file is missing or unreadable are skipped (their
    pages_count stays NULL) so one bad file cannot abort the whole migration.
    """
    document_model = apps.get_model("documents", "Document")

    for doc in document_model.objects.filter(mime_type="application/pdf"):
        document = Document(doc)

        print(
            " {} {} {}".format(
                colourise("*", fg="green"),
                colourise("Calculating number of pages for", fg="white"),
                colourise(document.filename, fg="cyan"),
            ),
        )

        source = document.source_path
        # source_path is None when the row has no filename; the file can also
        # have gone missing on disk. Either way there is nothing to count.
        if source is None or not source.exists():
            continue

        try:
            # Context manager closes the PDF handle; the original code leaked
            # one open file descriptor per document.
            with pikepdf.open(source) as pdf:
                pages = len(pdf.pages)
        except (OSError, pikepdf.PdfError):
            # Corrupt or encrypted PDF: leave pages_count as NULL.
            continue

        doc.pages_count = pages
        doc.save()
|
||||
|
||||
|
||||
def remove_number_of_pages_to_pages_count(apps, schema_editor):
    """
    Reverse data migration: clear ``pages_count`` on all PDF documents.

    Issues a single bulk UPDATE instead of loading and saving every row
    (the original ran one SELECT plus one UPDATE per document).
    """
    document_model = apps.get_model("documents", "Document")
    document_model.objects.filter(mime_type="application/pdf").update(pages_count=0)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    # Runs after the transaction_id migration: adds the nullable
    # pages_count column, then backfills it for existing PDF documents.
    dependencies = [
        ("documents", "1052_document_transaction_id"),
    ]

    operations = [
        migrations.AddField(
            model_name="document",
            name="pages_count",
            # Nullable: non-PDF documents (and unreadable PDFs) have no count.
            field=models.PositiveIntegerField(
                blank=False,
                null=True,
                unique=False,
                db_index=False,
            ),
        ),
        # Data backfill, with a matching reverse callable so the migration
        # can be unapplied cleanly.
        migrations.RunPython(
            add_number_of_pages_to_pages_count,
            remove_number_of_pages_to_pages_count,
        ),
    ]
|
||||
@@ -205,6 +205,18 @@ class Document(SoftDeleteModel, ModelWithOwner):
|
||||
help_text=_("The checksum of the archived document."),
|
||||
)
|
||||
|
||||
pages_count = models.PositiveIntegerField(
|
||||
_("pages count"),
|
||||
blank=False,
|
||||
null=True,
|
||||
unique=False,
|
||||
db_index=False,
|
||||
validators=[MinValueValidator(1)],
|
||||
help_text=_(
|
||||
"The number of pages of the document.",
|
||||
),
|
||||
)
|
||||
|
||||
created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)
|
||||
|
||||
modified = models.DateTimeField(
|
||||
@@ -414,6 +426,7 @@ class SavedView(ModelWithOwner):
|
||||
OWNER = ("owner", _("Owner"))
|
||||
SHARED = ("shared", _("Shared"))
|
||||
ASN = ("asn", _("ASN"))
|
||||
PAGES_COUNT = ("pagescount", _("Pages"))
|
||||
CUSTOM_FIELD = ("custom_field_%d", ("Custom Field"))
|
||||
|
||||
name = models.CharField(_("name"), max_length=128)
|
||||
|
||||
@@ -367,6 +367,9 @@ class DocumentParser(LoggingMixin):
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
return []
|
||||
|
||||
def get_pages_count(self, document_path, mime_type):
    # Base implementation: page counting is not supported by this parser.
    # Subclasses that can count pages (e.g. the PDF parser) override this
    # and return an int; None means "unknown".
    return None
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
@@ -759,6 +759,7 @@ class DocumentSerializer(
|
||||
original_file_name = SerializerMethodField()
|
||||
archived_file_name = SerializerMethodField()
|
||||
created_date = serializers.DateField(required=False)
|
||||
pages_count = SerializerMethodField()
|
||||
|
||||
custom_fields = CustomFieldInstanceSerializer(
|
||||
many=True,
|
||||
@@ -779,6 +780,9 @@ class DocumentSerializer(
|
||||
required=False,
|
||||
)
|
||||
|
||||
def get_pages_count(self, obj):
    """Getter for the read-only ``pages_count`` SerializerMethodField.

    Simply exposes the stored ``Document.pages_count`` value (may be None
    for documents without a known page count).
    """
    return obj.pages_count
|
||||
|
||||
def get_original_file_name(self, obj):
|
||||
return obj.original_filename
|
||||
|
||||
@@ -894,6 +898,7 @@ class DocumentSerializer(
|
||||
"notes",
|
||||
"custom_fields",
|
||||
"remove_inbox_tags",
|
||||
"pages_count",
|
||||
)
|
||||
list_serializer_class = OwnedObjectListSerializer
|
||||
|
||||
|
||||
@@ -361,6 +361,7 @@ class DocumentViewSet(
|
||||
"archive_serial_number",
|
||||
"num_notes",
|
||||
"owner",
|
||||
"pages_count",
|
||||
)
|
||||
|
||||
def get_queryset(self):
|
||||
@@ -444,6 +445,24 @@ class DocumentViewSet(
|
||||
logger.warning(f"No parser for {mime_type}")
|
||||
return []
|
||||
|
||||
def get_pages_count(self, file, mime_type):
    """
    Return the number of pages of *file*, or None when it cannot be
    determined (missing file, no parser for the MIME type, parser error).
    """
    if not os.path.isfile(file):
        return None

    parser_class = get_parser_class_for_mime_type(mime_type)
    if parser_class:
        parser = parser_class(progress_callback=None, logging_group=None)

        try:
            # BUG FIX: the parser API is get_pages_count(document_path,
            # mime_type); the original call omitted mime_type and therefore
            # always raised TypeError into the except branch below.
            return parser.get_pages_count(file, mime_type)
        except Exception:  # pragma: no cover
            logger.exception(f"Issue getting pages count for {file}")
            # A page count is an int or None, never a list; the original
            # returned [] here (copied from the metadata helper).
            return None
    else:  # pragma: no cover
        logger.warning(f"No parser for {mime_type}")
        return None
|
||||
|
||||
def get_filesize(self, filename):
|
||||
if os.path.isfile(filename):
|
||||
return os.stat(filename).st_size
|
||||
|
||||
@@ -41,6 +41,15 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
"""
|
||||
return OcrConfig()
|
||||
|
||||
def get_pages_count(self, document_path, mime_type):
    """
    Return the page count of a PDF document, or None for any other MIME
    type (this parser can only count pages in PDFs).
    """
    if mime_type != "application/pdf":
        return None

    import pikepdf

    # Context manager closes the file handle; the original code left the
    # PDF open after counting, leaking one descriptor per call.
    with pikepdf.open(document_path) as pdf:
        return len(pdf.pages)
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
if mime_type == "application/pdf":
|
||||
|
||||
@@ -57,6 +57,20 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
self.assertContainsStrings(text.strip(), ["This is a test document."])
|
||||
|
||||
def test_get_pages_count(self):
    """get_pages_count reports the correct page count for sample PDFs."""
    parser = RasterisedDocumentParser(uuid.uuid4())

    expectations = {
        "simple-digital.pdf": 1,
        "multi-page-mixed.pdf": 6,
    }
    for sample, expected_pages in expectations.items():
        sample_path = os.path.join(self.SAMPLE_FILES, sample)
        self.assertEqual(
            parser.get_pages_count(sample_path, "application/pdf"),
            expected_pages,
        )
|
||||
|
||||
def test_thumbnail(self):
|
||||
parser = RasterisedDocumentParser(uuid.uuid4())
|
||||
thumb = parser.get_thumbnail(
|
||||
|
||||
Reference in New Issue
Block a user