Feature: number of pages of document in documents list

This commit is contained in:
s0llvan
2024-09-21 18:18:19 +00:00
parent 609fa9a212
commit 865856b06d
19 changed files with 318 additions and 58 deletions

View File

@@ -586,6 +586,7 @@ class ConsumerPlugin(
date = None
thumbnail = None
archive_path = None
pages_count = None
try:
self._send_progress(
@@ -621,6 +622,7 @@ class ConsumerPlugin(
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
pages_count = document_parser.get_pages_count(self.working_copy, mime_type)
except ParseError as e:
document_parser.cleanup()
@@ -662,7 +664,12 @@ class ConsumerPlugin(
try:
with transaction.atomic():
# store the document.
document = self._store(text=text, date=date, mime_type=mime_type)
document = self._store(
text=text,
date=date,
pages_count=pages_count,
mime_type=mime_type,
)
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
@@ -790,6 +797,7 @@ class ConsumerPlugin(
self,
text: str,
date: Optional[datetime.datetime],
pages_count: int,
mime_type: str,
) -> Document:
# If someone gave us the original filename, use it instead of doc.
@@ -835,6 +843,7 @@ class ConsumerPlugin(
created=create_date,
modified=create_date,
storage_type=storage_type,
pages_count=pages_count,
original_filename=self.filename,
)

View File

@@ -80,6 +80,7 @@ def get_schema():
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
checksum=TEXT(),
pages_count=NUMERIC(sortable=True),
original_filename=TEXT(sortable=True),
is_shared=BOOLEAN(),
)
@@ -181,6 +182,7 @@ def update_document(writer: AsyncWriter, doc: Document):
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
pages_count=doc.pages_count,
original_filename=doc.original_filename,
is_shared=len(viewer_ids) > 0,
)
@@ -247,6 +249,7 @@ class DelayedQuery:
"archive_serial_number": "asn",
"num_notes": "num_notes",
"owner": "owner",
"pages_count": "pages_count",
}
if field.startswith("-"):

View File

@@ -0,0 +1,109 @@
# Generated by Django 4.2.16 on 2024-09-21 15:44
import datetime
from pathlib import Path

import pikepdf
from django.conf import settings
from django.core.validators import MinValueValidator
from django.db import migrations
from django.db import models
from django.utils import timezone
from django.utils.termcolors import colorize as colourise

from documents.parsers import get_default_file_extension
class Document:
    """
    Snapshot of the Document model helpers needed by this migration.

    Django's migrations restrict access to model methods, so this class copies
    the attributes of a historical model instance and re-implements the
    shortcuts that existed at the time this migration was written.
    """

    def __init__(self, doc):
        """Copy the fields this migration relies on from the instance *doc*."""
        self.pk = doc.pk
        self.correspondent = doc.correspondent
        self.title = doc.title
        self.mime_type = doc.mime_type
        self.filename = doc.filename
        self.created = doc.created

    def __str__(self) -> str:
        """Return "<local ISO date>[ <correspondent>][ <title>]"."""
        # Convert UTC database time to local time
        parts = [timezone.localdate(self.created).isoformat()]
        if self.correspondent:
            parts.append(f"{self.correspondent}")
        if self.title:
            parts.append(f"{self.title}")
        return " ".join(parts)

    @property
    def file_type(self):
        """Default file extension for this document's MIME type."""
        return get_default_file_extension(self.mime_type)

    @property
    def source_path(self) -> "Path | None":
        """
        Resolved path of the original file below ``settings.ORIGINALS_DIR``.

        Returns None when the document has no filename, so callers get an
        explicit "no file" result instead of an undefined one.
        """
        if not self.filename:
            return None
        return (settings.ORIGINALS_DIR / Path(str(self.filename))).resolve()
def add_number_of_pages_to_pages_count(apps, schema_editor):
    """
    Forward data migration: fill ``pages_count`` for every existing PDF.

    Each PDF original is opened with pikepdf and its page count is stored on
    the document row. A file that cannot be read (missing, corrupt, password
    protected, document without a filename) is reported and skipped so that a
    single bad file does not abort the whole migration; its ``pages_count``
    keeps its current value.

    :param apps: historical app registry supplied by Django.
    :param schema_editor: unused; required by the RunPython signature.
    """
    document_model = apps.get_model("documents", "Document")

    if not document_model.objects.all().exists():
        return

    for doc in document_model.objects.filter(mime_type="application/pdf"):
        document = Document(doc)

        print(
            "    {} {} {}".format(
                colourise("*", fg="green"),
                colourise("Calculating number of pages for", fg="white"),
                colourise(document.filename, fg="cyan"),
            ),
        )

        try:
            # The context manager closes the file handle even if counting
            # fails; otherwise one handle per document stays open.
            with pikepdf.open(document.source_path) as pdf:
                pages_count = len(pdf.pages)
        except Exception as e:
            # Deliberately broad: this is a best-effort backfill and any
            # per-file failure must not break the schema upgrade.
            print(
                "    {} {} {}: {}".format(
                    colourise("!", fg="red"),
                    colourise("Could not calculate number of pages for", fg="white"),
                    colourise(document.filename, fg="cyan"),
                    e,
                ),
            )
            continue

        # Saving happens outside the try block so database errors still
        # propagate instead of being reported as unreadable files.
        doc.pages_count = pages_count
        doc.save()
def remove_number_of_pages_to_pages_count(apps, schema_editor):
    """
    Reverse data migration: clear ``pages_count`` on every PDF document.

    :param apps: historical app registry supplied by Django.
    :param schema_editor: unused; required by the RunPython signature.
    """
    document_model = apps.get_model("documents", "Document")
    # The column is nullable, so NULL (not 0) is its "no value" state.
    # A single UPDATE also avoids loading and saving every row individually.
    document_model.objects.filter(mime_type="application/pdf").update(
        pages_count=None,
    )
class Migration(migrations.Migration):
    """Add the nullable ``pages_count`` column to Document and backfill PDFs."""

    dependencies = [
        ("documents", "1052_document_transaction_id"),
    ]

    operations = [
        # The field options repeat everything the model declares for
        # ``pages_count`` (verbose name, help text, validators). Django
        # compares these when detecting changes, so leaving any of them out
        # would make ``makemigrations`` produce an extra AlterField.
        migrations.AddField(
            model_name="document",
            name="pages_count",
            field=models.PositiveIntegerField(
                blank=False,
                help_text="The number of pages of the document.",
                null=True,
                unique=False,
                validators=[MinValueValidator(1)],
                verbose_name="pages count",
                db_index=False,
            ),
        ),
        # Must run after AddField: the backfill writes to the new column.
        migrations.RunPython(
            add_number_of_pages_to_pages_count,
            remove_number_of_pages_to_pages_count,
        ),
    ]

View File

@@ -205,6 +205,18 @@ class Document(SoftDeleteModel, ModelWithOwner):
help_text=_("The checksum of the archived document."),
)
# Optional page count of the document. null=True allows rows without a value;
# when a value is set, the validator requires it to be at least 1.
pages_count = models.PositiveIntegerField(
_("pages count"),
blank=False,
null=True,
unique=False,
db_index=False,
validators=[MinValueValidator(1)],
help_text=_(
"The number of pages of the document.",
),
)
created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)
modified = models.DateTimeField(
@@ -414,6 +426,7 @@ class SavedView(ModelWithOwner):
OWNER = ("owner", _("Owner"))
SHARED = ("shared", _("Shared"))
ASN = ("asn", _("ASN"))
PAGES_COUNT = ("pagescount", _("Pages"))
CUSTOM_FIELD = ("custom_field_%d", ("Custom Field"))
name = models.CharField(_("name"), max_length=128)

View File

@@ -367,6 +367,9 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type):
    """Return the metadata entries of the document; the base parser has none."""
    no_metadata = []
    return no_metadata
def get_pages_count(self, document_path, mime_type):
    """
    Return the number of pages of the document.

    The base parser does not count pages and always answers None; parsers
    that can determine a page count override this method.
    """
    return None
def parse(self, document_path, mime_type, file_name=None):
    """Parse the document; concrete parsers must provide the implementation."""
    raise NotImplementedError()

View File

@@ -759,6 +759,7 @@ class DocumentSerializer(
original_file_name = SerializerMethodField()
archived_file_name = SerializerMethodField()
created_date = serializers.DateField(required=False)
pages_count = SerializerMethodField()
custom_fields = CustomFieldInstanceSerializer(
many=True,
@@ -779,6 +780,9 @@ class DocumentSerializer(
required=False,
)
def get_pages_count(self, obj):
    """Serializer method field: expose the stored page count of *obj*."""
    pages_count = obj.pages_count
    return pages_count
def get_original_file_name(self, obj):
    """Serializer method field: expose ``original_filename`` of *obj*."""
    original_name = obj.original_filename
    return original_name
@@ -894,6 +898,7 @@ class DocumentSerializer(
"notes",
"custom_fields",
"remove_inbox_tags",
"pages_count",
)
list_serializer_class = OwnedObjectListSerializer

View File

@@ -361,6 +361,7 @@ class DocumentViewSet(
"archive_serial_number",
"num_notes",
"owner",
"pages_count",
)
def get_queryset(self):
@@ -444,6 +445,24 @@ class DocumentViewSet(
logger.warning(f"No parser for {mime_type}")
return []
def get_pages_count(self, file, mime_type):
    """
    Return the page count of *file* as reported by the parser for *mime_type*.

    :param file: path of the document on disk.
    :param mime_type: MIME type used to select the parser.
    :return: the parser's page count, or None when the file does not exist,
        no parser handles the MIME type, or the parser fails.
    """
    if not os.path.isfile(file):
        return None

    parser_class = get_parser_class_for_mime_type(mime_type)
    if not parser_class:  # pragma: no cover
        logger.warning(f"No parser for {mime_type}")
        return None

    parser = parser_class(progress_callback=None, logging_group=None)
    try:
        # The parser API takes the MIME type as well; omitting it raised a
        # TypeError on every call, which the handler below then swallowed.
        return parser.get_pages_count(file, mime_type)
    except Exception:  # pragma: no cover
        logger.exception(f"Issue getting pages count for {file}")
        # TODO: cover GPG errors, remove later.
        # A count is a number or None, never a list.
        return None
def get_filesize(self, filename):
if os.path.isfile(filename):
return os.stat(filename).st_size

View File

@@ -41,6 +41,15 @@ class RasterisedDocumentParser(DocumentParser):
"""
return OcrConfig()
def get_pages_count(self, document_path, mime_type):
    """
    Return the number of pages of a PDF document.

    :param document_path: path of the document on disk.
    :param mime_type: MIME type of the document.
    :return: the page count for "application/pdf", None for any other type.
    """
    if mime_type != "application/pdf":
        return None

    import pikepdf

    # Open through a context manager so the file handle is released as soon
    # as the pages have been counted, even if counting raises.
    with pikepdf.open(document_path) as pdf:
        return len(pdf.pages)
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":

View File

@@ -57,6 +57,20 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertContainsStrings(text.strip(), ["This is a test document."])
def test_get_pages_count(self):
    """The parser reports the expected page count for the sample PDFs."""
    parser = RasterisedDocumentParser(uuid.uuid4())
    expected_counts = (
        ("simple-digital.pdf", 1),
        ("multi-page-mixed.pdf", 6),
    )
    for sample_name, expected in expected_counts:
        sample_path = os.path.join(self.SAMPLE_FILES, sample_name)
        counted = parser.get_pages_count(sample_path, "application/pdf")
        self.assertEqual(counted, expected)
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(