Feature: number of pages of document in documents list

2024-09-21 18:18:19 +00:00
parent 609fa9a212
commit 865856b06d
19 changed files with 318 additions and 58 deletions
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -586,6 +586,7 @@ class ConsumerPlugin(
        date = None
        thumbnail = None
        archive_path = None
+        pages_count = None

        try:
            self._send_progress(
@@ -621,6 +622,7 @@ class ConsumerPlugin(
                )
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()
+            pages_count = document_parser.get_pages_count(self.working_copy, mime_type)

        except ParseError as e:
            document_parser.cleanup()
@@ -662,7 +664,12 @@ class ConsumerPlugin(
        try:
            with transaction.atomic():
                # store the document.
-                document = self._store(text=text, date=date, mime_type=mime_type)
+                document = self._store(
+                    text=text,
+                    date=date,
+                    pages_count=pages_count,
+                    mime_type=mime_type,
+                )

                # If we get here, it was successful. Proceed with post-consume
                # hooks. If they fail, nothing will get changed.
@@ -790,6 +797,7 @@ class ConsumerPlugin(
        self,
        text: str,
        date: Optional[datetime.datetime],
+        pages_count: Optional[int],
        mime_type: str,
    ) -> Document:
        # If someone gave us the original filename, use it instead of doc.
@@ -835,6 +843,7 @@ class ConsumerPlugin(
            created=create_date,
            modified=create_date,
            storage_type=storage_type,
+            pages_count=pages_count,
            original_filename=self.filename,
        )

--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -80,6 +80,7 @@ def get_schema():
        has_owner=BOOLEAN(),
        viewer_id=KEYWORD(commas=True),
        checksum=TEXT(),
+        pages_count=NUMERIC(sortable=True),
        original_filename=TEXT(sortable=True),
        is_shared=BOOLEAN(),
    )
@@ -181,6 +182,7 @@ def update_document(writer: AsyncWriter, doc: Document):
        has_owner=doc.owner is not None,
        viewer_id=viewer_ids if viewer_ids else None,
        checksum=doc.checksum,
+        pages_count=doc.pages_count,
        original_filename=doc.original_filename,
        is_shared=len(viewer_ids) > 0,
    )
@@ -247,6 +249,7 @@ class DelayedQuery:
            "archive_serial_number": "asn",
            "num_notes": "num_notes",
            "owner": "owner",
+            "pages_count": "pages_count",
        }

        if field.startswith("-"):
--- a/src/documents/migrations/1053_document_pages_count.py
+++ b/src/documents/migrations/1053_document_pages_count.py
@@ -0,0 +1,109 @@
+# Generated by Django 4.2.16 on 2024-09-21 15:44
+
+import datetime
+from pathlib import Path
+
+import django.core.validators
+import pikepdf
+from django.conf import settings
+from django.db import migrations
+from django.db import models
+from django.utils import timezone
+from django.utils.termcolors import colorize as colourise
+
+from documents.parsers import get_default_file_extension
+
+
+class Document:
+    """
+    Django's migrations restrict access to model methods, so this is a snapshot
+    of the methods that existed at the time this migration was written, since
+    we need to make use of a lot of these shortcuts here.
+    """
+
+    def __init__(self, doc):
+        self.pk = doc.pk
+        self.correspondent = doc.correspondent
+        self.title = doc.title
+        self.mime_type = doc.mime_type
+        self.filename = doc.filename
+        self.created = doc.created
+
+    def __str__(self) -> str:
+        # Convert UTC database time to local time
+        res = datetime.date.isoformat(timezone.localdate(self.created))
+
+        if self.correspondent:
+            res += f" {self.correspondent}"
+        if self.title:
+            res += f" {self.title}"
+        return res
+
+    @property
+    def file_type(self):
+        return get_default_file_extension(self.mime_type)
+
+    @property
+    def source_path(self) -> Path:
+        # ``fname`` used to be unbound (UnboundLocalError) when no filename was
+        # stored. NOTE(review): pk-based fallback assumed to match the model's
+        # default naming - confirm (encrypted originals are not considered).
+        fname = str(self.filename or f"{self.pk:07}{self.file_type}")
+        return (settings.ORIGINALS_DIR / Path(fname)).resolve()
+
+
+def add_number_of_pages_to_pages_count(apps, schema_editor):
+    """Backfill pages_count for stored PDFs, skipping files that cannot be read."""
+    documentModel = apps.get_model("documents", "Document")
+
+    for doc in documentModel.objects.filter(mime_type="application/pdf"):
+        document = Document(doc)
+
+        print(
+            "    {} {} {}".format(
+                colourise("*", fg="green"),
+                colourise("Calculating number of pages for", fg="white"),
+                colourise(document.filename, fg="cyan"),
+            ),
+        )
+
+        try:
+            with pikepdf.open(document.source_path) as pdf:  # closes the handle
+                doc.pages_count = len(pdf.pages)
+        except Exception as e:
+            print(f"Error retrieving number of pages for {document.filename}: {e}")
+            continue
+        doc.save(update_fields=["pages_count"])  # write only the new column
+
+
+def remove_number_of_pages_to_pages_count(apps, schema_editor):
+    """Reverse step: clear the stored page counts of all PDF documents."""
+    documentModel = apps.get_model("documents", "Document")
+
+    # The column is nullable, so "no count" is stored as NULL; 0 would fall
+    # below the model's MinValueValidator(1). One bulk UPDATE, no per-row save.
+    documentModel.objects.filter(mime_type="application/pdf").update(
+        pages_count=None,
+    )
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("documents", "1052_document_transaction_id"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="document",
+            name="pages_count",
+            # Kept in sync with Document.pages_count in models.py (verbose name,
+            # help text, validators) so no follow-up AlterField is detected.
+            field=models.PositiveIntegerField(
+                blank=False,
+                help_text="The number of pages of the document.",
+                null=True,
+                unique=False,
+                validators=[django.core.validators.MinValueValidator(1)],
+                verbose_name="pages count",
+                db_index=False,
+            ),
+        ),
+        # Data step: runs after the column exists; second callable is the reverse.
+        migrations.RunPython(
+            add_number_of_pages_to_pages_count,
+            remove_number_of_pages_to_pages_count,
+        ),
+    ]
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -205,6 +205,18 @@ class Document(SoftDeleteModel, ModelWithOwner):
        help_text=_("The checksum of the archived document."),
    )

+    pages_count = models.PositiveIntegerField(
+        _("pages count"),
+        blank=False,
+        null=True,
+        unique=False,
+        db_index=False,
+        validators=[MinValueValidator(1)],
+        help_text=_(
+            "The number of pages of the document.",
+        ),
+    )
+
    created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)

    modified = models.DateTimeField(
@@ -414,6 +426,7 @@ class SavedView(ModelWithOwner):
        OWNER = ("owner", _("Owner"))
        SHARED = ("shared", _("Shared"))
        ASN = ("asn", _("ASN"))
+        PAGES_COUNT = ("pagescount", _("Pages"))
        CUSTOM_FIELD = ("custom_field_%d", ("Custom Field"))

    name = models.CharField(_("name"), max_length=128)
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -367,6 +367,9 @@ class DocumentParser(LoggingMixin):
    def extract_metadata(self, document_path, mime_type):
        return []

+    def get_pages_count(self, document_path, mime_type):
+        return None  # base default: no page count; subclasses may override
+
    def parse(self, document_path, mime_type, file_name=None):
        raise NotImplementedError

--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -759,6 +759,7 @@ class DocumentSerializer(
    original_file_name = SerializerMethodField()
    archived_file_name = SerializerMethodField()
    created_date = serializers.DateField(required=False)
+    pages_count = SerializerMethodField()

    custom_fields = CustomFieldInstanceSerializer(
        many=True,
@@ -779,6 +780,9 @@ class DocumentSerializer(
        required=False,
    )

+    def get_pages_count(self, obj):
+        return obj.pages_count  # backs the pages_count SerializerMethodField
+
    def get_original_file_name(self, obj):
        return obj.original_filename

@@ -894,6 +898,7 @@ class DocumentSerializer(
            "notes",
            "custom_fields",
            "remove_inbox_tags",
+            "pages_count",
        )
        list_serializer_class = OwnedObjectListSerializer

--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -361,6 +361,7 @@ class DocumentViewSet(
        "archive_serial_number",
        "num_notes",
        "owner",
+        "pages_count",
    )

    def get_queryset(self):
@@ -444,6 +445,24 @@ class DocumentViewSet(
            logger.warning(f"No parser for {mime_type}")
            return []

+    def get_pages_count(self, file, mime_type):
+        """Return the parser-reported page count of `file`, or None if unknown."""
+        if not os.path.isfile(file):
+            return None
+
+        parser_class = get_parser_class_for_mime_type(mime_type)
+        if parser_class:
+            parser = parser_class(progress_callback=None, logging_group=None)
+            try:
+                return parser.get_pages_count(file, mime_type)
+            except Exception:  # pragma: no cover
+                logger.exception(f"Issue getting pages count for {file}")
+                # TODO: cover GPG errors, remove later.
+                return None
+        else:  # pragma: no cover
+            logger.warning(f"No parser for {mime_type}")
+            return None
+
    def get_filesize(self, filename):
        if os.path.isfile(filename):
            return os.stat(filename).st_size
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -41,6 +41,15 @@ class RasterisedDocumentParser(DocumentParser):
        """
        return OcrConfig()

+    def get_pages_count(self, document_path, mime_type):
+        """Return the number of pages of a PDF; None for any other MIME type."""
+        if mime_type != "application/pdf":
+            return None
+        import pikepdf
+
+        with pikepdf.open(document_path) as pdf:  # ensures the file is closed
+            return len(pdf.pages)
+
    def extract_metadata(self, document_path, mime_type):
        result = []
        if mime_type == "application/pdf":
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -57,6 +57,20 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        self.assertContainsStrings(text.strip(), ["This is a test document."])

+    def test_get_pages_count(self):
+        """
+        GIVEN: sample PDFs expected to have one and six pages
+        WHEN: the parser is asked for their page count
+        THEN: the expected number of pages is reported
+        """
+        parser = RasterisedDocumentParser(uuid.uuid4())
+        for sample, expected in (
+            ("simple-digital.pdf", 1),
+            ("multi-page-mixed.pdf", 6),
+        ):
+            path = os.path.join(self.SAMPLE_FILES, sample)
+            self.assertEqual(parser.get_pages_count(path, "application/pdf"), expected)
+
    def test_thumbnail(self):
        parser = RasterisedDocumentParser(uuid.uuid4())
        thumb = parser.get_thumbnail(