Full testing and some fixes of pdf actions

This commit is contained in:
shamoon 2024-03-03 20:00:03 -08:00
parent 713598d86b
commit 2004b424b5
3 changed files with 270 additions and 25 deletions

View File

@ -4,6 +4,7 @@ import logging
import os import os
from typing import Optional from typing import Optional
from django.conf import settings
from django.db.models import Q from django.db.models import Q
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
@ -17,7 +18,6 @@ from documents.permissions import set_permissions_for_object
from documents.tasks import bulk_update_documents from documents.tasks import bulk_update_documents
from documents.tasks import consume_file from documents.tasks import consume_file
from documents.tasks import update_document_archive_file from documents.tasks import update_document_archive_file
from paperless import settings
logger = logging.getLogger("paperless.bulk_edit") logger = logging.getLogger("paperless.bulk_edit")
@ -160,6 +160,9 @@ def set_permissions(doc_ids, set_permissions, owner=None, merge=False):
def rotate(doc_ids: list[int], degrees: int): def rotate(doc_ids: list[int], degrees: int):
logger.info(
f"Attempting to rotate {len(doc_ids)} documents by {degrees} degrees.",
)
qs = Document.objects.filter(id__in=doc_ids) qs = Document.objects.filter(id__in=doc_ids)
affected_docs = [] affected_docs = []
import pikepdf import pikepdf
@ -175,10 +178,12 @@ def rotate(doc_ids: list[int], degrees: int):
update_document_archive_file.delay( update_document_archive_file.delay(
document_id=doc.id, document_id=doc.id,
) )
logger.info(f"Rotated document {doc.id} ({path}) by {degrees} degrees") logger.info(
f"Rotated document {doc.id} by {degrees} degrees",
)
affected_docs.append(doc.id) affected_docs.append(doc.id)
except Exception as e: except Exception as e:
logger.exception(f"Error rotating document {doc.id}", e) logger.exception(f"Error rotating document {doc.id}: {e}")
if len(affected_docs) > 0: if len(affected_docs) > 0:
bulk_update_documents.delay(document_ids=affected_docs) bulk_update_documents.delay(document_ids=affected_docs)
@ -187,30 +192,35 @@ def rotate(doc_ids: list[int], degrees: int):
def merge(doc_ids: list[int], metadata_document_id: Optional[int] = None): def merge(doc_ids: list[int], metadata_document_id: Optional[int] = None):
logger.info(
f"Attempting to merge {len(doc_ids)} documents into a single document.",
)
qs = Document.objects.filter(id__in=doc_ids) qs = Document.objects.filter(id__in=doc_ids)
affected_docs = []
import pikepdf import pikepdf
merged_pdf = pikepdf.new() merged_pdf = pikepdf.new()
# use doc_ids to preserve order # use doc_ids to preserve order
for doc_id in doc_ids: for doc_id in doc_ids:
doc = qs.get(id=doc_id) doc = qs.get(id=doc_id)
if doc is None:
continue
path = os.path.join(settings.ORIGINALS_DIR, str(doc.filename))
try: try:
with pikepdf.open(path, allow_overwriting_input=True) as pdf: with pikepdf.open(str(doc.source_path)) as pdf:
merged_pdf.pages.extend(pdf.pages) merged_pdf.pages.extend(pdf.pages)
affected_docs.append(doc.id)
except Exception as e: except Exception as e:
logger.exception( logger.exception(
f"Error merging document {doc.id}, it will not be included in the merge", f"Error merging document {doc.id}, it will not be included in the merge: {e}",
e,
) )
if len(affected_docs) == 0:
logger.warning("No documents were merged")
return "OK"
filepath = os.path.join( filepath = os.path.join(
settings.CONSUMPTION_DIR, settings.SCRATCH_DIR,
f"merged_{('_'.join([str(doc_id) for doc_id in doc_ids]))[:100]}.pdf", f"{'_'.join([str(doc_id) for doc_id in doc_ids])[:100]}_merged.pdf",
) )
merged_pdf.save(filepath) merged_pdf.save(filepath)
merged_pdf.close()
if metadata_document_id: if metadata_document_id:
metadata_document = qs.get(id=metadata_document_id) metadata_document = qs.get(id=metadata_document_id)
@ -239,19 +249,18 @@ def split(doc_ids: list[int], pages: list[list[int]]):
doc = Document.objects.get(id=doc_ids[0]) doc = Document.objects.get(id=doc_ids[0])
import pikepdf import pikepdf
path = os.path.join(settings.ORIGINALS_DIR, str(doc.filename))
try: try:
with pikepdf.open(path, allow_overwriting_input=True) as pdf: with pikepdf.open(doc.source_path) as pdf:
for idx, split_doc in enumerate(pages): for idx, split_doc in enumerate(pages):
dst = pikepdf.new() dst = pikepdf.new()
for page in split_doc: for page in split_doc:
dst.pages.append(pdf.pages[page - 1]) dst.pages.append(pdf.pages[page - 1])
filepath = os.path.join( filepath = os.path.join(
settings.CONSUMPTION_DIR, settings.SCRATCH_DIR,
f"{doc.filename}_{split_doc[0]}-{split_doc[-1]}.pdf", f"{doc.id}_{split_doc[0]}-{split_doc[-1]}.pdf",
) )
dst.save(filepath) dst.save(filepath)
dst.close()
overrides = DocumentMetadataOverrides().from_document(doc) overrides = DocumentMetadataOverrides().from_document(doc)
overrides.title = f"{doc.title} (split {idx + 1})" overrides.title = f"{doc.title} (split {idx + 1})"
@ -266,6 +275,6 @@ def split(doc_ids: list[int], pages: list[list[int]]):
overrides, overrides,
) )
except Exception as e: except Exception as e:
logger.exception(f"Error splitting document {doc.id}", e) logger.exception(f"Error splitting document {doc.id}: {e}")
return "OK" return "OK"

View File

@ -102,26 +102,30 @@ class DocumentMetadataOverrides:
overrides.storage_path_id = doc.storage_path.id if doc.storage_path else None overrides.storage_path_id = doc.storage_path.id if doc.storage_path else None
overrides.owner_id = doc.owner.id if doc.owner else None overrides.owner_id = doc.owner.id if doc.owner else None
overrides.tag_ids = list(doc.tags.values_list("id", flat=True)) overrides.tag_ids = list(doc.tags.values_list("id", flat=True))
overrides.view_users = get_users_with_perms( overrides.view_users = get_users_with_perms(
doc, doc,
only_with_perms_in=["view_document"], only_with_perms_in=["view_document"],
).values_list("id", flat=True) ).values_list("id", flat=True)
overrides.view_groups = get_groups_with_perms(
doc,
only_with_perms_in=["view_document"],
).values_list("id", flat=True)
overrides.change_users = get_users_with_perms( overrides.change_users = get_users_with_perms(
doc, doc,
only_with_perms_in=["change_document"], only_with_perms_in=["change_document"],
).values_list("id", flat=True) ).values_list("id", flat=True)
overrides.change_groups = get_groups_with_perms(
doc,
only_with_perms_in=["change_document"],
).values_list("id", flat=True)
overrides.custom_field_ids = list( overrides.custom_field_ids = list(
doc.custom_fields.values_list("id", flat=True), doc.custom_fields.values_list("id", flat=True),
) )
groups_with_perms = get_groups_with_perms(
doc,
attach_perms=True,
)
overrides.view_groups = [
group.id for group, perms in groups_with_perms if "view_document" in perms
]
overrides.change_groups = [
group.id for group, perms in groups_with_perms if "change_document" in perms
]
return overrides return overrides

View File

@ -1,3 +1,6 @@
import os
import shutil
from pathlib import Path
from unittest import mock from unittest import mock
from django.contrib.auth.models import Group from django.contrib.auth.models import Group
@ -275,3 +278,232 @@ class TestBulkEdit(DirectoriesMixin, TestCase):
self.doc1, self.doc1,
) )
self.assertEqual(groups_with_perms.count(), 2) self.assertEqual(groups_with_perms.count(), 2)
class TestPDFActions(DirectoriesMixin, TestCase):
    """Tests for the PDF bulk-edit actions: merge, split and rotate.

    Each test patches out the Celery task entry points (consume_file,
    update_document_archive_file, bulk_update_documents) so no real
    consumption or PDF post-processing happens; only the bulk_edit
    functions themselves are exercised against sample PDFs copied into
    the per-test scratch/archive directories provided by DirectoriesMixin.
    """

    def setUp(self):
        """Copy sample PDFs into the test dirs and create matching Documents.

        Creates:
        - doc1 (with an archive copy), doc2 (with an archive copy), doc3
          backed by the sample PDFs 0000001–0000003
        - img_doc backed by a JPEG, to cover the non-PDF case
        """
        super().setUp()
        # doc1: original PDF in the scratch dir...
        sample1 = os.path.join(self.dirs.scratch_dir, "sample.pdf")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "documents",
                "originals",
                "0000001.pdf",
            ),
            sample1,
        )
        # ...plus an archive copy (same source PDF) in the archive dir.
        sample1_archive = os.path.join(self.dirs.archive_dir, "sample_archive.pdf")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "documents",
                "originals",
                "0000001.pdf",
            ),
            sample1_archive,
        )
        # doc2: original and archive copies of a second sample PDF.
        sample2 = os.path.join(self.dirs.scratch_dir, "sample2.pdf")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "documents",
                "originals",
                "0000002.pdf",
            ),
            sample2,
        )
        sample2_archive = os.path.join(self.dirs.archive_dir, "sample2_archive.pdf")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "documents",
                "originals",
                "0000002.pdf",
            ),
            sample2_archive,
        )
        # doc3: original only, no archive version.
        sample3 = os.path.join(self.dirs.scratch_dir, "sample3.pdf")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "documents",
                "originals",
                "0000003.pdf",
            ),
            sample3,
        )
        self.doc1 = Document.objects.create(checksum="A", title="A", filename=sample1)
        self.doc1.archive_filename = sample1_archive
        self.doc1.save()
        self.doc2 = Document.objects.create(checksum="B", title="B", filename=sample2)
        self.doc2.archive_filename = sample2_archive
        self.doc2.save()
        self.doc3 = Document.objects.create(checksum="C", title="C", filename=sample3)
        # A non-PDF (JPEG) document, for actions that must cope with images.
        img_doc = os.path.join(self.dirs.scratch_dir, "sample_image.jpg")
        shutil.copy(
            os.path.join(
                os.path.dirname(__file__),
                "samples",
                "simple.jpg",
            ),
            img_doc,
        )
        self.img_doc = Document.objects.create(
            checksum="D",
            title="D",
            filename=img_doc,
        )

    @mock.patch("documents.tasks.consume_file.delay")
    def test_merge(self, mock_consume_file):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Merge action is called with 3 documents
        THEN:
            - Consume file should be called
        """
        doc_ids = [self.doc1.id, self.doc2.id, self.doc3.id]
        metadata_document_id = self.doc1.id

        result = bulk_edit.merge(doc_ids)

        # merge() names its output "<joined ids, capped at 100 chars>_merged.pdf".
        expected_filename = (
            f"{'_'.join([str(doc_id) for doc_id in doc_ids])[:100]}_merged.pdf"
        )

        mock_consume_file.assert_called()
        consume_file_args, _ = mock_consume_file.call_args
        self.assertEqual(
            Path(consume_file_args[0].original_file).name,
            expected_filename,
        )
        # Without metadata_document_id no title override is set.
        self.assertEqual(consume_file_args[1].title, None)

        # With metadata_document_id overrides: the merged doc inherits
        # doc1's metadata and gets a "(merged)" title suffix.
        result = bulk_edit.merge(doc_ids, metadata_document_id=metadata_document_id)
        consume_file_args, _ = mock_consume_file.call_args
        self.assertEqual(consume_file_args[1].title, "A (merged)")

        self.assertEqual(result, "OK")

    @mock.patch("documents.tasks.consume_file.delay")
    @mock.patch("pikepdf.open")
    def test_merge_with_errors(self, mock_open_pdf, mock_consume_file):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Merge action is called with 2 documents
            - Error occurs when opening both files
        THEN:
            - Consume file should not be called
        """
        mock_open_pdf.side_effect = Exception("Error opening PDF")
        doc_ids = [self.doc2.id, self.doc3.id]

        # merge() logs the failure per-document and skips it; with every
        # document failing, nothing should be handed off for consumption.
        with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
            bulk_edit.merge(doc_ids)
            error_str = cm.output[0]
            expected_str = (
                "Error merging document 2, it will not be included in the merge"
            )
            self.assertIn(expected_str, error_str)

        mock_consume_file.assert_not_called()

    @mock.patch("documents.tasks.consume_file.delay")
    def test_split(self, mock_consume_file):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Split action is called with 1 document and 2 pages
        THEN:
            - Consume file should be called twice
        """
        doc_ids = [self.doc2.id]
        pages = [[1, 2], [3]]
        result = bulk_edit.split(doc_ids, pages)
        # One consume per page group.
        self.assertEqual(mock_consume_file.call_count, 2)
        consume_file_args, _ = mock_consume_file.call_args
        # Last call corresponds to the second split; title carries the index.
        self.assertEqual(consume_file_args[1].title, "B (split 2)")

        self.assertEqual(result, "OK")

    @mock.patch("documents.tasks.consume_file.delay")
    @mock.patch("pikepdf.Pdf.save")
    def test_split_with_errors(self, mock_save_pdf, mock_consume_file):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Split action is called with 1 document and 2 page groups
            - Error occurs when saving the files
        THEN:
            - Consume file should not be called
        """
        mock_save_pdf.side_effect = Exception("Error saving PDF")
        doc_ids = [self.doc2.id]
        pages = [[1, 2], [3]]

        with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
            bulk_edit.split(doc_ids, pages)
            error_str = cm.output[0]
            expected_str = "Error splitting document 2"
            self.assertIn(expected_str, error_str)

        mock_consume_file.assert_not_called()

    @mock.patch("documents.tasks.bulk_update_documents.delay")
    @mock.patch("documents.tasks.update_document_archive_file.delay")
    def test_rotate(self, mock_update_document, mock_update_documents):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Rotate action is called with 2 documents
        THEN:
            - Rotate action should be called twice
        """
        doc_ids = [self.doc1.id, self.doc2.id]
        result = bulk_edit.rotate(doc_ids, 90)
        # One archive regeneration per rotated document, and one batched
        # index update for all affected documents.
        self.assertEqual(mock_update_document.call_count, 2)
        mock_update_documents.assert_called_once()
        self.assertEqual(result, "OK")

    @mock.patch("documents.tasks.bulk_update_documents.delay")
    @mock.patch("documents.tasks.update_document_archive_file.delay")
    @mock.patch("pikepdf.Pdf.save")
    def test_rotate_with_error(
        self,
        mock_pdf_save,
        mock_update_archive_file,
        mock_update_documents,
    ):
        """
        GIVEN:
            - Existing documents
        WHEN:
            - Rotate action is called with 2 documents
            - PikePDF raises an error
        THEN:
            - Rotate action should be called 0 times
        """
        mock_pdf_save.side_effect = Exception("Error saving PDF")
        doc_ids = [self.doc2.id, self.doc3.id]

        with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
            bulk_edit.rotate(doc_ids, 90)
            error_str = cm.output[0]
            expected_str = "Error rotating document"
            self.assertIn(expected_str, error_str)
        # No document rotated successfully, so no archive regeneration.
        mock_update_archive_file.assert_not_called()