diff --git a/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.spec.ts b/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.spec.ts
index cb120bb64..ef36b0806 100644
--- a/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.spec.ts
+++ b/src-ui/src/app/components/dashboard/widgets/saved-view-widget/saved-view-widget.component.spec.ts
@@ -65,6 +65,7 @@ const savedView: SavedView = {
DisplayField.CORRESPONDENT,
DisplayField.DOCUMENT_TYPE,
DisplayField.STORAGE_PATH,
+ DisplayField.PAGE_COUNT,
`${DisplayField.CUSTOM_FIELD}11` as any,
`${DisplayField.CUSTOM_FIELD}15` as any,
],
@@ -344,6 +345,7 @@ describe('SavedViewWidgetComponent', () => {
expect(component.getColumnTitle(DisplayField.STORAGE_PATH)).toEqual(
'Storage path'
)
+ expect(component.getColumnTitle(DisplayField.PAGE_COUNT)).toEqual('Pages')
})
it('should get correct column title for custom field', () => {
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
index 1a8c7df82..f60056c42 100644
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.html
@@ -111,6 +111,12 @@
}
}
+ @if (displayFields.includes(DisplayField.PAGE_COUNT) && document.page_count) {
+
+
+ {document.page_count, plural, =1 {1 page} other {{{document.page_count}} pages}}
+
+ }
@if (displayFields.includes(DisplayField.OWNER) && document.owner && document.owner !== settingsService.currentUser.id) {
{{document.owner | username}}
diff --git a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts
index a3f047f03..efd5076be 100644
--- a/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts
+++ b/src-ui/src/app/components/document-list/document-card-large/document-card-large.component.spec.ts
@@ -31,6 +31,7 @@ const doc = {
correspondent: 8,
document_type: 10,
storage_path: null,
+ page_count: 8,
notes: [
{
id: 11,
@@ -80,6 +81,7 @@ describe('DocumentCardLargeComponent', () => {
it('should display a document', () => {
expect(fixture.nativeElement.textContent).toContain('Document 10')
expect(fixture.nativeElement.textContent).toContain('Cupcake ipsum')
+ expect(fixture.nativeElement.textContent).toContain('8 pages')
})
it('should show preview on mouseover after delay to preload content', fakeAsync(() => {
diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html
index 92449214e..26f71ee8b 100644
--- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html
+++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.html
@@ -88,6 +88,14 @@
}
+ @if (displayFields.includes(DisplayField.PAGE_COUNT) && document.page_count) {
+
+
+
+ {document.page_count, plural, =1 {1 page} other {{{document.page_count}} pages}}
+
+
+ }
@if (displayFields.includes(DisplayField.ASN) && document.archive_serial_number | isNumber) {
diff --git a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.spec.ts b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.spec.ts
index fc15453be..b86453a25 100644
--- a/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.spec.ts
+++ b/src-ui/src/app/components/document-list/document-card-small/document-card-small.component.spec.ts
@@ -34,6 +34,7 @@ const doc = {
correspondent: 8,
document_type: 10,
storage_path: null,
+ page_count: 12,
notes: [
{
id: 11,
@@ -91,6 +92,10 @@ describe('DocumentCardSmallComponent', () => {
fixture.detectChanges()
})
+ it('should display page count', () => {
+ expect(fixture.nativeElement.textContent).toContain('12 pages')
+ })
+
it('should display a document, limit tags to 5', () => {
expect(fixture.nativeElement.textContent).toContain('Document 10')
expect(
diff --git a/src-ui/src/app/components/document-list/document-list.component.html b/src-ui/src/app/components/document-list/document-list.component.html
index 5ba14631e..ed5e8646a 100644
--- a/src-ui/src/app/components/document-list/document-list.component.html
+++ b/src-ui/src/app/components/document-list/document-list.component.html
@@ -246,6 +246,15 @@
(sort)="onSort($event)"
i18n>Added
}
+ @if (activeDisplayFields.includes(DisplayField.PAGE_COUNT)) {
+
Pages |
+ }
@if (activeDisplayFields.includes(DisplayField.SHARED)) {
Shared
@@ -330,6 +339,11 @@
{{d.added | customDate}}
}
+ @if (activeDisplayFields.includes(DisplayField.PAGE_COUNT)) {
+ |
+ {{ d.page_count }}
+ |
+ }
@if (activeDisplayFields.includes(DisplayField.SHARED)) {
@if (d.is_shared_by_requester) { Yes } @else { No }
diff --git a/src-ui/src/app/components/document-list/document-list.component.spec.ts b/src-ui/src/app/components/document-list/document-list.component.spec.ts
index 26758b3c0..ad85652b8 100644
--- a/src-ui/src/app/components/document-list/document-list.component.spec.ts
+++ b/src-ui/src/app/components/document-list/document-list.component.spec.ts
@@ -602,7 +602,7 @@ describe('DocumentListComponent', () => {
expect(
fixture.debugElement.queryAll(By.directive(SortableDirective))
- ).toHaveLength(9)
+ ).toHaveLength(10)
expect(component.notesEnabled).toBeTruthy()
settingsService.set(SETTINGS_KEYS.NOTES_ENABLED, false)
@@ -610,14 +610,14 @@ describe('DocumentListComponent', () => {
expect(component.notesEnabled).toBeFalsy()
expect(
fixture.debugElement.queryAll(By.directive(SortableDirective))
- ).toHaveLength(8)
+ ).toHaveLength(9)
// insufficient perms
jest.spyOn(permissionService, 'currentUserCan').mockReturnValue(false)
fixture.detectChanges()
expect(
fixture.debugElement.queryAll(By.directive(SortableDirective))
- ).toHaveLength(4)
+ ).toHaveLength(5)
})
it('should support toggle on document objects', () => {
diff --git a/src-ui/src/app/data/document.ts b/src-ui/src/app/data/document.ts
index 1571d2a53..0b630b8cd 100644
--- a/src-ui/src/app/data/document.ts
+++ b/src-ui/src/app/data/document.ts
@@ -26,6 +26,7 @@ export enum DisplayField {
OWNER = 'owner',
SHARED = 'shared',
ASN = 'asn',
+ PAGE_COUNT = 'pagecount',
}
export const DEFAULT_DISPLAY_FIELDS = [
@@ -73,6 +74,10 @@ export const DEFAULT_DISPLAY_FIELDS = [
id: DisplayField.ASN,
name: $localize`ASN`,
},
+ {
+ id: DisplayField.PAGE_COUNT,
+ name: $localize`Pages`,
+ },
]
export const DEFAULT_DASHBOARD_VIEW_PAGE_SIZE = 10
@@ -94,6 +99,7 @@ export const DOCUMENT_SORT_FIELDS = [
{ field: 'modified', name: $localize`Modified` },
{ field: 'num_notes', name: $localize`Notes` },
{ field: 'owner', name: $localize`Owner` },
+ { field: 'page_count', name: $localize`Pages` },
]
export const DOCUMENT_SORT_FIELDS_FULLTEXT = [
@@ -164,4 +170,6 @@ export interface Document extends ObjectWithPermissions {
// write-only field
remove_inbox_tags?: boolean
+
+ page_count?: number
}
diff --git a/src-ui/src/app/services/settings.service.ts b/src-ui/src/app/services/settings.service.ts
index 91d1cc320..c3ea3f856 100644
--- a/src-ui/src/app/services/settings.service.ts
+++ b/src-ui/src/app/services/settings.service.ts
@@ -345,6 +345,7 @@ export class SettingsService {
DisplayField.CREATED,
DisplayField.ADDED,
DisplayField.ASN,
+ DisplayField.PAGE_COUNT,
DisplayField.SHARED,
].includes(field.id)
) {
diff --git a/src-ui/src/styles.scss b/src-ui/src/styles.scss
index 412324142..c83ebd493 100644
--- a/src-ui/src/styles.scss
+++ b/src-ui/src/styles.scss
@@ -680,3 +680,13 @@ code {
}
}
}
+
+// pdfjs
+canvas.hiddenCanvasElement {
+ position: absolute;
+ left: 0;
+ bottom: 0;
+ z-index: -1;
+ height: 0;
+ width: 0;
+}
diff --git a/src/documents/barcodes.py b/src/documents/barcodes.py
index 97177cbf6..746d6014d 100644
--- a/src/documents/barcodes.py
+++ b/src/documents/barcodes.py
@@ -3,7 +3,6 @@ import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
-from typing import Optional
from django.conf import settings
from pdf2image import convert_from_path
@@ -81,7 +80,7 @@ class BarcodePlugin(ConsumeTaskPlugin):
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
- def run(self) -> Optional[str]:
+ def run(self) -> str | None:
# Some operations may use PIL, override pixel setting if needed
maybe_override_pixel_limit()
@@ -299,7 +298,7 @@ class BarcodePlugin(ConsumeTaskPlugin):
)
@property
- def asn(self) -> Optional[int]:
+ def asn(self) -> int | None:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
@@ -334,7 +333,7 @@ class BarcodePlugin(ConsumeTaskPlugin):
return asn
@property
- def tags(self) -> Optional[list[int]]:
+ def tags(self) -> list[int] | None:
"""
Search the parsed barcodes for any tags.
Returns the detected tag ids (or empty list)
diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py
index 1f7a2a403..1aba8f9ec 100644
--- a/src/documents/bulk_edit.py
+++ b/src/documents/bulk_edit.py
@@ -3,7 +3,6 @@ import itertools
import logging
import os
import tempfile
-from typing import Optional
from celery import chain
from celery import chord
@@ -242,7 +241,7 @@ def rotate(doc_ids: list[int], degrees: int):
def merge(
doc_ids: list[int],
- metadata_document_id: Optional[int] = None,
+ metadata_document_id: int | None = None,
delete_originals: bool = False,
user: User = None,
):
@@ -387,6 +386,8 @@ def delete_pages(doc_ids: list[int], pages: list[int]):
pdf.remove_unreferenced_resources()
pdf.save()
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
+ if doc.page_count is not None:
+ doc.page_count = doc.page_count - len(pages)
doc.save()
update_document_archive_file.delay(document_id=doc.id)
logger.info(f"Deleted pages {pages} from document {doc.id}")
diff --git a/src/documents/caching.py b/src/documents/caching.py
index 4bcb22e21..6eb2b691f 100644
--- a/src/documents/caching.py
+++ b/src/documents/caching.py
@@ -19,8 +19,8 @@ logger = logging.getLogger("paperless.caching")
class MetadataCacheData:
original_checksum: str
original_metadata: list
- archive_checksum: Optional[str]
- archive_metadata: Optional[list]
+ archive_checksum: str | None
+ archive_metadata: list | None
@dataclass(frozen=True)
@@ -46,7 +46,7 @@ def get_suggestion_cache_key(document_id: int) -> str:
return f"doc_{document_id}_suggest"
-def get_suggestion_cache(document_id: int) -> Optional[SuggestionCacheData]:
+def get_suggestion_cache(document_id: int) -> SuggestionCacheData | None:
"""
If possible, return the cached suggestions for the given document ID.
The classifier needs to be matching in format and hash and the suggestions need to
@@ -121,13 +121,13 @@ def get_metadata_cache_key(document_id: int) -> str:
return f"doc_{document_id}_metadata"
-def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
+def get_metadata_cache(document_id: int) -> MetadataCacheData | None:
"""
Returns the cached document metadata for the given document ID, as long as the metadata
was cached once and the checksums have not changed
"""
doc_key = get_metadata_cache_key(document_id)
- doc_metadata: Optional[MetadataCacheData] = cache.get(doc_key)
+ doc_metadata: MetadataCacheData | None = cache.get(doc_key)
# The metadata exists in the cache
if doc_metadata is not None:
try:
@@ -161,7 +161,7 @@ def get_metadata_cache(document_id: int) -> Optional[MetadataCacheData]:
def set_metadata_cache(
document: Document,
original_metadata: list,
- archive_metadata: Optional[list],
+ archive_metadata: list | None,
*,
timeout=CACHE_50_MINUTES,
) -> None:
diff --git a/src/documents/classifier.py b/src/documents/classifier.py
index 66b06d69d..26a1ae478 100644
--- a/src/documents/classifier.py
+++ b/src/documents/classifier.py
@@ -78,9 +78,9 @@ class DocumentClassifier:
def __init__(self):
# last time a document changed and therefore training might be required
- self.last_doc_change_time: Optional[datetime] = None
+ self.last_doc_change_time: datetime | None = None
# Hash of primary keys of AUTO matching values last used in training
- self.last_auto_type_hash: Optional[bytes] = None
+ self.last_auto_type_hash: bytes | None = None
self.data_vectorizer = None
self.tags_binarizer = None
@@ -408,7 +408,7 @@ class DocumentClassifier:
return content
- def predict_correspondent(self, content: str) -> Optional[int]:
+ def predict_correspondent(self, content: str) -> int | None:
if self.correspondent_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
@@ -419,7 +419,7 @@ class DocumentClassifier:
else:
return None
- def predict_document_type(self, content: str) -> Optional[int]:
+ def predict_document_type(self, content: str) -> int | None:
if self.document_type_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
@@ -451,7 +451,7 @@ class DocumentClassifier:
else:
return []
- def predict_storage_path(self, content: str) -> Optional[int]:
+ def predict_storage_path(self, content: str) -> int | None:
if self.storage_path_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
diff --git a/src/documents/conditionals.py b/src/documents/conditionals.py
index 14fe3096a..47d9bfe4b 100644
--- a/src/documents/conditionals.py
+++ b/src/documents/conditionals.py
@@ -1,6 +1,5 @@
from datetime import datetime
from datetime import timezone
-from typing import Optional
from django.conf import settings
from django.core.cache import cache
@@ -15,7 +14,7 @@ from documents.classifier import DocumentClassifier
from documents.models import Document
-def suggestions_etag(request, pk: int) -> Optional[str]:
+def suggestions_etag(request, pk: int) -> str | None:
"""
Returns an optional string for the ETag, allowing browser caching of
suggestions if the classifier has not been changed and the suggested dates
@@ -42,7 +41,7 @@ def suggestions_etag(request, pk: int) -> Optional[str]:
return None
-def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
+def suggestions_last_modified(request, pk: int) -> datetime | None:
"""
Returns the datetime of classifier last modification. This is slightly off,
as there is not way to track the suggested date setting modification, but it seems
@@ -67,7 +66,7 @@ def suggestions_last_modified(request, pk: int) -> Optional[datetime]:
return None
-def metadata_etag(request, pk: int) -> Optional[str]:
+def metadata_etag(request, pk: int) -> str | None:
"""
Metadata is extracted from the original file, so use its checksum as the
ETag
@@ -80,7 +79,7 @@ def metadata_etag(request, pk: int) -> Optional[str]:
return None
-def metadata_last_modified(request, pk: int) -> Optional[datetime]:
+def metadata_last_modified(request, pk: int) -> datetime | None:
"""
Metadata is extracted from the original file, so use its modified. Strictly speaking, this is
not the modification of the original file, but of the database object, but might as well
@@ -94,7 +93,7 @@ def metadata_last_modified(request, pk: int) -> Optional[datetime]:
return None
-def preview_etag(request, pk: int) -> Optional[str]:
+def preview_etag(request, pk: int) -> str | None:
"""
ETag for the document preview, using the original or archive checksum, depending on the request
"""
@@ -110,7 +109,7 @@ def preview_etag(request, pk: int) -> Optional[str]:
return None
-def preview_last_modified(request, pk: int) -> Optional[datetime]:
+def preview_last_modified(request, pk: int) -> datetime | None:
"""
Uses the documents modified time to set the Last-Modified header. Not strictly
speaking correct, but close enough and quick
@@ -123,7 +122,7 @@ def preview_last_modified(request, pk: int) -> Optional[datetime]:
return None
-def thumbnail_last_modified(request, pk: int) -> Optional[datetime]:
+def thumbnail_last_modified(request, pk: int) -> datetime | None:
"""
Returns the filesystem last modified either from cache or from filesystem.
Cache should be (slightly?) faster than filesystem
diff --git a/src/documents/consumer.py b/src/documents/consumer.py
index d90b88f5a..803d82510 100644
--- a/src/documents/consumer.py
+++ b/src/documents/consumer.py
@@ -5,8 +5,6 @@ import tempfile
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING
-from typing import Optional
-from typing import Union
import magic
from django.conf import settings
@@ -61,7 +59,7 @@ class WorkflowTriggerPlugin(
):
NAME: str = "WorkflowTriggerPlugin"
- def run(self) -> Optional[str]:
+ def run(self) -> str | None:
"""
Get overrides from matching workflows
"""
@@ -278,7 +276,7 @@ class ConsumerPlugin(
current_progress: int,
max_progress: int,
status: ProgressStatusOptions,
- message: Optional[Union[ConsumerStatusShortMessage, str]] = None,
+ message: ConsumerStatusShortMessage | str | None = None,
document_id=None,
): # pragma: no cover
self.status_mgr.send_progress(
@@ -294,10 +292,10 @@ class ConsumerPlugin(
def _fail(
self,
- message: Union[ConsumerStatusShortMessage, str],
- log_message: Optional[str] = None,
+ message: ConsumerStatusShortMessage | str,
+ log_message: str | None = None,
exc_info=None,
- exception: Optional[Exception] = None,
+ exception: Exception | None = None,
):
self._send_progress(100, 100, ProgressStatusOptions.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
@@ -532,6 +530,7 @@ class ConsumerPlugin(
)
self.working_copy = Path(tempdir.name) / Path(self.filename)
copy_file_with_basic_stats(self.input_doc.original_file, self.working_copy)
+ self.unmodified_original = None
# Determine the parser class.
@@ -539,11 +538,40 @@ class ConsumerPlugin(
self.log.debug(f"Detected mime type: {mime_type}")
+ if (
+ Path(self.filename).suffix.lower() == ".pdf"
+ and mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ ):
+ try:
+ # The file might be a pdf, but the mime type is wrong.
+ # Try to clean with qpdf
+ self.log.debug(
+ "Detected possible PDF with wrong mime type, trying to clean with qpdf",
+ )
+ run_subprocess(
+ [
+ "qpdf",
+ "--replace-input",
+ self.working_copy,
+ ],
+ logger=self.log,
+ )
+ mime_type = magic.from_file(self.working_copy, mime=True)
+ self.log.debug(f"Detected mime type after qpdf: {mime_type}")
+ # Save the original file for later
+ self.unmodified_original = (
+ Path(tempdir.name) / Path("uo") / Path(self.filename)
+ )
+ copy_file_with_basic_stats(
+ self.input_doc.original_file,
+ self.unmodified_original,
+ )
+ except Exception as e:
+ self.log.error(f"Error attempting to clean PDF: {e}")
+
# Based on the mime type, get the parser for that type
- parser_class: Optional[type[DocumentParser]] = (
- get_parser_class_for_mime_type(
- mime_type,
- )
+ parser_class: type[DocumentParser] | None = get_parser_class_for_mime_type(
+ mime_type,
)
if not parser_class:
tempdir.cleanup()
@@ -586,6 +614,7 @@ class ConsumerPlugin(
date = None
thumbnail = None
archive_path = None
+ page_count = None
try:
self._send_progress(
@@ -621,6 +650,7 @@ class ConsumerPlugin(
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
+ page_count = document_parser.get_page_count(self.working_copy, mime_type)
except ParseError as e:
document_parser.cleanup()
@@ -662,7 +692,12 @@ class ConsumerPlugin(
try:
with transaction.atomic():
# store the document.
- document = self._store(text=text, date=date, mime_type=mime_type)
+ document = self._store(
+ text=text,
+ date=date,
+ page_count=page_count,
+ mime_type=mime_type,
+ )
# If we get here, it was successful. Proceed with post-consume
# hooks. If they fail, nothing will get changed.
@@ -682,7 +717,9 @@ class ConsumerPlugin(
self._write(
document.storage_type,
- self.working_copy,
+ self.unmodified_original
+ if self.unmodified_original is not None
+ else self.working_copy,
document.source_path,
)
@@ -718,6 +755,8 @@ class ConsumerPlugin(
self.log.debug(f"Deleting file {self.working_copy}")
self.input_doc.original_file.unlink()
self.working_copy.unlink()
+ if self.unmodified_original is not None: # pragma: no cover
+ self.unmodified_original.unlink()
# https://github.com/jonaswinkler/paperless-ng/discussions/1037
shadow_file = os.path.join(
@@ -789,7 +828,8 @@ class ConsumerPlugin(
def _store(
self,
text: str,
- date: Optional[datetime.datetime],
+ date: datetime.datetime | None,
+ page_count: int | None,
mime_type: str,
) -> Document:
# If someone gave us the original filename, use it instead of doc.
@@ -835,6 +875,7 @@ class ConsumerPlugin(
created=create_date,
modified=create_date,
storage_type=storage_type,
+ page_count=page_count,
original_filename=self.filename,
)
@@ -916,7 +957,7 @@ def parse_doc_title_w_placeholders(
owner_username: str,
local_added: datetime.datetime,
original_filename: str,
- created: Optional[datetime.datetime] = None,
+ created: datetime.datetime | None = None,
) -> str:
"""
Available title placeholders for Workflows depend on what has already been assigned,
diff --git a/src/documents/data_models.py b/src/documents/data_models.py
index b99c8511d..231e59005 100644
--- a/src/documents/data_models.py
+++ b/src/documents/data_models.py
@@ -2,7 +2,6 @@ import dataclasses
import datetime
from enum import IntEnum
from pathlib import Path
-from typing import Optional
import magic
from guardian.shortcuts import get_groups_with_perms
@@ -17,20 +16,20 @@ class DocumentMetadataOverrides:
meaning no override is happening
"""
- filename: Optional[str] = None
- title: Optional[str] = None
- correspondent_id: Optional[int] = None
- document_type_id: Optional[int] = None
- tag_ids: Optional[list[int]] = None
- storage_path_id: Optional[int] = None
- created: Optional[datetime.datetime] = None
- asn: Optional[int] = None
- owner_id: Optional[int] = None
- view_users: Optional[list[int]] = None
- view_groups: Optional[list[int]] = None
- change_users: Optional[list[int]] = None
- change_groups: Optional[list[int]] = None
- custom_field_ids: Optional[list[int]] = None
+ filename: str | None = None
+ title: str | None = None
+ correspondent_id: int | None = None
+ document_type_id: int | None = None
+ tag_ids: list[int] | None = None
+ storage_path_id: int | None = None
+ created: datetime.datetime | None = None
+ asn: int | None = None
+ owner_id: int | None = None
+ view_users: list[int] | None = None
+ view_groups: list[int] | None = None
+ change_users: list[int] | None = None
+ change_groups: list[int] | None = None
+ custom_field_ids: list[int] | None = None
def update(self, other: "DocumentMetadataOverrides") -> "DocumentMetadataOverrides":
"""
@@ -156,7 +155,7 @@ class ConsumableDocument:
source: DocumentSource
original_file: Path
- mailrule_id: Optional[int] = None
+ mailrule_id: int | None = None
mime_type: str = dataclasses.field(init=False, default=None)
def __post_init__(self):
diff --git a/src/documents/double_sided.py b/src/documents/double_sided.py
index bfe66f4fe..3c3ec4723 100644
--- a/src/documents/double_sided.py
+++ b/src/documents/double_sided.py
@@ -4,7 +4,6 @@ import os
import shutil
from pathlib import Path
from typing import Final
-from typing import Optional
from django.conf import settings
from pikepdf import Pdf
@@ -37,7 +36,7 @@ class CollatePlugin(NoCleanupPluginMixin, NoSetupPluginMixin, ConsumeTaskPlugin)
in self.input_doc.original_file.parts
)
- def run(self) -> Optional[str]:
+ def run(self) -> str | None:
"""
Tries to collate pages from 2 single sided scans of a double sided
document.
diff --git a/src/documents/filters.py b/src/documents/filters.py
index b6ac591fe..255e7d3d7 100644
--- a/src/documents/filters.py
+++ b/src/documents/filters.py
@@ -2,9 +2,8 @@ import functools
import inspect
import json
import operator
+from collections.abc import Callable
from contextlib import contextmanager
-from typing import Callable
-from typing import Union
from django.contrib.contenttypes.models import ContentType
from django.db.models import CharField
@@ -311,7 +310,7 @@ class CustomFieldQueryParser:
`max_query_depth` and `max_atom_count` can be set to guard against generating arbitrarily
complex SQL queries.
"""
- self._custom_fields: dict[Union[int, str], CustomField] = {}
+ self._custom_fields: dict[int | str, CustomField] = {}
self._validation_prefix = validation_prefix
# Dummy ModelSerializer used to convert a Django models.Field to serializers.Field.
self._model_serializer = serializers.ModelSerializer()
@@ -345,7 +344,7 @@ class CustomFieldQueryParser:
Applies rule (1, 2, 3) or (4, 5, 6) based on the length of the expr.
"""
with self._track_query_depth():
- if isinstance(expr, (list, tuple)):
+ if isinstance(expr, list | tuple):
if len(expr) == 2:
return self._parse_logical_expr(*expr)
elif len(expr) == 3:
@@ -359,7 +358,7 @@ class CustomFieldQueryParser:
"""
Handles [`q0`, `q1`, ..., `qn`] in rule 4 & 5.
"""
- if not isinstance(exprs, (list, tuple)) or not exprs:
+ if not isinstance(exprs, list | tuple) or not exprs:
raise serializers.ValidationError(
[_("Invalid expression list. Must be nonempty.")],
)
diff --git a/src/documents/index.py b/src/documents/index.py
index d95a80213..eacd1f99b 100644
--- a/src/documents/index.py
+++ b/src/documents/index.py
@@ -6,7 +6,6 @@ from contextlib import contextmanager
from datetime import datetime
from datetime import timezone
from shutil import rmtree
-from typing import Optional
from django.conf import settings
from django.db.models import QuerySet
@@ -80,6 +79,7 @@ def get_schema():
has_owner=BOOLEAN(),
viewer_id=KEYWORD(commas=True),
checksum=TEXT(),
+ page_count=NUMERIC(sortable=True),
original_filename=TEXT(sortable=True),
is_shared=BOOLEAN(),
)
@@ -181,6 +181,7 @@ def update_document(writer: AsyncWriter, doc: Document):
has_owner=doc.owner is not None,
viewer_id=viewer_ids if viewer_ids else None,
checksum=doc.checksum,
+ page_count=doc.page_count,
original_filename=doc.original_filename,
is_shared=len(viewer_ids) > 0,
)
@@ -247,6 +248,7 @@ class DelayedQuery:
"archive_serial_number": "asn",
"num_notes": "num_notes",
"owner": "owner",
+ "page_count": "page_count",
}
if field.startswith("-"):
@@ -386,7 +388,7 @@ def autocomplete(
ix: FileIndex,
term: str,
limit: int = 10,
- user: Optional[User] = None,
+ user: User | None = None,
):
"""
Mimics whoosh.reading.IndexReader.most_distinctive_terms with permissions
@@ -422,7 +424,7 @@ def autocomplete(
return terms
-def get_permissions_criterias(user: Optional[User] = None):
+def get_permissions_criterias(user: User | None = None):
user_criterias = [query.Term("has_owner", False)]
if user is not None:
if user.is_superuser: # superusers see all docs
diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py
index 97f9fcc59..1eb2f6541 100644
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -251,7 +251,7 @@ class Command(BaseCommand):
self.handle_inotify(directory, recursive, options["testing"])
else:
if INotify is None and settings.CONSUMER_POLLING == 0: # pragma: no cover
- logger.warn("Using polling as INotify import failed")
+ logger.warning("Using polling as INotify import failed")
self.handle_polling(directory, recursive, options["testing"])
logger.debug("Consumer exiting.")
@@ -267,7 +267,7 @@ class Command(BaseCommand):
polling_interval = settings.CONSUMER_POLLING
if polling_interval == 0: # pragma: no cover
# Only happens if INotify failed to import
- logger.warn("Using polling of 10s, consider setting this")
+ logger.warning("Using polling of 10s, consider setting this")
polling_interval = 10
with ThreadPoolExecutor(max_workers=4) as pool:
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py
index 618c1a4e5..3d7352c1a 100644
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -6,7 +6,6 @@ import tempfile
import time
from pathlib import Path
from typing import TYPE_CHECKING
-from typing import Optional
import tqdm
from django.conf import settings
@@ -183,7 +182,7 @@ class Command(CryptMixin, BaseCommand):
self.zip_export: bool = options["zip"]
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
- self.passphrase: Optional[str] = options.get("passphrase")
+ self.passphrase: str | None = options.get("passphrase")
self.files_in_export_dir: set[Path] = set()
self.exported_files: set[str] = set()
@@ -427,7 +426,7 @@ class Command(CryptMixin, BaseCommand):
document: Document,
base_name: str,
document_dict: dict,
- ) -> tuple[Path, Optional[Path], Optional[Path]]:
+ ) -> tuple[Path, Path | None, Path | None]:
"""
Generates the targets for a given document, including the original file, archive file and thumbnail (depending on settings).
"""
@@ -461,8 +460,8 @@ class Command(CryptMixin, BaseCommand):
self,
document: Document,
original_target: Path,
- thumbnail_target: Optional[Path],
- archive_target: Optional[Path],
+ thumbnail_target: Path | None,
+ archive_target: Path | None,
) -> None:
"""
Copies files from the document storage location to the specified target location.
@@ -512,7 +511,7 @@ class Command(CryptMixin, BaseCommand):
def check_and_copy(
self,
source: Path,
- source_checksum: Optional[str],
+ source_checksum: str | None,
target: Path,
):
"""
diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py
index 3535e1476..a402466f4 100644
--- a/src/documents/management/commands/document_importer.py
+++ b/src/documents/management/commands/document_importer.py
@@ -3,7 +3,6 @@ import logging
import os
from contextlib import contextmanager
from pathlib import Path
-from typing import Optional
import tqdm
from django.conf import settings
@@ -228,8 +227,8 @@ class Command(CryptMixin, BaseCommand):
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.passphrase: str | None = options.get("passphrase")
- self.version: Optional[str] = None
- self.salt: Optional[str] = None
+ self.version: str | None = None
+ self.salt: str | None = None
self.manifest_paths = []
self.manifest = []
diff --git a/src/documents/management/commands/mixins.py b/src/documents/management/commands/mixins.py
index 823631586..212ecf597 100644
--- a/src/documents/management/commands/mixins.py
+++ b/src/documents/management/commands/mixins.py
@@ -1,9 +1,7 @@
import base64
import os
from argparse import ArgumentParser
-from typing import Optional
from typing import TypedDict
-from typing import Union
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
@@ -103,7 +101,7 @@ class CryptMixin:
},
]
- def get_crypt_params(self) -> dict[str, dict[str, Union[str, int]]]:
+ def get_crypt_params(self) -> dict[str, dict[str, str | int]]:
return {
EXPORTER_CRYPTO_SETTINGS_NAME: {
EXPORTER_CRYPTO_ALGO_NAME: self.kdf_algorithm,
@@ -128,7 +126,7 @@ class CryptMixin:
EXPORTER_CRYPTO_SALT_NAME
]
- def setup_crypto(self, *, passphrase: str, salt: Optional[str] = None):
+ def setup_crypto(self, *, passphrase: str, salt: str | None = None):
"""
Constructs a class for encryption or decryption using the specified passphrase and salt
diff --git a/src/documents/matching.py b/src/documents/matching.py
index 586ca3a6a..36fa9a2c6 100644
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -1,7 +1,6 @@
import logging
import re
from fnmatch import fnmatch
-from typing import Union
from documents.classifier import DocumentClassifier
from documents.data_models import ConsumableDocument
@@ -20,7 +19,7 @@ logger = logging.getLogger("paperless.matching")
def log_reason(
- matching_model: Union[MatchingModel, WorkflowTrigger],
+ matching_model: MatchingModel | WorkflowTrigger,
document: Document,
reason: str,
):
@@ -386,7 +385,7 @@ def existing_document_matches_workflow(
def document_matches_workflow(
- document: Union[ConsumableDocument, Document],
+ document: ConsumableDocument | Document,
workflow: Workflow,
trigger_type: WorkflowTrigger.WorkflowTriggerType,
) -> bool:
diff --git a/src/documents/migrations/1053_document_page_count.py b/src/documents/migrations/1053_document_page_count.py
new file mode 100644
index 000000000..13549e00f
--- /dev/null
+++ b/src/documents/migrations/1053_document_page_count.py
@@ -0,0 +1,62 @@
+# Generated by Django 4.2.16 on 2024-09-21 15:44
+from pathlib import Path
+
+import pikepdf
+from django.conf import settings
+from django.db import migrations
+from django.db import models
+from django.utils.termcolors import colorize as colourise
+
+
+def source_path(self):
+ if self.filename:
+ fname = str(self.filename)
+
+ return Path(settings.ORIGINALS_DIR / fname).resolve()
+
+
+def add_number_of_pages_to_page_count(apps, schema_editor):
+ Document = apps.get_model("documents", "Document")
+
+ if not Document.objects.all().exists():
+ return
+
+ for doc in Document.objects.filter(mime_type="application/pdf"):
+ print(
+ " {} {} {}".format(
+ colourise("*", fg="green"),
+ colourise("Calculating number of pages for", fg="white"),
+ colourise(doc.filename, fg="cyan"),
+ ),
+ )
+
+ try:
+ with pikepdf.Pdf.open(source_path(doc)) as pdf:
+ if pdf.pages is not None:
+ doc.page_count = len(pdf.pages)
+ doc.save()
+ except Exception as e: # pragma: no cover
+ print(f"Error retrieving number of pages for {doc.filename}: {e}")
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("documents", "1052_document_transaction_id"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="document",
+ name="page_count",
+ field=models.PositiveIntegerField(
+ blank=False,
+ null=True,
+ unique=False,
+ db_index=False,
+ ),
+ ),
+ migrations.RunPython(
+ add_number_of_pages_to_page_count,
+ migrations.RunPython.noop,
+ ),
+ ]
diff --git a/src/documents/models.py b/src/documents/models.py
index 452c57c78..b0a6bdd61 100644
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -5,7 +5,6 @@ import re
from collections import OrderedDict
from pathlib import Path
from typing import Final
-from typing import Optional
import dateutil.parser
import pathvalidate
@@ -205,6 +204,18 @@ class Document(SoftDeleteModel, ModelWithOwner):
help_text=_("The checksum of the archived document."),
)
+ page_count = models.PositiveIntegerField(
+ _("page count"),
+ blank=False,
+ null=True,
+ unique=False,
+ db_index=False,
+ validators=[MinValueValidator(1)],
+ help_text=_(
+ "The number of pages of the document.",
+ ),
+ )
+
created = models.DateTimeField(_("created"), default=timezone.now, db_index=True)
modified = models.DateTimeField(
@@ -314,7 +325,7 @@ class Document(SoftDeleteModel, ModelWithOwner):
return self.archive_filename is not None
@property
- def archive_path(self) -> Optional[Path]:
+ def archive_path(self) -> Path | None:
if self.has_archive_version:
return (settings.ARCHIVE_DIR / Path(str(self.archive_filename))).resolve()
else:
@@ -414,6 +425,7 @@ class SavedView(ModelWithOwner):
OWNER = ("owner", _("Owner"))
SHARED = ("shared", _("Shared"))
ASN = ("asn", _("ASN"))
+ PAGE_COUNT = ("pagecount", _("Pages"))
CUSTOM_FIELD = ("custom_field_%d", ("Custom Field"))
name = models.CharField(_("name"), max_length=128)
diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index 1297162e2..2d73dc63f 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -10,7 +10,6 @@ from collections.abc import Iterator
from functools import lru_cache
from pathlib import Path
from re import Match
-from typing import Optional
from django.conf import settings
from django.utils import timezone
@@ -107,7 +106,7 @@ def get_supported_file_extensions() -> set[str]:
return extensions
-def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentParser"]]:
+def get_parser_class_for_mime_type(mime_type: str) -> type["DocumentParser"] | None:
"""
Returns the best parser (by weight) for the given mimetype or
None if no parser exists
@@ -252,7 +251,7 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
return out_path
-def parse_date(filename, text) -> Optional[datetime.datetime]:
+def parse_date(filename, text) -> datetime.datetime | None:
return next(parse_date_generator(filename, text), None)
@@ -277,7 +276,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
},
)
- def __filter(date: datetime.datetime) -> Optional[datetime.datetime]:
+ def __filter(date: datetime.datetime) -> datetime.datetime | None:
if (
date is not None
and date.year > 1900
@@ -290,7 +289,7 @@ def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
def __process_match(
match: Match[str],
date_order: str,
- ) -> Optional[datetime.datetime]:
+ ) -> datetime.datetime | None:
date_string = match.group(0)
try:
@@ -339,7 +338,7 @@ class DocumentParser(LoggingMixin):
self.archive_path = None
self.text = None
- self.date: Optional[datetime.datetime] = None
+ self.date: datetime.datetime | None = None
self.progress_callback = progress_callback
def progress(self, current_progress, max_progress):
@@ -367,6 +366,9 @@ class DocumentParser(LoggingMixin):
def extract_metadata(self, document_path, mime_type):
return []
+ def get_page_count(self, document_path, mime_type):
+ return None
+
def parse(self, document_path, mime_type, file_name=None):
raise NotImplementedError
@@ -382,7 +384,7 @@ class DocumentParser(LoggingMixin):
def get_text(self):
return self.text
- def get_date(self) -> Optional[datetime.datetime]:
+ def get_date(self) -> datetime.datetime | None:
return self.date
def cleanup(self):
diff --git a/src/documents/plugins/base.py b/src/documents/plugins/base.py
index 14d6ea696..81f0c86c3 100644
--- a/src/documents/plugins/base.py
+++ b/src/documents/plugins/base.py
@@ -1,7 +1,6 @@
import abc
from pathlib import Path
from typing import Final
-from typing import Optional
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
@@ -88,7 +87,7 @@ class ConsumeTaskPlugin(abc.ABC):
"""
@abc.abstractmethod
- def run(self) -> Optional[str]:
+ def run(self) -> str | None:
"""
The bulk of plugin processing, this does whatever action the plugin is for.
diff --git a/src/documents/plugins/helpers.py b/src/documents/plugins/helpers.py
index 2d3686db4..20380b852 100644
--- a/src/documents/plugins/helpers.py
+++ b/src/documents/plugins/helpers.py
@@ -1,7 +1,5 @@
import enum
from typing import TYPE_CHECKING
-from typing import Optional
-from typing import Union
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
@@ -23,9 +21,9 @@ class ProgressManager:
of the open/close of the layer to ensure messages go out and everything is cleaned up
"""
- def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
+ def __init__(self, filename: str, task_id: str | None = None) -> None:
self.filename = filename
- self._channel: Optional[RedisPubSubChannelLayer] = None
+ self._channel: RedisPubSubChannelLayer | None = None
self.task_id = task_id
def __enter__(self):
@@ -57,7 +55,7 @@ class ProgressManager:
message: str,
current_progress: int,
max_progress: int,
- extra_args: Optional[dict[str, Union[str, int, None]]] = None,
+ extra_args: dict[str, str | int | None] | None = None,
) -> None:
# Ensure the layer is open
self.open()
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py
index 5218cbf8a..30f3dd26d 100644
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -750,6 +750,7 @@ class DocumentSerializer(
original_file_name = SerializerMethodField()
archived_file_name = SerializerMethodField()
created_date = serializers.DateField(required=False)
+ page_count = SerializerMethodField()
custom_fields = CustomFieldInstanceSerializer(
many=True,
@@ -770,6 +771,9 @@ class DocumentSerializer(
required=False,
)
+ def get_page_count(self, obj):
+ return obj.page_count
+
def get_original_file_name(self, obj):
return obj.original_filename
@@ -885,6 +889,7 @@ class DocumentSerializer(
"notes",
"custom_fields",
"remove_inbox_tags",
+ "page_count",
)
list_serializer_class = OwnedObjectListSerializer
@@ -1384,9 +1389,18 @@ class PostDocumentSerializer(serializers.Serializer):
mime_type = magic.from_buffer(document_data, mime=True)
if not is_mime_type_supported(mime_type):
- raise serializers.ValidationError(
- _("File type %(type)s not supported") % {"type": mime_type},
- )
+ if (
+ mime_type in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ and document.name.endswith(
+ ".pdf",
+ )
+ ):
+ # If the file is an invalid PDF, we can try to recover it later in the consumer
+ mime_type = "application/pdf"
+ else:
+ raise serializers.ValidationError(
+ _("File type %(type)s not supported") % {"type": mime_type},
+ )
return document.name, document_data
diff --git a/src/documents/signals/handlers.py b/src/documents/signals/handlers.py
index 2cf73ca41..cf6733dd5 100644
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -1,7 +1,6 @@
import logging
import os
import shutil
-from typing import Optional
from celery import states
from celery.signals import before_task_publish
@@ -62,7 +61,7 @@ def _suggestion_printer(
suggestion_type: str,
document: Document,
selected: MatchingModel,
- base_url: Optional[str] = None,
+ base_url: str | None = None,
):
"""
Smaller helper to reduce duplication when just outputting suggestions to the console
@@ -80,7 +79,7 @@ def set_correspondent(
sender,
document: Document,
logging_group=None,
- classifier: Optional[DocumentClassifier] = None,
+ classifier: DocumentClassifier | None = None,
replace=False,
use_first=True,
suggest=False,
@@ -135,7 +134,7 @@ def set_document_type(
sender,
document: Document,
logging_group=None,
- classifier: Optional[DocumentClassifier] = None,
+ classifier: DocumentClassifier | None = None,
replace=False,
use_first=True,
suggest=False,
@@ -191,7 +190,7 @@ def set_tags(
sender,
document: Document,
logging_group=None,
- classifier: Optional[DocumentClassifier] = None,
+ classifier: DocumentClassifier | None = None,
replace=False,
suggest=False,
base_url=None,
@@ -246,7 +245,7 @@ def set_storage_path(
sender,
document: Document,
logging_group=None,
- classifier: Optional[DocumentClassifier] = None,
+ classifier: DocumentClassifier | None = None,
replace=False,
use_first=True,
suggest=False,
diff --git a/src/documents/tasks.py b/src/documents/tasks.py
index 2f1bc2ee4..8f5ee51bc 100644
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -5,7 +5,6 @@ import uuid
from datetime import timedelta
from pathlib import Path
from tempfile import TemporaryDirectory
-from typing import Optional
import tqdm
from celery import Task
@@ -106,7 +105,7 @@ def train_classifier():
def consume_file(
self: Task,
input_doc: ConsumableDocument,
- overrides: Optional[DocumentMetadataOverrides] = None,
+ overrides: DocumentMetadataOverrides | None = None,
):
# Default no overrides
if overrides is None:
diff --git a/src/documents/tests/samples/invalid_pdf.pdf b/src/documents/tests/samples/invalid_pdf.pdf
new file mode 100644
index 000000000..f226c2d84
Binary files /dev/null and b/src/documents/tests/samples/invalid_pdf.pdf differ
diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py
index ee2e8ee1e..b1cd43932 100644
--- a/src/documents/tests/test_api_documents.py
+++ b/src/documents/tests/test_api_documents.py
@@ -1402,6 +1402,27 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
self.assertEqual(overrides.filename, "simple.pdf")
self.assertEqual(overrides.custom_field_ids, [custom_field.id])
+ def test_upload_invalid_pdf(self):
+ """
+ GIVEN: Invalid PDF named "*.pdf" whose mime_type is in settings.CONSUMER_PDF_RECOVERABLE_MIME_TYPES
+ WHEN: Upload the file
+ THEN: The file is not rejected
+ """
+ self.consume_file_mock.return_value = celery.result.AsyncResult(
+ id=str(uuid.uuid4()),
+ )
+
+ with open(
+ os.path.join(os.path.dirname(__file__), "samples", "invalid_pdf.pdf"),
+ "rb",
+ ) as f:
+ response = self.client.post(
+ "/api/documents/post_document/",
+ {"document": f},
+ )
+
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+
def test_get_metadata(self):
doc = Document.objects.create(
title="test",
diff --git a/src/documents/tests/test_api_filter_by_custom_fields.py b/src/documents/tests/test_api_filter_by_custom_fields.py
index 421376e44..327855c4c 100644
--- a/src/documents/tests/test_api_filter_by_custom_fields.py
+++ b/src/documents/tests/test_api_filter_by_custom_fields.py
@@ -1,6 +1,6 @@
import json
+from collections.abc import Callable
from datetime import date
-from typing import Callable
from unittest.mock import Mock
from urllib.parse import quote
diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py
index fed93cd01..d80116a80 100644
--- a/src/documents/tests/test_bulk_edit.py
+++ b/src/documents/tests/test_bulk_edit.py
@@ -389,6 +389,7 @@ class TestPDFActions(DirectoriesMixin, TestCase):
title="B",
filename=sample2,
mime_type="application/pdf",
+ page_count=8,
)
self.doc2.archive_filename = sample2_archive
self.doc2.save()
@@ -681,14 +682,20 @@ class TestPDFActions(DirectoriesMixin, TestCase):
THEN:
- Save should be called once
- Archive file should be updated once
+ - The document's page_count should be reduced by the number of deleted pages
"""
doc_ids = [self.doc2.id]
+ initial_page_count = self.doc2.page_count
pages = [1, 3]
result = bulk_edit.delete_pages(doc_ids, pages)
mock_pdf_save.assert_called_once()
mock_update_archive_file.assert_called_once()
self.assertEqual(result, "OK")
+ expected_page_count = initial_page_count - len(pages)
+ self.doc2.refresh_from_db()
+ self.assertEqual(self.doc2.page_count, expected_page_count)
+
@mock.patch("documents.tasks.update_document_archive_file.delay")
@mock.patch("pikepdf.Pdf.save")
def test_delete_pages_with_error(self, mock_pdf_save, mock_update_archive_file):
diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py
index 5b56e2cca..aa452e15b 100644
--- a/src/documents/tests/test_consumer.py
+++ b/src/documents/tests/test_consumer.py
@@ -235,6 +235,8 @@ class FaultyGenericExceptionParser(_BaseTestParser):
def fake_magic_from_file(file, mime=False):
if mime:
+ if file.name.startswith("invalid_pdf"):
+ return "application/octet-stream"
if os.path.splitext(file)[1] == ".pdf":
return "application/pdf"
elif os.path.splitext(file)[1] == ".png":
@@ -952,6 +954,27 @@ class TestConsumer(
sanity_check()
+ @mock.patch("documents.consumer.run_subprocess")
+ def test_try_to_clean_invalid_pdf(self, m):
+ shutil.copy(
+ Path(__file__).parent / "samples" / "invalid_pdf.pdf",
+ settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+ )
+ with self.get_consumer(
+ settings.CONSUMPTION_DIR / "invalid_pdf.pdf",
+ ) as consumer:
+ # fails because no qpdf
+ self.assertRaises(ConsumerError, consumer.run)
+
+ m.assert_called_once()
+
+ args, _ = m.call_args
+
+ command = args[0]
+
+ self.assertEqual(command[0], "qpdf")
+ self.assertEqual(command[1], "--replace-input")
+
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, GetConsumerMixin, TestCase):
diff --git a/src/documents/tests/test_double_sided.py b/src/documents/tests/test_double_sided.py
index 64cd7be48..5d068b735 100644
--- a/src/documents/tests/test_double_sided.py
+++ b/src/documents/tests/test_double_sided.py
@@ -2,7 +2,6 @@ import datetime as dt
import os
import shutil
from pathlib import Path
-from typing import Union
from unittest import mock
from django.test import TestCase
@@ -34,7 +33,7 @@ class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.dirs.double_sided_dir.mkdir()
self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME
- def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
+ def consume_file(self, srcname, dstname: str | Path = "foo.pdf"):
"""
Starts the consume process and also ensures the
destination file does not exist afterwards
diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py
index 17dc8f040..5039e13de 100644
--- a/src/documents/tests/test_migration_archive_files.py
+++ b/src/documents/tests/test_migration_archive_files.py
@@ -3,7 +3,6 @@ import importlib
import os
import shutil
from pathlib import Path
-from typing import Optional
from unittest import mock
from django.conf import settings
@@ -66,8 +65,8 @@ def make_test_document(
mime_type: str,
original: str,
original_filename: str,
- archive: Optional[str] = None,
- archive_filename: Optional[str] = None,
+ archive: str | None = None,
+ archive_filename: str | None = None,
):
doc = document_class()
doc.filename = original_filename
diff --git a/src/documents/tests/test_migration_document_pages_count.py b/src/documents/tests/test_migration_document_pages_count.py
new file mode 100644
index 000000000..e656bf1b8
--- /dev/null
+++ b/src/documents/tests/test_migration_document_pages_count.py
@@ -0,0 +1,59 @@
+import os
+import shutil
+from pathlib import Path
+
+from django.conf import settings
+
+from documents.tests.utils import TestMigrations
+
+
+def source_path_before(self):
+ if self.filename:
+ fname = str(self.filename)
+
+ return os.path.join(settings.ORIGINALS_DIR, fname)
+
+
+class TestMigrateDocumentPageCount(TestMigrations):
+ migrate_from = "1052_document_transaction_id"
+ migrate_to = "1053_document_page_count"
+
+ def setUpBeforeMigration(self, apps):
+ Document = apps.get_model("documents", "Document")
+ doc = Document.objects.create(
+ title="test1",
+ mime_type="application/pdf",
+ filename="file1.pdf",
+ )
+ self.doc_id = doc.id
+ shutil.copy(
+ Path(__file__).parent / "samples" / "simple.pdf",
+ source_path_before(doc),
+ )
+
+ def testDocumentPageCountMigrated(self):
+ Document = self.apps.get_model("documents", "Document")
+
+ doc = Document.objects.get(id=self.doc_id)
+ self.assertEqual(doc.page_count, 1)
+
+
+class TestMigrateDocumentPageCountBackwards(TestMigrations):
+ migrate_from = "1053_document_page_count"
+ migrate_to = "1052_document_transaction_id"
+
+ def setUpBeforeMigration(self, apps):
+ Document = apps.get_model("documents", "Document")
+ doc = Document.objects.create(
+ title="test1",
+ mime_type="application/pdf",
+ filename="file1.pdf",
+ page_count=8,
+ )
+ self.doc_id = doc.id
+
+ def test_remove_number_of_pages_to_page_count(self):
+ Document = self.apps.get_model("documents", "Document")
+ self.assertFalse(
+ "page_count" in [field.name for field in Document._meta.get_fields()],
+ )
diff --git a/src/documents/tests/test_migration_encrypted_webp_conversion.py b/src/documents/tests/test_migration_encrypted_webp_conversion.py
index 35a711901..0660df368 100644
--- a/src/documents/tests/test_migration_encrypted_webp_conversion.py
+++ b/src/documents/tests/test_migration_encrypted_webp_conversion.py
@@ -1,10 +1,9 @@
import importlib
import shutil
import tempfile
+from collections.abc import Callable
from collections.abc import Iterable
from pathlib import Path
-from typing import Callable
-from typing import Union
from unittest import mock
from django.test import override_settings
@@ -115,7 +114,7 @@ class TestMigrateToEncrytpedWebPThumbnails(TestMigrations):
def assert_file_count_by_extension(
self,
ext: str,
- dir: Union[str, Path],
+ dir: str | Path,
expected_count: int,
):
"""
diff --git a/src/documents/tests/test_migration_webp_conversion.py b/src/documents/tests/test_migration_webp_conversion.py
index cf0ee22a8..cd148ed6f 100644
--- a/src/documents/tests/test_migration_webp_conversion.py
+++ b/src/documents/tests/test_migration_webp_conversion.py
@@ -1,10 +1,9 @@
import importlib
import shutil
import tempfile
+from collections.abc import Callable
from collections.abc import Iterable
from pathlib import Path
-from typing import Callable
-from typing import Union
from unittest import mock
from django.test import override_settings
@@ -86,7 +85,7 @@ class TestMigrateWebPThumbnails(TestMigrations):
def assert_file_count_by_extension(
self,
ext: str,
- dir: Union[str, Path],
+ dir: str | Path,
expected_count: int,
):
"""
diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py
index 4ec0851df..cd4db84e6 100644
--- a/src/documents/tests/utils.py
+++ b/src/documents/tests/utils.py
@@ -3,15 +3,13 @@ import tempfile
import time
import warnings
from collections import namedtuple
+from collections.abc import Callable
from collections.abc import Generator
from collections.abc import Iterator
from contextlib import contextmanager
from os import PathLike
from pathlib import Path
from typing import Any
-from typing import Callable
-from typing import Optional
-from typing import Union
from unittest import mock
import httpx
@@ -91,7 +89,7 @@ def paperless_environment():
def util_call_with_backoff(
method_or_callable: Callable,
- args: Union[list, tuple],
+ args: list | tuple,
*,
skip_on_50x_err=True,
) -> tuple[bool, Any]:
@@ -170,22 +168,22 @@ class FileSystemAssertsMixin:
Utilities for checks various state information of the file system
"""
- def assertIsFile(self, path: Union[PathLike, str]):
+ def assertIsFile(self, path: PathLike | str):
self.assertTrue(Path(path).resolve().is_file(), f"File does not exist: {path}")
- def assertIsNotFile(self, path: Union[PathLike, str]):
+ def assertIsNotFile(self, path: PathLike | str):
self.assertFalse(Path(path).resolve().is_file(), f"File does exist: {path}")
- def assertIsDir(self, path: Union[PathLike, str]):
+ def assertIsDir(self, path: PathLike | str):
self.assertTrue(Path(path).resolve().is_dir(), f"Dir does not exist: {path}")
- def assertIsNotDir(self, path: Union[PathLike, str]):
+ def assertIsNotDir(self, path: PathLike | str):
self.assertFalse(Path(path).resolve().is_dir(), f"Dir does exist: {path}")
def assertFilesEqual(
self,
- path1: Union[PathLike, str],
- path2: Union[PathLike, str],
+ path1: PathLike | str,
+ path2: PathLike | str,
):
path1 = Path(path1)
path2 = Path(path2)
@@ -196,7 +194,7 @@ class FileSystemAssertsMixin:
self.assertEqual(hash1, hash2, "File SHA256 mismatch")
- def assertFileCountInDir(self, path: Union[PathLike, str], count: int):
+ def assertFileCountInDir(self, path: PathLike | str, count: int):
path = Path(path).resolve()
self.assertTrue(path.is_dir(), f"Path {path} is not a directory")
files = [x for x in path.iterdir() if x.is_file()]
@@ -340,7 +338,7 @@ class GetConsumerMixin:
def get_consumer(
self,
filepath: Path,
- overrides: Union[DocumentMetadataOverrides, None] = None,
+ overrides: DocumentMetadataOverrides | None = None,
source: DocumentSource = DocumentSource.ConsumeFolder,
) -> Generator[ConsumerPlugin, None, None]:
# Store this for verification
@@ -368,7 +366,7 @@ class DummyProgressManager:
mock.patch("documents.tasks.ProgressManager", DummyProgressManager)
"""
- def __init__(self, filename: str, task_id: Optional[str] = None) -> None:
+ def __init__(self, filename: str, task_id: str | None = None) -> None:
self.filename = filename
self.task_id = task_id
self.payloads = []
@@ -392,7 +390,7 @@ class DummyProgressManager:
message: str,
current_progress: int,
max_progress: int,
- extra_args: Optional[dict[str, Union[str, int]]] = None,
+ extra_args: dict[str, str | int] | None = None,
) -> None:
# Ensure the layer is open
self.open()
diff --git a/src/documents/utils.py b/src/documents/utils.py
index 0af1f54e3..d8a8e8ab2 100644
--- a/src/documents/utils.py
+++ b/src/documents/utils.py
@@ -4,21 +4,19 @@ from os import utime
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
-from typing import Optional
-from typing import Union
from django.conf import settings
from PIL import Image
def _coerce_to_path(
- source: Union[Path, str],
- dest: Union[Path, str],
+ source: Path | str,
+ dest: Path | str,
) -> tuple[Path, Path]:
return Path(source).resolve(), Path(dest).resolve()
-def copy_basic_file_stats(source: Union[Path, str], dest: Union[Path, str]) -> None:
+def copy_basic_file_stats(source: Path | str, dest: Path | str) -> None:
"""
Copies only the m_time and a_time attributes from source to destination.
Both are expected to exist.
@@ -33,8 +31,8 @@ def copy_basic_file_stats(source: Union[Path, str], dest: Union[Path, str]) -> N
def copy_file_with_basic_stats(
- source: Union[Path, str],
- dest: Union[Path, str],
+ source: Path | str,
+ dest: Path | str,
) -> None:
"""
A sort of simpler copy2 that doesn't copy extended file attributes,
@@ -53,7 +51,7 @@ def maybe_override_pixel_limit() -> None:
"""
Maybe overrides the PIL limit on pixel count, if configured to allow it
"""
- limit: Optional[Union[float, int]] = settings.MAX_IMAGE_PIXELS
+ limit: float | int | None = settings.MAX_IMAGE_PIXELS
if limit is not None and limit >= 0:
pixel_count = limit
if pixel_count == 0:
@@ -63,8 +61,8 @@ def maybe_override_pixel_limit() -> None:
def run_subprocess(
arguments: list[str],
- env: Optional[dict[str, str]] = None,
- logger: Optional[logging.Logger] = None,
+ env: dict[str, str] | None = None,
+ logger: logging.Logger | None = None,
*,
check_exit_code: bool = True,
log_stdout: bool = True,
diff --git a/src/documents/views.py b/src/documents/views.py
index a8a5bf97d..c870c15b5 100644
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -361,6 +361,7 @@ class DocumentViewSet(
"archive_serial_number",
"num_notes",
"owner",
+ "page_count",
)
def get_queryset(self):
@@ -1637,9 +1638,8 @@ class RemoteVersionView(GenericAPIView):
try:
remote_json = json.loads(remote)
remote_version = remote_json["tag_name"]
- # Basically PEP 616 but that only went in 3.9
- if remote_version.startswith("ngx-"):
- remote_version = remote_version[len("ngx-") :]
+ # Some early tags used ngx-x.y.z
+ remote_version = remote_version.removeprefix("ngx-")
except ValueError:
logger.debug("An error occurred parsing remote version json")
except urllib.error.URLError:
diff --git a/src/paperless/config.py b/src/paperless/config.py
index 00afb2a13..8a40fc6c6 100644
--- a/src/paperless/config.py
+++ b/src/paperless/config.py
@@ -1,6 +1,5 @@
import dataclasses
import json
-from typing import Optional
from django.conf import settings
@@ -44,18 +43,18 @@ class OcrConfig(OutputTypeConfig):
correspond almost directly to the OCRMyPDF options
"""
- pages: Optional[int] = dataclasses.field(init=False)
+ pages: int | None = dataclasses.field(init=False)
language: str = dataclasses.field(init=False)
mode: str = dataclasses.field(init=False)
skip_archive_file: str = dataclasses.field(init=False)
- image_dpi: Optional[int] = dataclasses.field(init=False)
+ image_dpi: int | None = dataclasses.field(init=False)
clean: str = dataclasses.field(init=False)
deskew: bool = dataclasses.field(init=False)
rotate: bool = dataclasses.field(init=False)
rotate_threshold: float = dataclasses.field(init=False)
- max_image_pixel: Optional[float] = dataclasses.field(init=False)
+ max_image_pixel: float | None = dataclasses.field(init=False)
color_conversion_strategy: str = dataclasses.field(init=False)
- user_args: Optional[dict[str, str]] = dataclasses.field(init=False)
+ user_args: dict[str, str] | None = dataclasses.field(init=False)
def __post_init__(self) -> None:
super().__post_init__()
diff --git a/src/paperless/settings.py b/src/paperless/settings.py
index 95badde2f..ab943f30f 100644
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -9,8 +9,6 @@ from os import PathLike
from pathlib import Path
from platform import machine
from typing import Final
-from typing import Optional
-from typing import Union
from urllib.parse import urlparse
from celery.schedules import crontab
@@ -57,7 +55,7 @@ def __get_int(key: str, default: int) -> int:
return int(os.getenv(key, default))
-def __get_optional_int(key: str) -> Optional[int]:
+def __get_optional_int(key: str) -> int | None:
"""
Returns None if the environment key is not present, otherwise an integer
"""
@@ -75,7 +73,7 @@ def __get_float(key: str, default: float) -> float:
def __get_path(
key: str,
- default: Union[PathLike, str],
+ default: PathLike | str,
) -> Path:
"""
Return a normalized, absolute path based on the environment variable or a default,
@@ -86,7 +84,7 @@ def __get_path(
return Path(default).resolve()
-def __get_optional_path(key: str) -> Optional[Path]:
+def __get_optional_path(key: str) -> Path | None:
"""
Returns None if the environment key is not present, otherwise a fully resolved Path
"""
@@ -97,7 +95,7 @@ def __get_optional_path(key: str) -> Optional[Path]:
def __get_list(
key: str,
- default: Optional[list[str]] = None,
+ default: list[str] | None = None,
sep: str = ",",
) -> list[str]:
"""
@@ -112,7 +110,7 @@ def __get_list(
return []
-def _parse_redis_url(env_redis: Optional[str]) -> tuple[str, str]:
+def _parse_redis_url(env_redis: str | None) -> tuple[str, str]:
"""
Gets the Redis information from the environment or a default and handles
converting from incompatible django_channels and celery formats.
@@ -960,6 +958,8 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
+CONSUMER_PDF_RECOVERABLE_MIME_TYPES = ("application/octet-stream",)
+
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
# The default language that tesseract will attempt to use when parsing
@@ -987,7 +987,7 @@ OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float(
12.0,
)
-OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
+OCR_MAX_IMAGE_PIXELS: Final[int | None] = __get_optional_int(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
)
@@ -998,7 +998,7 @@ OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS")
-MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
+MAX_IMAGE_PIXELS: Final[int | None] = __get_optional_int(
"PAPERLESS_MAX_IMAGE_PIXELS",
)
@@ -1126,7 +1126,7 @@ APP_LOGO = os.getenv("PAPERLESS_APP_LOGO", None)
###############################################################################
-def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
+def _get_nltk_language_setting(ocr_lang: str) -> str | None:
"""
Maps an ISO-639-1 language code supported by Tesseract into
an optional NLTK language name. This is the set of common supported
@@ -1163,7 +1163,7 @@ def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
NLTK_ENABLED: Final[bool] = __get_boolean("PAPERLESS_ENABLE_NLTK", "yes")
-NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
+NLTK_LANGUAGE: str | None = _get_nltk_language_setting(OCR_LANGUAGE)
###############################################################################
# Email (SMTP) Backend #
@@ -1185,7 +1185,7 @@ if DEBUG: # pragma: no cover
# Email Preprocessors #
###############################################################################
-EMAIL_GNUPG_HOME: Final[Optional[str]] = os.getenv("PAPERLESS_EMAIL_GNUPG_HOME")
+EMAIL_GNUPG_HOME: Final[str | None] = os.getenv("PAPERLESS_EMAIL_GNUPG_HOME")
EMAIL_ENABLE_GPG_DECRYPTOR: Final[bool] = __get_boolean(
"PAPERLESS_ENABLE_GPG_DECRYPTOR",
)
diff --git a/src/paperless_mail/mail.py b/src/paperless_mail/mail.py
index 4ecd44659..b52a2ebe4 100644
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -10,8 +10,6 @@ from datetime import timedelta
from fnmatch import fnmatch
from pathlib import Path
from typing import TYPE_CHECKING
-from typing import Optional
-from typing import Union
import magic
import pathvalidate
@@ -84,7 +82,7 @@ class BaseMailAction:
read mails when the action is to mark mails as read).
"""
- def get_criteria(self) -> Union[dict, LogicOperator]:
+ def get_criteria(self) -> dict | LogicOperator:
"""
Returns filtering criteria/query for this mail action.
"""
@@ -453,7 +451,7 @@ class MailAccountHandler(LoggingMixin):
else:
self.log.debug(f"Skipping mail preprocessor {preprocessor_type.NAME}")
- def _correspondent_from_name(self, name: str) -> Optional[Correspondent]:
+ def _correspondent_from_name(self, name: str) -> Correspondent | None:
try:
return Correspondent.objects.get_or_create(name=name)[0]
except DatabaseError as e:
@@ -465,7 +463,7 @@ class MailAccountHandler(LoggingMixin):
message: MailMessage,
att: MailAttachment,
rule: MailRule,
- ) -> Optional[str]:
+ ) -> str | None:
if rule.assign_title_from == MailRule.TitleSource.FROM_SUBJECT:
return message.subject
@@ -484,7 +482,7 @@ class MailAccountHandler(LoggingMixin):
self,
message: MailMessage,
rule: MailRule,
- ) -> Optional[Correspondent]:
+ ) -> Correspondent | None:
c_from = rule.assign_correspondent_from
if c_from == MailRule.CorrespondentSource.FROM_NOTHING:
@@ -688,7 +686,7 @@ class MailAccountHandler(LoggingMixin):
def filename_inclusion_matches(
self,
- filter_attachment_filename_include: Optional[str],
+ filter_attachment_filename_include: str | None,
filename: str,
) -> bool:
if filter_attachment_filename_include:
@@ -707,7 +705,7 @@ class MailAccountHandler(LoggingMixin):
def filename_exclusion_matches(
self,
- filter_attachment_filename_exclude: Optional[str],
+ filter_attachment_filename_exclude: str | None,
filename: str,
) -> bool:
if filter_attachment_filename_exclude:
diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py
index 4e83844e2..d98fb7238 100644
--- a/src/paperless_mail/parsers.py
+++ b/src/paperless_mail/parsers.py
@@ -1,7 +1,6 @@
import re
from html import escape
from pathlib import Path
-from typing import Optional
from bleach import clean
from bleach import linkify
@@ -33,7 +32,7 @@ class MailDocumentParser(DocumentParser):
logging_name = "paperless.parsing.mail"
- def _settings_to_gotenberg_pdfa(self) -> Optional[PdfAFormat]:
+ def _settings_to_gotenberg_pdfa(self) -> PdfAFormat | None:
"""
Converts our requested PDF/A output into the Gotenberg API
format
@@ -44,7 +43,7 @@ class MailDocumentParser(DocumentParser):
}:
return PdfAFormat.A2b
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1: # pragma: no cover
- self.log.warn(
+ self.log.warning(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
return PdfAFormat.A2b
diff --git a/src/paperless_mail/tests/test_mail.py b/src/paperless_mail/tests/test_mail.py
index bdadc7450..c12b54ffe 100644
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@@ -4,8 +4,6 @@ import random
import uuid
from collections import namedtuple
from contextlib import AbstractContextManager
-from typing import Optional
-from typing import Union
from unittest import mock
import pytest
@@ -199,11 +197,11 @@ class MessageBuilder:
def create_message(
self,
- attachments: Union[int, list[_AttachmentDef]] = 1,
+ attachments: int | list[_AttachmentDef] = 1,
body: str = "",
subject: str = "the subject",
from_: str = "no_one@mail.com",
- to: Optional[list[str]] = None,
+ to: list[str] | None = None,
seen: bool = False,
flagged: bool = False,
processed: bool = False,
@@ -622,8 +620,8 @@ class TestMail(
@dataclasses.dataclass(frozen=True)
class FilterTestCase:
name: str
- include_pattern: Optional[str]
- exclude_pattern: Optional[str]
+ include_pattern: str | None
+ exclude_pattern: str | None
expected_matches: list[str]
tests = [
diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py
index 4e92990f1..6b9ec3d93 100644
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -3,7 +3,6 @@ import re
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
-from typing import Optional
from django.conf import settings
from PIL import Image
@@ -41,6 +40,15 @@ class RasterisedDocumentParser(DocumentParser):
"""
return OcrConfig()
+ def get_page_count(self, document_path, mime_type):
+ page_count = None
+ if mime_type == "application/pdf":
+ import pikepdf
+
+ with pikepdf.Pdf.open(document_path) as pdf:
+ page_count = len(pdf.pages)
+ return page_count
+
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":
@@ -115,7 +123,7 @@ class RasterisedDocumentParser(DocumentParser):
)
return no_alpha_image
- def get_dpi(self, image) -> Optional[int]:
+ def get_dpi(self, image) -> int | None:
try:
with Image.open(image) as im:
x, y = im.info["dpi"]
@@ -124,7 +132,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
- def calculate_a4_dpi(self, image) -> Optional[int]:
+ def calculate_a4_dpi(self, image) -> int | None:
try:
with Image.open(image) as im:
width, height = im.size
@@ -139,9 +147,9 @@ class RasterisedDocumentParser(DocumentParser):
def extract_text(
self,
- sidecar_file: Optional[Path],
+ sidecar_file: Path | None,
pdf_file: Path,
- ) -> Optional[str]:
+ ) -> str | None:
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py
index d63d965c5..45a5939ab 100644
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -57,6 +57,30 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertContainsStrings(text.strip(), ["This is a test document."])
+ def test_get_page_count(self):
+ """
+ GIVEN:
+ - PDF file with a single page
+ - PDF file with multiple pages
+ WHEN:
+ - The number of pages is requested
+ THEN:
+ - The method returns 1 as the expected number of pages
+ - The method returns the correct number of pages (6)
+ """
+ parser = RasterisedDocumentParser(uuid.uuid4())
+ page_count = parser.get_page_count(
+ os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
+ "application/pdf",
+ )
+ self.assertEqual(page_count, 1)
+
+ page_count = parser.get_page_count(
+ os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
+ "application/pdf",
+ )
+ self.assertEqual(page_count, 6)
+
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py
index 519f6c6ae..f51a03916 100644
--- a/src/paperless_tika/parsers.py
+++ b/src/paperless_tika/parsers.py
@@ -102,7 +102,7 @@ class TikaDocumentParser(DocumentParser):
}:
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
- self.log.warn(
+ self.log.warning(
"Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
)
route.pdf_format(PdfAFormat.A2b)
|