Get storage paths working

Martin Tan 2023-06-13 00:07:08 +08:00
parent 3e5886584f
commit 1c042d4aaf
9 changed files with 2985 additions and 2959 deletions

View File

@@ -1,7 +1,7 @@
 import { SettingsService } from './services/settings.service'
 import { SETTINGS_KEYS } from './data/paperless-uisettings'
 import { Component, OnDestroy, OnInit, Renderer2 } from '@angular/core'
-import { Router } from '@angular/router'
+import { ActivatedRoute, Router } from '@angular/router'
 import { Subscription } from 'rxjs'
 import { ConsumerStatusService } from './services/consumer-status.service'
 import { ToastService } from './services/toast.service'
@@ -34,6 +34,7 @@ export class AppComponent implements OnInit, OnDestroy {
     private consumerStatusService: ConsumerStatusService,
     private toastService: ToastService,
     private router: Router,
+    private route: ActivatedRoute,
     private uploadDocumentsService: UploadDocumentsService,
     private tasksService: TasksService,
     public tourService: TourService,
@@ -265,7 +266,9 @@ export class AppComponent implements OnInit, OnDestroy {

   public dropped(files: NgxFileDropEntry[]) {
     this.fileLeave(true)
-    this.uploadDocumentsService.uploadFiles(files)
+    let storagePathId = parseInt(this.route.snapshot.queryParams['spid'])
+    storagePathId = !isNaN(storagePathId) ? storagePathId : undefined
+    this.uploadDocumentsService.uploadFiles(files, storagePathId)
     this.toastService.showInfo($localize`Initiating upload...`, 3000)
   }
 }

View File

@@ -157,7 +157,6 @@ export class ExplorerComponent
     this.route.queryParamMap
       .pipe(takeUntil(this.unsubscribeNotifier))
       .subscribe((queryParams) => {
-        console.log('query params updated:', queryParams)
         this.list.loadFromQueryParams(queryParams)
         this.unmodifiedFilterRules = []
       })

View File

@@ -181,7 +181,6 @@ export class StoragePathListViewService {
       )
       .subscribe({
         next: (result) => {
-          console.log('result:', result)
           this.initialized = true
           this.isReloading = false
           activeListViewState.collectionSize = result.count

View File

@@ -21,13 +21,18 @@ export class UploadDocumentsService {
     private settings: SettingsService
   ) {}

-  uploadFiles(files: NgxFileDropEntry[]) {
+  uploadFiles(files: NgxFileDropEntry[], storagePathId?: number) {
     for (const droppedFile of files) {
       if (droppedFile.fileEntry.isFile) {
         const fileEntry = droppedFile.fileEntry as FileSystemFileEntry
         fileEntry.file((file: File) => {
           let formData = new FormData()
           formData.append('document', file, file.name)
+          if (storagePathId) {
+            formData.append('storage_path_id', storagePathId.toString())
+          }
           let status = this.consumerStatusService.newFileUpload(file.name)
           status.message = $localize`Connecting...`

File diff suppressed because it is too large
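The uploadFiles() change above only attaches the new storage_path_id field to the outgoing form data; the endpoint that receives it is not among the diffs visible on this page. As a rough illustration only, a DRF-style serializer could accept that field and turn it into the override added later in this commit. Every name in the sketch below is hypothetical rather than copied from the commit.

# Hypothetical sketch, not part of this commit's visible diffs: shows how an
# upload endpoint could accept the "storage_path_id" form field and map it
# onto the DocumentMetadataOverrides dataclass extended further down.
from rest_framework import serializers

from documents.data_models import DocumentMetadataOverrides
from documents.models import StoragePath


class UploadSerializer(serializers.Serializer):  # hypothetical name
    document = serializers.FileField()
    # Optional storage path chosen in the UI, sent as "storage_path_id"
    storage_path_id = serializers.PrimaryKeyRelatedField(
        queryset=StoragePath.objects.all(),
        required=False,
        allow_null=True,
    )


def build_overrides(validated_data) -> DocumentMetadataOverrides:
    # Translate validated upload data into consumer overrides.
    storage_path = validated_data.get("storage_path_id")
    return DocumentMetadataOverrides(
        filename=validated_data["document"].name,
        storage_path_id=storage_path.pk if storage_path else None,
    )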

View File

@@ -1,62 +1,63 @@
 import dataclasses
 import datetime
 import enum
 from pathlib import Path
 from typing import List
 from typing import Optional

 import magic


 @dataclasses.dataclass
 class DocumentMetadataOverrides:
     """
     Manages overrides for document fields which normally would
     be set from content or matching. All fields default to None,
     meaning no override is happening
     """

     filename: Optional[str] = None
     title: Optional[str] = None
     correspondent_id: Optional[int] = None
     document_type_id: Optional[int] = None
     tag_ids: Optional[List[int]] = None
     created: Optional[datetime.datetime] = None
     asn: Optional[int] = None
     owner_id: Optional[int] = None
+    storage_path_id: Optional[int] = None


 class DocumentSource(enum.IntEnum):
     """
     The source of an incoming document. May have other uses in the future
     """

     ConsumeFolder = enum.auto()
     ApiUpload = enum.auto()
     MailFetch = enum.auto()


 @dataclasses.dataclass
 class ConsumableDocument:
     """
     Encapsulates an incoming document, either from consume folder, API upload
     or mail fetching and certain useful operations on it.
     """

     source: DocumentSource
     original_file: Path
     mime_type: str = dataclasses.field(init=False, default=None)

     def __post_init__(self):
         """
         After a dataclass is initialized, this is called to finalize some data
         1. Make sure the original path is an absolute, fully qualified path
         2. Get the mime type of the file
         """
         # Always fully qualify the path first thing
         # Just in case, convert to a path if it's a str
         self.original_file = Path(self.original_file).resolve()

         # Get the file type once at init
         # Note this function isn't called when the object is unpickled
         self.mime_type = magic.from_file(self.original_file, mime=True)
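With the new field in place, DocumentMetadataOverrides can carry the chosen storage path alongside the existing overrides on its way to the consume_file task shown further down. A minimal usage sketch follows; the title, id and file path are placeholder values, and the file must actually exist because __post_init__ probes its MIME type.

# Usage sketch with placeholder values.
from pathlib import Path

from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource

overrides = DocumentMetadataOverrides(
    title="2023 electricity bill",
    storage_path_id=5,  # pk of the StoragePath chosen in the UI
)
doc = ConsumableDocument(
    source=DocumentSource.ApiUpload,
    original_file=Path("/tmp/upload/bill.pdf"),  # must exist; MIME type is read at init
)
# consume_file is a Celery task, so it would normally be queued rather than
# called directly, e.g. consume_file.delay(doc, overrides).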

File diff suppressed because it is too large

View File

@@ -1,303 +1,304 @@
 import hashlib
 import logging
 import shutil
 import uuid
 from typing import Optional
 from typing import Type

 import tqdm
 from asgiref.sync import async_to_sync
 from celery import shared_task
 from channels.layers import get_channel_layer
 from django.conf import settings
 from django.db import transaction
 from django.db.models.signals import post_save
 from documents import barcodes
 from documents import index
 from documents import sanity_checker
 from documents.classifier import DocumentClassifier
 from documents.classifier import load_classifier
 from documents.consumer import Consumer
 from documents.consumer import ConsumerError
 from documents.data_models import ConsumableDocument
 from documents.data_models import DocumentMetadataOverrides
 from documents.data_models import DocumentSource
 from documents.file_handling import create_source_path_directory
 from documents.file_handling import generate_unique_filename
 from documents.models import Correspondent
 from documents.models import Document
 from documents.models import DocumentType
 from documents.models import StoragePath
 from documents.models import Tag
 from documents.parsers import DocumentParser
 from documents.parsers import get_parser_class_for_mime_type
 from documents.sanity_checker import SanityCheckFailedException
 from filelock import FileLock
 from redis.exceptions import ConnectionError
 from whoosh.writing import AsyncWriter

 logger = logging.getLogger("paperless.tasks")


 @shared_task
 def index_optimize():
     ix = index.open_index()
     writer = AsyncWriter(ix)
     writer.commit(optimize=True)


 def index_reindex(progress_bar_disable=False):
     documents = Document.objects.all()

     ix = index.open_index(recreate=True)

     with AsyncWriter(ix) as writer:
         for document in tqdm.tqdm(documents, disable=progress_bar_disable):
             index.update_document(writer, document)


 @shared_task
 def train_classifier():
     if (
         not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
         and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
         and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
         and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
     ):
         return

     classifier = load_classifier()

     if not classifier:
         classifier = DocumentClassifier()

     try:
         if classifier.train():
             logger.info(
                 f"Saving updated classifier model to {settings.MODEL_FILE}...",
             )
             classifier.save()
         else:
             logger.debug("Training data unchanged.")
     except Exception as e:
         logger.warning("Classifier error: " + str(e))


 @shared_task
 def consume_file(
     input_doc: ConsumableDocument,
     overrides: Optional[DocumentMetadataOverrides] = None,
 ):
     # Default no overrides
     if overrides is None:
         overrides = DocumentMetadataOverrides()

     # read all barcodes in the current document
     if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
         doc_barcode_info = barcodes.scan_file_for_barcodes(
             input_doc.original_file,
             input_doc.mime_type,
         )

         # split document by separator pages, if enabled
         if settings.CONSUMER_ENABLE_BARCODES:
             separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes)

             if len(separators) > 0:
                 logger.debug(
                     f"Pages with separators found in: {input_doc.original_file}",
                 )
                 document_list = barcodes.separate_pages(
                     doc_barcode_info.pdf_path,
                     separators,
                 )

                 if document_list:
                     # If the file is an upload, it's in the scratch directory
                     # Move it to consume directory to be picked up
                     # Otherwise, use the current parent to keep possible tags
                     # from subdirectories
                     if input_doc.source != DocumentSource.ConsumeFolder:
                         save_to_dir = settings.CONSUMPTION_DIR
                     else:
                         # Note this uses the original file, because it's in the
                         # consume folder already and may include additional path
                         # components for tagging
                         # the .path is somewhere in scratch in this case
                         save_to_dir = input_doc.original_file.parent

                     for n, document in enumerate(document_list):
                         # save to consumption dir
                         # rename it to the original filename with number prefix
                         if overrides.filename is not None:
                             newname = f"{str(n)}_{overrides.filename}"
                         else:
                             newname = None

                         barcodes.save_to_dir(
                             document,
                             newname=newname,
                             target_dir=save_to_dir,
                         )

                         # Split file has been copied safely, remove it
                         document.unlink()

                     # And clean up the directory as well, now it's empty
                     shutil.rmtree(document_list[0].parent)

                     # This file has been split into multiple files without issue
                     # remove the original and working copy
                     input_doc.original_file.unlink()

                     # If the original file was a TIFF, remove the PDF generated from it
                     if input_doc.mime_type == "image/tiff":
                         logger.debug(
                             f"Deleting file {doc_barcode_info.pdf_path}",
                         )
                         doc_barcode_info.pdf_path.unlink()

                     # notify the sender, otherwise the progress bar
                     # in the UI stays stuck
                     payload = {
                         "filename": overrides.filename or input_doc.original_file.name,
                         "task_id": None,
                         "current_progress": 100,
                         "max_progress": 100,
                         "status": "SUCCESS",
                         "message": "finished",
                     }
                     try:
                         async_to_sync(get_channel_layer().group_send)(
                             "status_updates",
                             {"type": "status_update", "data": payload},
                         )
                     except ConnectionError as e:
                         logger.warning(f"ConnectionError on status send: {str(e)}")
                     # consuming stops here, since the original document with
                     # the barcodes has been split and will be consumed separately
                     return "File successfully split"

         # try reading the ASN from barcode
         if settings.CONSUMER_ENABLE_ASN_BARCODE:
             overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
             if overrides.asn:
                 logger.info(f"Found ASN in barcode: {overrides.asn}")

     # continue with consumption if no barcode was found
     document = Consumer().try_consume_file(
         input_doc.original_file,
         override_filename=overrides.filename,
         override_title=overrides.title,
         override_correspondent_id=overrides.correspondent_id,
         override_document_type_id=overrides.document_type_id,
         override_tag_ids=overrides.tag_ids,
         override_created=overrides.created,
         override_asn=overrides.asn,
         override_owner_id=overrides.owner_id,
+        override_storage_path_id=overrides.storage_path_id
     )

     if document:
         return f"Success. New document id {document.pk} created"
     else:
         raise ConsumerError(
             "Unknown error: Returned document was null, but "
             "no error message was given.",
         )


 @shared_task
 def sanity_check():
     messages = sanity_checker.check_sanity()

     messages.log_messages()

     if messages.has_error:
         raise SanityCheckFailedException("Sanity check failed with errors. See log.")
     elif messages.has_warning:
         return "Sanity check exited with warnings. See log."
     elif len(messages) > 0:
         return "Sanity check exited with infos. See log."
     else:
         return "No issues detected."


 @shared_task
 def bulk_update_documents(document_ids):
     documents = Document.objects.filter(id__in=document_ids)

     ix = index.open_index()

     for doc in documents:
         post_save.send(Document, instance=doc, created=False)

     with AsyncWriter(ix) as writer:
         for doc in documents:
             index.update_document(writer, doc)


 @shared_task
 def update_document_archive_file(document_id):
     """
     Re-creates the archive file of a document, including new OCR content and thumbnail
     """
     document = Document.objects.get(id=document_id)

     mime_type = document.mime_type

     parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)

     if not parser_class:
         logger.error(
             f"No parser found for mime type {mime_type}, cannot "
             f"archive document {document} (ID: {document_id})",
         )
         return

     parser: DocumentParser = parser_class(logging_group=uuid.uuid4())

     try:
         parser.parse(document.source_path, mime_type, document.get_public_filename())

         thumbnail = parser.get_thumbnail(
             document.source_path,
             mime_type,
             document.get_public_filename(),
         )

         if parser.get_archive_path():
             with transaction.atomic():
                 with open(parser.get_archive_path(), "rb") as f:
                     checksum = hashlib.md5(f.read()).hexdigest()
                 # I'm going to save first so that in case the file move
                 # fails, the database is rolled back.
                 # We also don't use save() since that triggers the filehandling
                 # logic, and we don't want that yet (file not yet in place)
                 document.archive_filename = generate_unique_filename(
                     document,
                     archive_filename=True,
                 )
                 Document.objects.filter(pk=document.pk).update(
                     archive_checksum=checksum,
                     content=parser.get_text(),
                     archive_filename=document.archive_filename,
                 )
                 with FileLock(settings.MEDIA_LOCK):
                     create_source_path_directory(document.archive_path)
                     shutil.move(parser.get_archive_path(), document.archive_path)
                     shutil.move(thumbnail, document.thumbnail_path)

         with index.open_index_writer() as writer:
             index.update_document(writer, document)

     except Exception:
         logger.exception(
             f"Error while parsing document {document} (ID: {document_id})",
         )
     finally:
         parser.cleanup()
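How override_storage_path_id is ultimately applied is not visible here, since the consumer's own diff is not shown on this page. Assuming the Consumer stores the override like the other override_* arguments and that Document has a storage_path foreign key, as in upstream paperless-ngx, the final step would look roughly like the sketch below; the helper name is illustrative, not taken from this commit.

# Illustrative sketch of the consumer side, not copied from this commit.
from typing import Optional

from documents.models import Document
from documents.models import StoragePath


def apply_storage_path_override(document: Document, storage_path_id: Optional[int]) -> None:
    # Attach the selected storage path to a freshly consumed document, if one was given.
    if storage_path_id is None:
        return
    # Prefer skipping silently over failing the whole consume if the id is stale.
    storage_path = StoragePath.objects.filter(pk=storage_path_id).first()
    if storage_path is not None:
        document.storage_path = storage_path
        document.save(update_fields=["storage_path"])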

File diff suppressed because it is too large