diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 0ed583fc8..e171f1721 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -1,656 +1,656 @@ -import datetime -import hashlib -import os -import shutil -import tempfile -import uuid -from pathlib import Path -from subprocess import CompletedProcess -from subprocess import run -from typing import Optional -from typing import Type - -import magic -from asgiref.sync import async_to_sync -from channels.layers import get_channel_layer -from django.conf import settings -from django.contrib.auth.models import User -from django.db import transaction -from django.db.models import Q -from django.utils import timezone -from filelock import FileLock -from rest_framework.reverse import reverse - -from .classifier import load_classifier -from .file_handling import create_source_path_directory -from .file_handling import generate_unique_filename -from .loggers import LoggingMixin -from .models import Correspondent, StoragePath -from .models import Document -from .models import DocumentType -from .models import FileInfo -from .models import Tag -from .parsers import DocumentParser -from .parsers import get_parser_class_for_mime_type -from .parsers import parse_date -from .parsers import ParseError -from .signals import document_consumption_finished -from .signals import document_consumption_started - - -class ConsumerError(Exception): - pass - - -MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists" -MESSAGE_ASN_ALREADY_EXISTS = "asn_already_exists" -MESSAGE_ASN_RANGE = "asn_value_out_of_range" -MESSAGE_FILE_NOT_FOUND = "file_not_found" -MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found" -MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error" -MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found" -MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error" -MESSAGE_NEW_FILE = "new_file" -MESSAGE_UNSUPPORTED_TYPE = "unsupported_type" -MESSAGE_PARSING_DOCUMENT = "parsing_document" -MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail" -MESSAGE_PARSE_DATE = "parse_date" -MESSAGE_SAVE_DOCUMENT = "save_document" -MESSAGE_FINISHED = "finished" - - -class Consumer(LoggingMixin): - - logging_name = "paperless.consumer" - - def _send_progress( - self, - current_progress, - max_progress, - status, - message=None, - document_id=None, - ): - payload = { - "filename": os.path.basename(self.filename) if self.filename else None, - "task_id": self.task_id, - "current_progress": current_progress, - "max_progress": max_progress, - "status": status, - "message": message, - "document_id": document_id, - } - async_to_sync(self.channel_layer.group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - - def _fail( - self, - message, - log_message=None, - exc_info=None, - exception: Optional[Exception] = None, - ): - self._send_progress(100, 100, "FAILED", message) - self.log("error", log_message or message, exc_info=exc_info) - raise ConsumerError(f"{self.filename}: {log_message or message}") from exception - - def __init__(self): - super().__init__() - self.path: Optional[Path] = None - self.original_path: Optional[Path] = None - self.filename = None - self.override_title = None - self.override_correspondent_id = None - self.override_tag_ids = None - self.override_document_type_id = None - self.override_asn = None - self.task_id = None - self.owner_id = None - - self.channel_layer = get_channel_layer() - - def pre_check_file_exists(self): - if not 
os.path.isfile(self.path): - self._fail( - MESSAGE_FILE_NOT_FOUND, - f"Cannot consume {self.path}: File not found.", - ) - - def pre_check_duplicate(self): - with open(self.path, "rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - existing_doc = Document.objects.filter( - Q(checksum=checksum) | Q(archive_checksum=checksum), - ) - if existing_doc.exists(): - if settings.CONSUMER_DELETE_DUPLICATES: - os.unlink(self.path) - self._fail( - MESSAGE_DOCUMENT_ALREADY_EXISTS, - f"Not consuming {self.filename}: It is a duplicate of" - f" {existing_doc.get().title} (#{existing_doc.get().pk})", - ) - - def pre_check_directories(self): - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) - os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) - os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) - - def pre_check_asn_value(self): - """ - Check that if override_asn is given, it is unique and within a valid range - """ - if not self.override_asn: - # check not necessary in case no ASN gets set - return - # Validate the range is above zero and less than uint32_t max - # otherwise, Whoosh can't handle it in the index - if ( - self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN - or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX - ): - self._fail( - MESSAGE_ASN_RANGE, - f"Not consuming {self.filename}: " - f"Given ASN {self.override_asn} is out of range " - f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, " - f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]", - ) - if Document.objects.filter(archive_serial_number=self.override_asn).exists(): - self._fail( - MESSAGE_ASN_ALREADY_EXISTS, - f"Not consuming {self.filename}: Given ASN already exists!", - ) - - def run_pre_consume_script(self): - if not settings.PRE_CONSUME_SCRIPT: - return - - if not os.path.isfile(settings.PRE_CONSUME_SCRIPT): - self._fail( - MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND, - f"Configured pre-consume script " - f"{settings.PRE_CONSUME_SCRIPT} does not exist.", - ) - - self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") - - working_file_path = str(self.path) - original_file_path = str(self.original_path) - - script_env = os.environ.copy() - script_env["DOCUMENT_SOURCE_PATH"] = original_file_path - script_env["DOCUMENT_WORKING_PATH"] = working_file_path - - try: - completed_proc = run( - args=[ - settings.PRE_CONSUME_SCRIPT, - original_file_path, - ], - env=script_env, - capture_output=True, - ) - - self._log_script_outputs(completed_proc) - - # Raises exception on non-zero output - completed_proc.check_returncode() - - except Exception as e: - self._fail( - MESSAGE_PRE_CONSUME_SCRIPT_ERROR, - f"Error while executing pre-consume script: {e}", - exc_info=True, - exception=e, - ) - - def run_post_consume_script(self, document: Document): - if not settings.POST_CONSUME_SCRIPT: - return - - if not os.path.isfile(settings.POST_CONSUME_SCRIPT): - self._fail( - MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND, - f"Configured post-consume script " - f"{settings.POST_CONSUME_SCRIPT} does not exist.", - ) - - self.log( - "info", - f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}", - ) - - script_env = os.environ.copy() - - script_env["DOCUMENT_ID"] = str(document.pk) - script_env["DOCUMENT_CREATED"] = str(document.created) - script_env["DOCUMENT_MODIFIED"] = str(document.modified) - script_env["DOCUMENT_ADDED"] = str(document.added) - script_env["DOCUMENT_FILE_NAME"] = document.get_public_filename() - script_env["DOCUMENT_SOURCE_PATH"] = 
os.path.normpath(document.source_path) - script_env["DOCUMENT_ARCHIVE_PATH"] = os.path.normpath( - str(document.archive_path), - ) - script_env["DOCUMENT_THUMBNAIL_PATH"] = os.path.normpath( - document.thumbnail_path, - ) - script_env["DOCUMENT_DOWNLOAD_URL"] = reverse( - "document-download", - kwargs={"pk": document.pk}, - ) - script_env["DOCUMENT_THUMBNAIL_URL"] = reverse( - "document-thumb", - kwargs={"pk": document.pk}, - ) - script_env["DOCUMENT_CORRESPONDENT"] = str(document.correspondent) - script_env["DOCUMENT_TAGS"] = str( - ",".join(document.tags.all().values_list("name", flat=True)), - ) - script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename) - - try: - completed_proc = run( - args=[ - settings.POST_CONSUME_SCRIPT, - str(document.pk), - document.get_public_filename(), - os.path.normpath(document.source_path), - os.path.normpath(document.thumbnail_path), - reverse("document-download", kwargs={"pk": document.pk}), - reverse("document-thumb", kwargs={"pk": document.pk}), - str(document.correspondent), - str(",".join(document.tags.all().values_list("name", flat=True))), - ], - env=script_env, - capture_output=True, - ) - - self._log_script_outputs(completed_proc) - - # Raises exception on non-zero output - completed_proc.check_returncode() - - except Exception as e: - self._fail( - MESSAGE_POST_CONSUME_SCRIPT_ERROR, - f"Error while executing post-consume script: {e}", - exc_info=True, - exception=e, - ) - - def try_consume_file( - self, - path: Path, - override_filename=None, - override_title=None, - override_correspondent_id=None, - override_document_type_id=None, - override_tag_ids=None, - task_id=None, - override_created=None, - override_asn=None, - override_owner_id=None, - override_storage_path_id=None, - full_path=None, - ) -> Document: - """ - Return the document object if it was successfully created. - """ - - self.path = Path(path).resolve() - self.filename = override_filename or self.path.name - self.override_title = override_title - self.override_correspondent_id = override_correspondent_id - self.override_document_type_id = override_document_type_id - self.override_tag_ids = override_tag_ids - self.task_id = task_id or str(uuid.uuid4()) - self.override_created = override_created - self.override_asn = override_asn - self.override_owner_id = override_owner_id - self.override_storage_path_id = override_storage_path_id - self.full_path = full_path - - self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE) - - # this is for grouping logging entries for this particular file - # together. - - self.renew_logging_group() - - # Make sure that preconditions for consuming the file are met. - - self.pre_check_file_exists() - self.pre_check_directories() - self.pre_check_duplicate() - self.pre_check_asn_value() - - self.log("info", f"Consuming {self.filename}") - - # For the actual work, copy the file into a tempdir - self.original_path = self.path - tempdir = tempfile.TemporaryDirectory( - prefix="paperless-ngx", - dir=settings.SCRATCH_DIR, - ) - self.path = Path(tempdir.name) / Path(self.filename) - shutil.copy(self.original_path, self.path) - - # Determine the parser class. 
- - mime_type = magic.from_file(self.path, mime=True) - - self.log("debug", f"Detected mime type: {mime_type}") - - # Based on the mime type, get the parser for that type - parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( - mime_type, - ) - if not parser_class: - tempdir.cleanup() - self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") - - # Notify all listeners that we're going to do some work. - - document_consumption_started.send( - sender=self.__class__, - filename=self.path, - logging_group=self.logging_group, - ) - - self.run_pre_consume_script() - - def progress_callback(current_progress, max_progress): - # recalculate progress to be within 20 and 80 - p = int((current_progress / max_progress) * 50 + 20) - self._send_progress(p, 100, "WORKING") - - # This doesn't parse the document yet, but gives us a parser. - - document_parser: DocumentParser = parser_class( - self.logging_group, - progress_callback, - ) - - self.log("debug", f"Parser: {type(document_parser).__name__}") - - # However, this already created working directories which we have to - # clean up. - - # Parse the document. This may take some time. - - text = None - date = None - thumbnail = None - archive_path = None - - try: - self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT) - self.log("debug", f"Parsing {self.filename}...") - document_parser.parse(self.path, mime_type, self.filename) - - self.log("debug", f"Generating thumbnail for {self.filename}...") - self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) - thumbnail = document_parser.get_thumbnail( - self.path, - mime_type, - self.filename, - ) - - text = document_parser.get_text() - date = document_parser.get_date() - if date is None: - self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) - date = parse_date(self.filename, text) - archive_path = document_parser.get_archive_path() - - except ParseError as e: - document_parser.cleanup() - tempdir.cleanup() - self._fail( - str(e), - f"Error while consuming document {self.filename}: {e}", - exc_info=True, - exception=e, - ) - - # Prepare the document classifier. - - # TODO: I don't really like to do this here, but this way we avoid - # reloading the classifier multiple times, since there are multiple - # post-consume hooks that all require the classifier. - - classifier = load_classifier() - - self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT) - # now that everything is done, we can start to store the document - # in the system. This will be a transaction and reasonably fast. - try: - with transaction.atomic(): - - # store the document. - document = self._store(text=text, date=date, mime_type=mime_type) - - # If we get here, it was successful. Proceed with post-consume - # hooks. If they fail, nothing will get changed. - - document_consumption_finished.send( - sender=self.__class__, - document=document, - logging_group=self.logging_group, - classifier=classifier, - ) - - # After everything is in the database, copy the files into - # place. If this fails, we'll also rollback the transaction. 
- with FileLock(settings.MEDIA_LOCK): - document.filename = generate_unique_filename(document) - create_source_path_directory(document.source_path) - - self._write(document.storage_type, self.path, document.source_path) - - self._write( - document.storage_type, - thumbnail, - document.thumbnail_path, - ) - - if archive_path and os.path.isfile(archive_path): - document.archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) - create_source_path_directory(document.archive_path) - self._write( - document.storage_type, - archive_path, - document.archive_path, - ) - - with open(archive_path, "rb") as f: - document.archive_checksum = hashlib.md5( - f.read(), - ).hexdigest() - - # Don't save with the lock active. Saving will cause the file - # renaming logic to acquire the lock as well. - # This triggers things like file renaming - document.save() - - # Delete the file only if it was successfully consumed - self.log("debug", f"Deleting file {self.path}") - os.unlink(self.path) - self.original_path.unlink() - - # https://github.com/jonaswinkler/paperless-ng/discussions/1037 - shadow_file = os.path.join( - os.path.dirname(self.original_path), - "._" + os.path.basename(self.original_path), - ) - - if os.path.isfile(shadow_file): - self.log("debug", f"Deleting file {shadow_file}") - os.unlink(shadow_file) - - except Exception as e: - self._fail( - str(e), - f"The following error occurred while consuming " - f"{self.filename}: {e}", - exc_info=True, - exception=e, - ) - finally: - document_parser.cleanup() - tempdir.cleanup() - - self.run_post_consume_script(document) - - self.log("info", f"Document {document} consumption finished") - - self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id) - - # Return the most up to date fields - document.refresh_from_db() - - return document - - def _store( - self, - text: str, - date: Optional[datetime.datetime], - mime_type: str, - ) -> Document: - - # If someone gave us the original filename, use it instead of doc. 
- - file_info = FileInfo.from_filename(self.filename) - - self.log("debug", "Saving record to database") - - if self.override_created is not None: - create_date = self.override_created - self.log( - "debug", - f"Creation date from post_documents parameter: {create_date}", - ) - elif file_info.created is not None: - create_date = file_info.created - self.log("debug", f"Creation date from FileInfo: {create_date}") - elif date is not None: - create_date = date - self.log("debug", f"Creation date from parse_date: {create_date}") - else: - stats = os.stat(self.path) - create_date = timezone.make_aware( - datetime.datetime.fromtimestamp(stats.st_mtime), - ) - self.log("debug", f"Creation date from st_mtime: {create_date}") - - storage_type = Document.STORAGE_TYPE_UNENCRYPTED - - with open(self.path, "rb") as f: - document = Document.objects.create( - title=(self.override_title or file_info.title)[:127], - content=text, - mime_type=mime_type, - checksum=hashlib.md5(f.read()).hexdigest(), - created=create_date, - modified=create_date, - storage_type=storage_type, - original_filename=self.filename, - ) - - self.apply_overrides(document) - - document.save() - - return document - - def apply_overrides(self, document: Document): - if self.override_correspondent_id: - document.correspondent = Correspondent.objects.get( - pk=self.override_correspondent_id, - ) - - if self.override_document_type_id: - document.document_type = DocumentType.objects.get( - pk=self.override_document_type_id, - ) - - if self.override_tag_ids: - for tag_id in self.override_tag_ids: - document.tags.add(Tag.objects.get(pk=tag_id)) - - if self.override_asn: - document.archive_serial_number = self.override_asn - - if self.override_owner_id: - document.owner = User.objects.get( - pk=self.override_owner_id, - ) - - if self.override_storage_path_id: - document.storage_path = StoragePath.objects.get( - id=self.override_storage_path_id, - ) - - if self.full_path: - folders = self.full_path.split('/')[:-1] - folders = [i for i in folders if i] - folder_path = '/'.join(folders) - print(f'folder_path: {folder_path}') - - for i in range(len(folders)): - sub_path = '/'.join(folders[:i+1]) - # Source: https://stackoverflow.com/a/21750566/5575610 - if StoragePath.objects.filter(path=sub_path).exists(): continue - print(f'Creating StoragePath: {sub_path}') - StoragePath.objects.create(name=sub_path, path=sub_path) - - print(f'Assigning StoragePath: "{folder_path}" to file') - document.storage_path = StoragePath.objects.get(path=folder_path) - - - def _write(self, storage_type, source, target): - with open(source, "rb") as read_file, open(target, "wb") as write_file: - write_file.write(read_file.read()) - - def _log_script_outputs(self, completed_process: CompletedProcess): - """ - Decodes a process stdout and stderr streams and logs them to the main log - """ - # Log what the script exited as - self.log( - "info", - f"{completed_process.args[0]} exited {completed_process.returncode}", - ) - - # Decode the output (if any) - if len(completed_process.stdout): - stdout_str = ( - completed_process.stdout.decode("utf8", errors="ignore") - .strip() - .split( - "\n", - ) - ) - self.log("info", "Script stdout:") - for line in stdout_str: - self.log("info", line) - - if len(completed_process.stderr): - stderr_str = ( - completed_process.stderr.decode("utf8", errors="ignore") - .strip() - .split( - "\n", - ) - ) - - self.log("warning", "Script stderr:") - for line in stderr_str: - self.log("warning", line) +import datetime +import hashlib +import os 
+import shutil +import tempfile +import uuid +from pathlib import Path +from subprocess import CompletedProcess +from subprocess import run +from typing import Optional +from typing import Type + +import magic +from asgiref.sync import async_to_sync +from channels.layers import get_channel_layer +from django.conf import settings +from django.contrib.auth.models import User +from django.db import transaction +from django.db.models import Q +from django.utils import timezone +from filelock import FileLock +from rest_framework.reverse import reverse + +from .classifier import load_classifier +from .file_handling import create_source_path_directory +from .file_handling import generate_unique_filename +from .loggers import LoggingMixin +from .models import Correspondent, StoragePath +from .models import Document +from .models import DocumentType +from .models import FileInfo +from .models import Tag +from .parsers import DocumentParser +from .parsers import get_parser_class_for_mime_type +from .parsers import parse_date +from .parsers import ParseError +from .signals import document_consumption_finished +from .signals import document_consumption_started + + +class ConsumerError(Exception): + pass + + +MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists" +MESSAGE_ASN_ALREADY_EXISTS = "asn_already_exists" +MESSAGE_ASN_RANGE = "asn_value_out_of_range" +MESSAGE_FILE_NOT_FOUND = "file_not_found" +MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found" +MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error" +MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found" +MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error" +MESSAGE_NEW_FILE = "new_file" +MESSAGE_UNSUPPORTED_TYPE = "unsupported_type" +MESSAGE_PARSING_DOCUMENT = "parsing_document" +MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail" +MESSAGE_PARSE_DATE = "parse_date" +MESSAGE_SAVE_DOCUMENT = "save_document" +MESSAGE_FINISHED = "finished" + + +class Consumer(LoggingMixin): + + logging_name = "paperless.consumer" + + def _send_progress( + self, + current_progress, + max_progress, + status, + message=None, + document_id=None, + ): + payload = { + "filename": os.path.basename(self.filename) if self.filename else None, + "task_id": self.task_id, + "current_progress": current_progress, + "max_progress": max_progress, + "status": status, + "message": message, + "document_id": document_id, + } + async_to_sync(self.channel_layer.group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + + def _fail( + self, + message, + log_message=None, + exc_info=None, + exception: Optional[Exception] = None, + ): + self._send_progress(100, 100, "FAILED", message) + self.log("error", log_message or message, exc_info=exc_info) + raise ConsumerError(f"{self.filename}: {log_message or message}") from exception + + def __init__(self): + super().__init__() + self.path: Optional[Path] = None + self.original_path: Optional[Path] = None + self.filename = None + self.override_title = None + self.override_correspondent_id = None + self.override_tag_ids = None + self.override_document_type_id = None + self.override_asn = None + self.task_id = None + self.owner_id = None + + self.channel_layer = get_channel_layer() + + def pre_check_file_exists(self): + if not os.path.isfile(self.path): + self._fail( + MESSAGE_FILE_NOT_FOUND, + f"Cannot consume {self.path}: File not found.", + ) + + def pre_check_duplicate(self): + with open(self.path, "rb") as f: + checksum = hashlib.md5(f.read()).hexdigest() + 
existing_doc = Document.objects.filter( + Q(checksum=checksum) | Q(archive_checksum=checksum), + ) + if existing_doc.exists(): + if settings.CONSUMER_DELETE_DUPLICATES: + os.unlink(self.path) + self._fail( + MESSAGE_DOCUMENT_ALREADY_EXISTS, + f"Not consuming {self.filename}: It is a duplicate of" + f" {existing_doc.get().title} (#{existing_doc.get().pk})", + ) + + def pre_check_directories(self): + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True) + os.makedirs(settings.ORIGINALS_DIR, exist_ok=True) + os.makedirs(settings.ARCHIVE_DIR, exist_ok=True) + + def pre_check_asn_value(self): + """ + Check that if override_asn is given, it is unique and within a valid range + """ + if not self.override_asn: + # check not necessary in case no ASN gets set + return + # Validate the range is above zero and less than uint32_t max + # otherwise, Whoosh can't handle it in the index + if ( + self.override_asn < Document.ARCHIVE_SERIAL_NUMBER_MIN + or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX + ): + self._fail( + MESSAGE_ASN_RANGE, + f"Not consuming {self.filename}: " + f"Given ASN {self.override_asn} is out of range " + f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, " + f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}]", + ) + if Document.objects.filter(archive_serial_number=self.override_asn).exists(): + self._fail( + MESSAGE_ASN_ALREADY_EXISTS, + f"Not consuming {self.filename}: Given ASN already exists!", + ) + + def run_pre_consume_script(self): + if not settings.PRE_CONSUME_SCRIPT: + return + + if not os.path.isfile(settings.PRE_CONSUME_SCRIPT): + self._fail( + MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND, + f"Configured pre-consume script " + f"{settings.PRE_CONSUME_SCRIPT} does not exist.", + ) + + self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") + + working_file_path = str(self.path) + original_file_path = str(self.original_path) + + script_env = os.environ.copy() + script_env["DOCUMENT_SOURCE_PATH"] = original_file_path + script_env["DOCUMENT_WORKING_PATH"] = working_file_path + + try: + completed_proc = run( + args=[ + settings.PRE_CONSUME_SCRIPT, + original_file_path, + ], + env=script_env, + capture_output=True, + ) + + self._log_script_outputs(completed_proc) + + # Raises exception on non-zero output + completed_proc.check_returncode() + + except Exception as e: + self._fail( + MESSAGE_PRE_CONSUME_SCRIPT_ERROR, + f"Error while executing pre-consume script: {e}", + exc_info=True, + exception=e, + ) + + def run_post_consume_script(self, document: Document): + if not settings.POST_CONSUME_SCRIPT: + return + + if not os.path.isfile(settings.POST_CONSUME_SCRIPT): + self._fail( + MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND, + f"Configured post-consume script " + f"{settings.POST_CONSUME_SCRIPT} does not exist.", + ) + + self.log( + "info", + f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}", + ) + + script_env = os.environ.copy() + + script_env["DOCUMENT_ID"] = str(document.pk) + script_env["DOCUMENT_CREATED"] = str(document.created) + script_env["DOCUMENT_MODIFIED"] = str(document.modified) + script_env["DOCUMENT_ADDED"] = str(document.added) + script_env["DOCUMENT_FILE_NAME"] = document.get_public_filename() + script_env["DOCUMENT_SOURCE_PATH"] = os.path.normpath(document.source_path) + script_env["DOCUMENT_ARCHIVE_PATH"] = os.path.normpath( + str(document.archive_path), + ) + script_env["DOCUMENT_THUMBNAIL_PATH"] = os.path.normpath( + document.thumbnail_path, + ) + script_env["DOCUMENT_DOWNLOAD_URL"] = reverse( + 
"document-download", + kwargs={"pk": document.pk}, + ) + script_env["DOCUMENT_THUMBNAIL_URL"] = reverse( + "document-thumb", + kwargs={"pk": document.pk}, + ) + script_env["DOCUMENT_CORRESPONDENT"] = str(document.correspondent) + script_env["DOCUMENT_TAGS"] = str( + ",".join(document.tags.all().values_list("name", flat=True)), + ) + script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename) + + try: + completed_proc = run( + args=[ + settings.POST_CONSUME_SCRIPT, + str(document.pk), + document.get_public_filename(), + os.path.normpath(document.source_path), + os.path.normpath(document.thumbnail_path), + reverse("document-download", kwargs={"pk": document.pk}), + reverse("document-thumb", kwargs={"pk": document.pk}), + str(document.correspondent), + str(",".join(document.tags.all().values_list("name", flat=True))), + ], + env=script_env, + capture_output=True, + ) + + self._log_script_outputs(completed_proc) + + # Raises exception on non-zero output + completed_proc.check_returncode() + + except Exception as e: + self._fail( + MESSAGE_POST_CONSUME_SCRIPT_ERROR, + f"Error while executing post-consume script: {e}", + exc_info=True, + exception=e, + ) + + def try_consume_file( + self, + path: Path, + override_filename=None, + override_title=None, + override_correspondent_id=None, + override_document_type_id=None, + override_tag_ids=None, + task_id=None, + override_created=None, + override_asn=None, + override_owner_id=None, + override_storage_path_id=None, + full_path=None, + ) -> Document: + """ + Return the document object if it was successfully created. + """ + + self.path = Path(path).resolve() + self.filename = override_filename or self.path.name + self.override_title = override_title + self.override_correspondent_id = override_correspondent_id + self.override_document_type_id = override_document_type_id + self.override_tag_ids = override_tag_ids + self.task_id = task_id or str(uuid.uuid4()) + self.override_created = override_created + self.override_asn = override_asn + self.override_owner_id = override_owner_id + self.override_storage_path_id = override_storage_path_id + self.full_path = full_path + + self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE) + + # this is for grouping logging entries for this particular file + # together. + + self.renew_logging_group() + + # Make sure that preconditions for consuming the file are met. + + self.pre_check_file_exists() + self.pre_check_directories() + self.pre_check_duplicate() + self.pre_check_asn_value() + + self.log("info", f"Consuming {self.filename}") + + # For the actual work, copy the file into a tempdir + self.original_path = self.path + tempdir = tempfile.TemporaryDirectory( + prefix="paperless-ngx", + dir=settings.SCRATCH_DIR, + ) + self.path = Path(tempdir.name) / Path(self.filename) + shutil.copy(self.original_path, self.path) + + # Determine the parser class. + + mime_type = magic.from_file(self.path, mime=True) + + self.log("debug", f"Detected mime type: {mime_type}") + + # Based on the mime type, get the parser for that type + parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( + mime_type, + ) + if not parser_class: + tempdir.cleanup() + self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") + + # Notify all listeners that we're going to do some work. 
+ + document_consumption_started.send( + sender=self.__class__, + filename=self.path, + logging_group=self.logging_group, + ) + + self.run_pre_consume_script() + + def progress_callback(current_progress, max_progress): + # recalculate progress to be within 20 and 80 + p = int((current_progress / max_progress) * 50 + 20) + self._send_progress(p, 100, "WORKING") + + # This doesn't parse the document yet, but gives us a parser. + + document_parser: DocumentParser = parser_class( + self.logging_group, + progress_callback, + ) + + self.log("debug", f"Parser: {type(document_parser).__name__}") + + # However, this already created working directories which we have to + # clean up. + + # Parse the document. This may take some time. + + text = None + date = None + thumbnail = None + archive_path = None + + try: + self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT) + self.log("debug", f"Parsing {self.filename}...") + document_parser.parse(self.path, mime_type, self.filename) + + self.log("debug", f"Generating thumbnail for {self.filename}...") + self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) + thumbnail = document_parser.get_thumbnail( + self.path, + mime_type, + self.filename, + ) + + text = document_parser.get_text() + date = document_parser.get_date() + if date is None: + self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) + date = parse_date(self.filename, text) + archive_path = document_parser.get_archive_path() + + except ParseError as e: + document_parser.cleanup() + tempdir.cleanup() + self._fail( + str(e), + f"Error while consuming document {self.filename}: {e}", + exc_info=True, + exception=e, + ) + + # Prepare the document classifier. + + # TODO: I don't really like to do this here, but this way we avoid + # reloading the classifier multiple times, since there are multiple + # post-consume hooks that all require the classifier. + + classifier = load_classifier() + + self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT) + # now that everything is done, we can start to store the document + # in the system. This will be a transaction and reasonably fast. + try: + with transaction.atomic(): + + # store the document. + document = self._store(text=text, date=date, mime_type=mime_type) + + # If we get here, it was successful. Proceed with post-consume + # hooks. If they fail, nothing will get changed. + + document_consumption_finished.send( + sender=self.__class__, + document=document, + logging_group=self.logging_group, + classifier=classifier, + ) + + # After everything is in the database, copy the files into + # place. If this fails, we'll also rollback the transaction. + with FileLock(settings.MEDIA_LOCK): + document.filename = generate_unique_filename(document) + create_source_path_directory(document.source_path) + + self._write(document.storage_type, self.path, document.source_path) + + self._write( + document.storage_type, + thumbnail, + document.thumbnail_path, + ) + + if archive_path and os.path.isfile(archive_path): + document.archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + create_source_path_directory(document.archive_path) + self._write( + document.storage_type, + archive_path, + document.archive_path, + ) + + with open(archive_path, "rb") as f: + document.archive_checksum = hashlib.md5( + f.read(), + ).hexdigest() + + # Don't save with the lock active. Saving will cause the file + # renaming logic to acquire the lock as well. 
+ # This triggers things like file renaming + document.save() + + # Delete the file only if it was successfully consumed + self.log("debug", f"Deleting file {self.path}") + os.unlink(self.path) + self.original_path.unlink() + + # https://github.com/jonaswinkler/paperless-ng/discussions/1037 + shadow_file = os.path.join( + os.path.dirname(self.original_path), + "._" + os.path.basename(self.original_path), + ) + + if os.path.isfile(shadow_file): + self.log("debug", f"Deleting file {shadow_file}") + os.unlink(shadow_file) + + except Exception as e: + self._fail( + str(e), + f"The following error occurred while consuming " + f"{self.filename}: {e}", + exc_info=True, + exception=e, + ) + finally: + document_parser.cleanup() + tempdir.cleanup() + + self.run_post_consume_script(document) + + self.log("info", f"Document {document} consumption finished") + + self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id) + + # Return the most up to date fields + document.refresh_from_db() + + return document + + def _store( + self, + text: str, + date: Optional[datetime.datetime], + mime_type: str, + ) -> Document: + + # If someone gave us the original filename, use it instead of doc. + + file_info = FileInfo.from_filename(self.filename) + + self.log("debug", "Saving record to database") + + if self.override_created is not None: + create_date = self.override_created + self.log( + "debug", + f"Creation date from post_documents parameter: {create_date}", + ) + elif file_info.created is not None: + create_date = file_info.created + self.log("debug", f"Creation date from FileInfo: {create_date}") + elif date is not None: + create_date = date + self.log("debug", f"Creation date from parse_date: {create_date}") + else: + stats = os.stat(self.path) + create_date = timezone.make_aware( + datetime.datetime.fromtimestamp(stats.st_mtime), + ) + self.log("debug", f"Creation date from st_mtime: {create_date}") + + storage_type = Document.STORAGE_TYPE_UNENCRYPTED + + with open(self.path, "rb") as f: + document = Document.objects.create( + title=(self.override_title or file_info.title)[:127], + content=text, + mime_type=mime_type, + checksum=hashlib.md5(f.read()).hexdigest(), + created=create_date, + modified=create_date, + storage_type=storage_type, + original_filename=self.filename, + ) + + self.apply_overrides(document) + + document.save() + + return document + + def apply_overrides(self, document: Document): + if self.override_correspondent_id: + document.correspondent = Correspondent.objects.get( + pk=self.override_correspondent_id, + ) + + if self.override_document_type_id: + document.document_type = DocumentType.objects.get( + pk=self.override_document_type_id, + ) + + if self.override_tag_ids: + for tag_id in self.override_tag_ids: + document.tags.add(Tag.objects.get(pk=tag_id)) + + if self.override_asn: + document.archive_serial_number = self.override_asn + + if self.override_owner_id: + document.owner = User.objects.get( + pk=self.override_owner_id, + ) + + if self.override_storage_path_id: + document.storage_path = StoragePath.objects.get( + id=self.override_storage_path_id, + ) + + if self.full_path: + folders = self.full_path.split('/')[:-1] + folders = [i for i in folders if i] + folder_path = '/'.join(folders) + print(f'folder_path: {folder_path}') + + for i in range(len(folders)): + sub_path = '/'.join(folders[:i+1]) + # Source: https://stackoverflow.com/a/21750566/5575610 + if StoragePath.objects.filter(path=sub_path).exists(): continue + print(f'Creating StoragePath: {sub_path}') + 
StoragePath.objects.create(name=sub_path, path=sub_path) + + print(f'Assigning StoragePath: "{folder_path}" to file') + document.storage_path = StoragePath.objects.get(path=folder_path) + + + def _write(self, storage_type, source, target): + with open(source, "rb") as read_file, open(target, "wb") as write_file: + write_file.write(read_file.read()) + + def _log_script_outputs(self, completed_process: CompletedProcess): + """ + Decodes a process stdout and stderr streams and logs them to the main log + """ + # Log what the script exited as + self.log( + "info", + f"{completed_process.args[0]} exited {completed_process.returncode}", + ) + + # Decode the output (if any) + if len(completed_process.stdout): + stdout_str = ( + completed_process.stdout.decode("utf8", errors="ignore") + .strip() + .split( + "\n", + ) + ) + self.log("info", "Script stdout:") + for line in stdout_str: + self.log("info", line) + + if len(completed_process.stderr): + stderr_str = ( + completed_process.stderr.decode("utf8", errors="ignore") + .strip() + .split( + "\n", + ) + ) + + self.log("warning", "Script stderr:") + for line in stderr_str: + self.log("warning", line) diff --git a/src/documents/data_models.py b/src/documents/data_models.py index 047f4b695..9ea720b00 100644 --- a/src/documents/data_models.py +++ b/src/documents/data_models.py @@ -1,64 +1,64 @@ -import dataclasses -import datetime -import enum -from pathlib import Path -from typing import List -from typing import Optional - -import magic - - -@dataclasses.dataclass -class DocumentMetadataOverrides: - """ - Manages overrides for document fields which normally would - be set from content or matching. All fields default to None, - meaning no override is happening - """ - - filename: Optional[str] = None - title: Optional[str] = None - correspondent_id: Optional[int] = None - document_type_id: Optional[int] = None - tag_ids: Optional[List[int]] = None - created: Optional[datetime.datetime] = None - asn: Optional[int] = None - owner_id: Optional[int] = None - storage_path_id: Optional[int] = None - full_path: Optional[str] = None - - -class DocumentSource(enum.IntEnum): - """ - The source of an incoming document. May have other uses in the future - """ - - ConsumeFolder = enum.auto() - ApiUpload = enum.auto() - MailFetch = enum.auto() - - -@dataclasses.dataclass -class ConsumableDocument: - """ - Encapsulates an incoming document, either from consume folder, API upload - or mail fetching and certain useful operations on it. - """ - - source: DocumentSource - original_file: Path - mime_type: str = dataclasses.field(init=False, default=None) - - def __post_init__(self): - """ - After a dataclass is initialized, this is called to finalize some data - 1. Make sure the original path is an absolute, fully qualified path - 2. Get the mime type of the file - """ - # Always fully qualify the path first thing - # Just in case, convert to a path if it's a str - self.original_file = Path(self.original_file).resolve() - - # Get the file type once at init - # Note this function isn't called when the object is unpickled - self.mime_type = magic.from_file(self.original_file, mime=True) +import dataclasses +import datetime +import enum +from pathlib import Path +from typing import List +from typing import Optional + +import magic + + +@dataclasses.dataclass +class DocumentMetadataOverrides: + """ + Manages overrides for document fields which normally would + be set from content or matching. 
All fields default to None, + meaning no override is happening + """ + + filename: Optional[str] = None + title: Optional[str] = None + correspondent_id: Optional[int] = None + document_type_id: Optional[int] = None + tag_ids: Optional[List[int]] = None + created: Optional[datetime.datetime] = None + asn: Optional[int] = None + owner_id: Optional[int] = None + storage_path_id: Optional[int] = None + full_path: Optional[str] = None + + +class DocumentSource(enum.IntEnum): + """ + The source of an incoming document. May have other uses in the future + """ + + ConsumeFolder = enum.auto() + ApiUpload = enum.auto() + MailFetch = enum.auto() + + +@dataclasses.dataclass +class ConsumableDocument: + """ + Encapsulates an incoming document, either from consume folder, API upload + or mail fetching and certain useful operations on it. + """ + + source: DocumentSource + original_file: Path + mime_type: str = dataclasses.field(init=False, default=None) + + def __post_init__(self): + """ + After a dataclass is initialized, this is called to finalize some data + 1. Make sure the original path is an absolute, fully qualified path + 2. Get the mime type of the file + """ + # Always fully qualify the path first thing + # Just in case, convert to a path if it's a str + self.original_file = Path(self.original_file).resolve() + + # Get the file type once at init + # Note this function isn't called when the object is unpickled + self.mime_type = magic.from_file(self.original_file, mime=True) diff --git a/src/documents/filters.py b/src/documents/filters.py index fe2414776..271b91108 100644 --- a/src/documents/filters.py +++ b/src/documents/filters.py @@ -1,159 +1,159 @@ -from django.db.models import Q -from django_filters.rest_framework import BooleanFilter -from django_filters.rest_framework import Filter -from django_filters.rest_framework import FilterSet -from rest_framework_guardian.filters import ObjectPermissionsFilter - -from .models import Correspondent -from .models import Document -from .models import DocumentType -from .models import Log -from .models import StoragePath -from .models import Tag - - -CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"] -ID_KWARGS = ["in", "exact"] -INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"] -DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"] - - -class CorrespondentFilterSet(FilterSet): - class Meta: - model = Correspondent - fields = {"name": CHAR_KWARGS} - - -class TagFilterSet(FilterSet): - class Meta: - model = Tag - fields = {"name": CHAR_KWARGS} - - -class DocumentTypeFilterSet(FilterSet): - class Meta: - model = DocumentType - fields = {"name": CHAR_KWARGS} - - -class ObjectFilter(Filter): - def __init__(self, exclude=False, in_list=False, field_name=""): - super().__init__() - self.exclude = exclude - self.in_list = in_list - self.field_name = field_name - - def filter(self, qs, value): - if not value: - return qs - - try: - object_ids = [int(x) for x in value.split(",")] - except ValueError: - return qs - - if self.in_list: - qs = qs.filter(**{f"{self.field_name}__id__in": object_ids}).distinct() - else: - for obj_id in object_ids: - if self.exclude: - qs = qs.exclude(**{f"{self.field_name}__id": obj_id}) - else: - qs = qs.filter(**{f"{self.field_name}__id": obj_id}) - - return qs - - -class InboxFilter(Filter): - def filter(self, qs, value): - if value == "true": - return qs.filter(tags__is_inbox_tag=True) - elif value == "false": - return qs.exclude(tags__is_inbox_tag=True) - else: - return qs - - -class 
TitleContentFilter(Filter): - def filter(self, qs, value): - if value: - return qs.filter(Q(title__icontains=value) | Q(content__icontains=value)) - else: - return qs - - -class DocumentFilterSet(FilterSet): - - is_tagged = BooleanFilter( - label="Is tagged", - field_name="tags", - lookup_expr="isnull", - exclude=True, - ) - - tags__id__all = ObjectFilter(field_name="tags") - - tags__id__none = ObjectFilter(field_name="tags", exclude=True) - - tags__id__in = ObjectFilter(field_name="tags", in_list=True) - - correspondent__id__none = ObjectFilter(field_name="correspondent", exclude=True) - - document_type__id__none = ObjectFilter(field_name="document_type", exclude=True) - - storage_path__id__none = ObjectFilter(field_name="storage_path", exclude=True) - - is_in_inbox = InboxFilter() - - title_content = TitleContentFilter() - - class Meta: - model = Document - fields = { - "title": CHAR_KWARGS, - "content": CHAR_KWARGS, - "archive_serial_number": INT_KWARGS, - "created": DATE_KWARGS, - "added": DATE_KWARGS, - "modified": DATE_KWARGS, - "correspondent": ["isnull"], - "correspondent__id": ID_KWARGS, - "correspondent__name": CHAR_KWARGS, - "tags__id": ID_KWARGS, - "tags__name": CHAR_KWARGS, - "document_type": ["isnull"], - "document_type__id": ID_KWARGS, - "document_type__name": CHAR_KWARGS, - "storage_path": ["isnull"], - "storage_path__id": ID_KWARGS, - "storage_path__name": CHAR_KWARGS, - } - - -class LogFilterSet(FilterSet): - class Meta: - model = Log - fields = {"level": INT_KWARGS, "created": DATE_KWARGS, "group": ID_KWARGS} - - -class StoragePathFilterSet(FilterSet): - class Meta: - model = StoragePath - fields = { - "name": CHAR_KWARGS, - "path": CHAR_KWARGS, - } - - -class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter): - """ - A filter backend that limits results to those where the requesting user - has read object level permissions, owns the objects, or objects without - an owner (for backwards compat) - """ - - def filter_queryset(self, request, queryset, view): - objects_with_perms = super().filter_queryset(request, queryset, view) - objects_owned = queryset.filter(owner=request.user) - objects_unowned = queryset.filter(owner__isnull=True) - return objects_with_perms | objects_owned | objects_unowned +from django.db.models import Q +from django_filters.rest_framework import BooleanFilter +from django_filters.rest_framework import Filter +from django_filters.rest_framework import FilterSet +from rest_framework_guardian.filters import ObjectPermissionsFilter + +from .models import Correspondent +from .models import Document +from .models import DocumentType +from .models import Log +from .models import StoragePath +from .models import Tag + + +CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"] +ID_KWARGS = ["in", "exact"] +INT_KWARGS = ["exact", "gt", "gte", "lt", "lte", "isnull"] +DATE_KWARGS = ["year", "month", "day", "date__gt", "gt", "date__lt", "lt"] + + +class CorrespondentFilterSet(FilterSet): + class Meta: + model = Correspondent + fields = {"name": CHAR_KWARGS} + + +class TagFilterSet(FilterSet): + class Meta: + model = Tag + fields = {"name": CHAR_KWARGS} + + +class DocumentTypeFilterSet(FilterSet): + class Meta: + model = DocumentType + fields = {"name": CHAR_KWARGS} + + +class ObjectFilter(Filter): + def __init__(self, exclude=False, in_list=False, field_name=""): + super().__init__() + self.exclude = exclude + self.in_list = in_list + self.field_name = field_name + + def filter(self, qs, value): + if not value: + return qs + + try: + 
object_ids = [int(x) for x in value.split(",")] + except ValueError: + return qs + + if self.in_list: + qs = qs.filter(**{f"{self.field_name}__id__in": object_ids}).distinct() + else: + for obj_id in object_ids: + if self.exclude: + qs = qs.exclude(**{f"{self.field_name}__id": obj_id}) + else: + qs = qs.filter(**{f"{self.field_name}__id": obj_id}) + + return qs + + +class InboxFilter(Filter): + def filter(self, qs, value): + if value == "true": + return qs.filter(tags__is_inbox_tag=True) + elif value == "false": + return qs.exclude(tags__is_inbox_tag=True) + else: + return qs + + +class TitleContentFilter(Filter): + def filter(self, qs, value): + if value: + return qs.filter(Q(title__icontains=value) | Q(content__icontains=value)) + else: + return qs + + +class DocumentFilterSet(FilterSet): + + is_tagged = BooleanFilter( + label="Is tagged", + field_name="tags", + lookup_expr="isnull", + exclude=True, + ) + + tags__id__all = ObjectFilter(field_name="tags") + + tags__id__none = ObjectFilter(field_name="tags", exclude=True) + + tags__id__in = ObjectFilter(field_name="tags", in_list=True) + + correspondent__id__none = ObjectFilter(field_name="correspondent", exclude=True) + + document_type__id__none = ObjectFilter(field_name="document_type", exclude=True) + + storage_path__id__none = ObjectFilter(field_name="storage_path", exclude=True) + + is_in_inbox = InboxFilter() + + title_content = TitleContentFilter() + + class Meta: + model = Document + fields = { + "title": CHAR_KWARGS, + "content": CHAR_KWARGS, + "archive_serial_number": INT_KWARGS, + "created": DATE_KWARGS, + "added": DATE_KWARGS, + "modified": DATE_KWARGS, + "correspondent": ["isnull"], + "correspondent__id": ID_KWARGS, + "correspondent__name": CHAR_KWARGS, + "tags__id": ID_KWARGS, + "tags__name": CHAR_KWARGS, + "document_type": ["isnull"], + "document_type__id": ID_KWARGS, + "document_type__name": CHAR_KWARGS, + "storage_path": ["isnull"], + "storage_path__id": ID_KWARGS, + "storage_path__name": CHAR_KWARGS, + } + + +class LogFilterSet(FilterSet): + class Meta: + model = Log + fields = {"level": INT_KWARGS, "created": DATE_KWARGS, "group": ID_KWARGS} + + +class StoragePathFilterSet(FilterSet): + class Meta: + model = StoragePath + fields = { + "name": CHAR_KWARGS, + "path": CHAR_KWARGS, + } + + +class ObjectOwnedOrGrantedPermissionsFilter(ObjectPermissionsFilter): + """ + A filter backend that limits results to those where the requesting user + has read object level permissions, owns the objects, or objects without + an owner (for backwards compat) + """ + + def filter_queryset(self, request, queryset, view): + objects_with_perms = super().filter_queryset(request, queryset, view) + objects_owned = queryset.filter(owner=request.user) + objects_unowned = queryset.filter(owner__isnull=True) + return objects_with_perms | objects_owned | objects_unowned diff --git a/src/documents/index.py b/src/documents/index.py index a1b2f2b4d..21768f65f 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -1,343 +1,343 @@ -import logging -import math -import os -from contextlib import contextmanager - -from dateutil.parser import isoparse -from django.conf import settings -from django.utils import timezone -from documents.models import Document, Metadata -from documents.models import Note -from guardian.shortcuts import get_users_with_perms -from whoosh import classify -from whoosh import highlight -from whoosh import query -from whoosh.fields import BOOLEAN -from whoosh.fields import DATETIME -from whoosh.fields import KEYWORD -from 
whoosh.fields import NUMERIC -from whoosh.fields import Schema -from whoosh.fields import TEXT -from whoosh.highlight import HtmlFormatter -from whoosh.index import create_in -from whoosh.index import exists_in -from whoosh.index import open_dir -from whoosh.qparser import MultifieldParser -from whoosh.qparser.dateparse import DateParserPlugin -from whoosh.searching import ResultsPage -from whoosh.searching import Searcher -from whoosh.writing import AsyncWriter - -logger = logging.getLogger("paperless.index") - - -def get_schema(): - return Schema( - id=NUMERIC(stored=True, unique=True), - title=TEXT(sortable=True), - content=TEXT(), - asn=NUMERIC(sortable=True, signed=False), - correspondent=TEXT(sortable=True), - correspondent_id=NUMERIC(), - has_correspondent=BOOLEAN(), - tag=KEYWORD(commas=True, scorable=True, lowercase=True), - tag_id=KEYWORD(commas=True, scorable=True), - has_tag=BOOLEAN(), - type=TEXT(sortable=True), - type_id=NUMERIC(), - has_type=BOOLEAN(), - created=DATETIME(sortable=True), - modified=DATETIME(sortable=True), - added=DATETIME(sortable=True), - path=TEXT(sortable=True), - path_id=NUMERIC(), - has_path=BOOLEAN(), - notes=TEXT(), - metadatas=TEXT(), - owner=TEXT(), - owner_id=NUMERIC(), - has_owner=BOOLEAN(), - viewer_id=KEYWORD(commas=True), - ) - - -def open_index(recreate=False): - try: - if exists_in(settings.INDEX_DIR) and not recreate: - return open_dir(settings.INDEX_DIR, schema=get_schema()) - except Exception: - logger.exception("Error while opening the index, recreating.") - - if not os.path.isdir(settings.INDEX_DIR): - os.makedirs(settings.INDEX_DIR, exist_ok=True) - return create_in(settings.INDEX_DIR, get_schema()) - - -@contextmanager -def open_index_writer(optimize=False): - writer = AsyncWriter(open_index()) - - try: - yield writer - except Exception as e: - logger.exception(str(e)) - writer.cancel() - finally: - writer.commit(optimize=optimize) - - -@contextmanager -def open_index_searcher(): - searcher = open_index().searcher() - - try: - yield searcher - finally: - searcher.close() - - -def update_document(writer: AsyncWriter, doc: Document): - tags = ",".join([t.name for t in doc.tags.all()]) - tags_ids = ",".join([str(t.id) for t in doc.tags.all()]) - notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)]) - latest_metadata = Metadata.objects.filter(document=doc).order_by('-created').first() - metadatas = str(latest_metadata) if latest_metadata else '' - asn = doc.archive_serial_number - if asn is not None and ( - asn < Document.ARCHIVE_SERIAL_NUMBER_MIN - or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX - ): - logger.error( - f"Not indexing Archive Serial Number {asn} of document {doc.pk}. 
" - f"ASN is out of range " - f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, " - f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.", - ) - asn = 0 - users_with_perms = get_users_with_perms( - doc, - only_with_perms_in=["view_document"], - ) - viewer_ids = ",".join([str(u.id) for u in users_with_perms]) - writer.update_document( - id=doc.pk, - title=doc.title, - content=doc.content, - correspondent=doc.correspondent.name if doc.correspondent else None, - correspondent_id=doc.correspondent.id if doc.correspondent else None, - has_correspondent=doc.correspondent is not None, - tag=tags if tags else None, - tag_id=tags_ids if tags_ids else None, - has_tag=len(tags) > 0, - type=doc.document_type.name if doc.document_type else None, - type_id=doc.document_type.id if doc.document_type else None, - has_type=doc.document_type is not None, - created=doc.created, - added=doc.added, - asn=asn, - modified=doc.modified, - path=doc.storage_path.name if doc.storage_path else None, - path_id=doc.storage_path.id if doc.storage_path else None, - has_path=doc.storage_path is not None, - notes=notes, - # metadatas=metadatas, - owner=doc.owner.username if doc.owner else None, - owner_id=doc.owner.id if doc.owner else None, - has_owner=doc.owner is not None, - viewer_id=viewer_ids if viewer_ids else None, - ) - - -def remove_document(writer, doc): - remove_document_by_id(writer, doc.pk) - - -def remove_document_by_id(writer, doc_id): - writer.delete_by_term("id", doc_id) - - -def add_or_update_document(document): - with open_index_writer() as writer: - update_document(writer, document) - - -def remove_document_from_index(document): - with open_index_writer() as writer: - remove_document(writer, document) - - -class DelayedQuery: - def _get_query(self): - raise NotImplementedError - - def _get_query_filter(self): - criterias = [] - for k, v in self.query_params.items(): - if k == "correspondent__id": - criterias.append(query.Term("correspondent_id", v)) - elif k == "tags__id__all": - for tag_id in v.split(","): - criterias.append(query.Term("tag_id", tag_id)) - elif k == "tags__id__none": - for tag_id in v.split(","): - criterias.append(query.Not(query.Term("tag_id", tag_id))) - elif k == "document_type__id": - criterias.append(query.Term("type_id", v)) - elif k == "correspondent__isnull": - criterias.append(query.Term("has_correspondent", v == "false")) - elif k == "is_tagged": - criterias.append(query.Term("has_tag", v == "true")) - elif k == "document_type__isnull": - criterias.append(query.Term("has_type", v == "false")) - elif k == "created__date__lt": - criterias.append( - query.DateRange("created", start=None, end=isoparse(v)), - ) - elif k == "created__date__gt": - criterias.append( - query.DateRange("created", start=isoparse(v), end=None), - ) - elif k == "added__date__gt": - criterias.append(query.DateRange("added", start=isoparse(v), end=None)) - elif k == "added__date__lt": - criterias.append(query.DateRange("added", start=None, end=isoparse(v))) - elif k == "storage_path__id": - criterias.append(query.Term("path_id", v)) - elif k == "storage_path__isnull": - criterias.append(query.Term("has_path", v == "false")) - - user_criterias = [query.Term("has_owner", False)] - if "user" in self.query_params: - user_criterias.append(query.Term("owner_id", self.query_params["user"])) - user_criterias.append( - query.Term("viewer_id", str(self.query_params["user"])), - ) - if len(criterias) > 0: - criterias.append(query.Or(user_criterias)) - return query.And(criterias) - else: - return query.Or(user_criterias) - - def 
_get_query_sortedby(self): - if "ordering" not in self.query_params: - return None, False - - field: str = self.query_params["ordering"] - - sort_fields_map = { - "created": "created", - "modified": "modified", - "added": "added", - "title": "title", - "correspondent__name": "correspondent", - "document_type__name": "type", - "archive_serial_number": "asn", - } - - if field.startswith("-"): - field = field[1:] - reverse = True - else: - reverse = False - - if field not in sort_fields_map: - return None, False - else: - return sort_fields_map[field], reverse - - def __init__(self, searcher: Searcher, query_params, page_size): - self.searcher = searcher - self.query_params = query_params - self.page_size = page_size - self.saved_results = dict() - self.first_score = None - - def __len__(self): - page = self[0:1] - return len(page) - - def __getitem__(self, item): - if item.start in self.saved_results: - return self.saved_results[item.start] - - q, mask = self._get_query() - sortedby, reverse = self._get_query_sortedby() - - page: ResultsPage = self.searcher.search_page( - q, - mask=mask, - filter=self._get_query_filter(), - pagenum=math.floor(item.start / self.page_size) + 1, - pagelen=self.page_size, - sortedby=sortedby, - reverse=reverse, - ) - page.results.fragmenter = highlight.ContextFragmenter(surround=50) - page.results.formatter = HtmlFormatter(tagname="span", between=" ... ") - - if not self.first_score and len(page.results) > 0 and sortedby is None: - self.first_score = page.results[0].score - - page.results.top_n = list( - map( - lambda hit: ( - (hit[0] / self.first_score) if self.first_score else None, - hit[1], - ), - page.results.top_n, - ), - ) - - self.saved_results[item.start] = page - - return page - - -class DelayedFullTextQuery(DelayedQuery): - def _get_query(self): - q_str = self.query_params["query"] - qp = MultifieldParser( - ["content", "title", "correspondent", "tag", "type", "notes", "metadatas"], - self.searcher.ixreader.schema, - ) - qp.add_plugin(DateParserPlugin(basedate=timezone.now())) - q = qp.parse(q_str) - - corrected = self.searcher.correct_query(q, q_str) - if corrected.query != q: - corrected.query = corrected.string - - return q, None - - -class DelayedMoreLikeThisQuery(DelayedQuery): - def _get_query(self): - more_like_doc_id = int(self.query_params["more_like_id"]) - content = Document.objects.get(id=more_like_doc_id).content - - docnum = self.searcher.document_number(id=more_like_doc_id) - kts = self.searcher.key_terms_from_text( - "content", - content, - numterms=20, - model=classify.Bo1Model, - normalize=False, - ) - q = query.Or( - [query.Term("content", word, boost=weight) for word, weight in kts], - ) - mask = {docnum} - - return q, mask - - -def autocomplete(ix, term, limit=10): - with ix.reader() as reader: - terms = [] - for (score, t) in reader.most_distinctive_terms( - "content", - number=limit, - prefix=term.lower(), - ): - terms.append(t) - return terms +import logging +import math +import os +from contextlib import contextmanager + +from dateutil.parser import isoparse +from django.conf import settings +from django.utils import timezone +from documents.models import Document, Metadata +from documents.models import Note +from guardian.shortcuts import get_users_with_perms +from whoosh import classify +from whoosh import highlight +from whoosh import query +from whoosh.fields import BOOLEAN +from whoosh.fields import DATETIME +from whoosh.fields import KEYWORD +from whoosh.fields import NUMERIC +from whoosh.fields import Schema +from 
whoosh.fields import TEXT +from whoosh.highlight import HtmlFormatter +from whoosh.index import create_in +from whoosh.index import exists_in +from whoosh.index import open_dir +from whoosh.qparser import MultifieldParser +from whoosh.qparser.dateparse import DateParserPlugin +from whoosh.searching import ResultsPage +from whoosh.searching import Searcher +from whoosh.writing import AsyncWriter + +logger = logging.getLogger("paperless.index") + + +def get_schema(): + return Schema( + id=NUMERIC(stored=True, unique=True), + title=TEXT(sortable=True), + content=TEXT(), + asn=NUMERIC(sortable=True, signed=False), + correspondent=TEXT(sortable=True), + correspondent_id=NUMERIC(), + has_correspondent=BOOLEAN(), + tag=KEYWORD(commas=True, scorable=True, lowercase=True), + tag_id=KEYWORD(commas=True, scorable=True), + has_tag=BOOLEAN(), + type=TEXT(sortable=True), + type_id=NUMERIC(), + has_type=BOOLEAN(), + created=DATETIME(sortable=True), + modified=DATETIME(sortable=True), + added=DATETIME(sortable=True), + path=TEXT(sortable=True), + path_id=NUMERIC(), + has_path=BOOLEAN(), + notes=TEXT(), + metadatas=TEXT(), + owner=TEXT(), + owner_id=NUMERIC(), + has_owner=BOOLEAN(), + viewer_id=KEYWORD(commas=True), + ) + + +def open_index(recreate=False): + try: + if exists_in(settings.INDEX_DIR) and not recreate: + return open_dir(settings.INDEX_DIR, schema=get_schema()) + except Exception: + logger.exception("Error while opening the index, recreating.") + + if not os.path.isdir(settings.INDEX_DIR): + os.makedirs(settings.INDEX_DIR, exist_ok=True) + return create_in(settings.INDEX_DIR, get_schema()) + + +@contextmanager +def open_index_writer(optimize=False): + writer = AsyncWriter(open_index()) + + try: + yield writer + except Exception as e: + logger.exception(str(e)) + writer.cancel() + finally: + writer.commit(optimize=optimize) + + +@contextmanager +def open_index_searcher(): + searcher = open_index().searcher() + + try: + yield searcher + finally: + searcher.close() + + +def update_document(writer: AsyncWriter, doc: Document): + tags = ",".join([t.name for t in doc.tags.all()]) + tags_ids = ",".join([str(t.id) for t in doc.tags.all()]) + notes = ",".join([str(c.note) for c in Note.objects.filter(document=doc)]) + latest_metadata = Metadata.objects.filter(document=doc).order_by('-created').first() + metadatas = str(latest_metadata) if latest_metadata else '' + asn = doc.archive_serial_number + if asn is not None and ( + asn < Document.ARCHIVE_SERIAL_NUMBER_MIN + or asn > Document.ARCHIVE_SERIAL_NUMBER_MAX + ): + logger.error( + f"Not indexing Archive Serial Number {asn} of document {doc.pk}. 
" + f"ASN is out of range " + f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, " + f"{Document.ARCHIVE_SERIAL_NUMBER_MAX:,}.", + ) + asn = 0 + users_with_perms = get_users_with_perms( + doc, + only_with_perms_in=["view_document"], + ) + viewer_ids = ",".join([str(u.id) for u in users_with_perms]) + writer.update_document( + id=doc.pk, + title=doc.title, + content=doc.content, + correspondent=doc.correspondent.name if doc.correspondent else None, + correspondent_id=doc.correspondent.id if doc.correspondent else None, + has_correspondent=doc.correspondent is not None, + tag=tags if tags else None, + tag_id=tags_ids if tags_ids else None, + has_tag=len(tags) > 0, + type=doc.document_type.name if doc.document_type else None, + type_id=doc.document_type.id if doc.document_type else None, + has_type=doc.document_type is not None, + created=doc.created, + added=doc.added, + asn=asn, + modified=doc.modified, + path=doc.storage_path.name if doc.storage_path else None, + path_id=doc.storage_path.id if doc.storage_path else None, + has_path=doc.storage_path is not None, + notes=notes, + # metadatas=metadatas, + owner=doc.owner.username if doc.owner else None, + owner_id=doc.owner.id if doc.owner else None, + has_owner=doc.owner is not None, + viewer_id=viewer_ids if viewer_ids else None, + ) + + +def remove_document(writer, doc): + remove_document_by_id(writer, doc.pk) + + +def remove_document_by_id(writer, doc_id): + writer.delete_by_term("id", doc_id) + + +def add_or_update_document(document): + with open_index_writer() as writer: + update_document(writer, document) + + +def remove_document_from_index(document): + with open_index_writer() as writer: + remove_document(writer, document) + + +class DelayedQuery: + def _get_query(self): + raise NotImplementedError + + def _get_query_filter(self): + criterias = [] + for k, v in self.query_params.items(): + if k == "correspondent__id": + criterias.append(query.Term("correspondent_id", v)) + elif k == "tags__id__all": + for tag_id in v.split(","): + criterias.append(query.Term("tag_id", tag_id)) + elif k == "tags__id__none": + for tag_id in v.split(","): + criterias.append(query.Not(query.Term("tag_id", tag_id))) + elif k == "document_type__id": + criterias.append(query.Term("type_id", v)) + elif k == "correspondent__isnull": + criterias.append(query.Term("has_correspondent", v == "false")) + elif k == "is_tagged": + criterias.append(query.Term("has_tag", v == "true")) + elif k == "document_type__isnull": + criterias.append(query.Term("has_type", v == "false")) + elif k == "created__date__lt": + criterias.append( + query.DateRange("created", start=None, end=isoparse(v)), + ) + elif k == "created__date__gt": + criterias.append( + query.DateRange("created", start=isoparse(v), end=None), + ) + elif k == "added__date__gt": + criterias.append(query.DateRange("added", start=isoparse(v), end=None)) + elif k == "added__date__lt": + criterias.append(query.DateRange("added", start=None, end=isoparse(v))) + elif k == "storage_path__id": + criterias.append(query.Term("path_id", v)) + elif k == "storage_path__isnull": + criterias.append(query.Term("has_path", v == "false")) + + user_criterias = [query.Term("has_owner", False)] + if "user" in self.query_params: + user_criterias.append(query.Term("owner_id", self.query_params["user"])) + user_criterias.append( + query.Term("viewer_id", str(self.query_params["user"])), + ) + if len(criterias) > 0: + criterias.append(query.Or(user_criterias)) + return query.And(criterias) + else: + return query.Or(user_criterias) + + def 
_get_query_sortedby(self): + if "ordering" not in self.query_params: + return None, False + + field: str = self.query_params["ordering"] + + sort_fields_map = { + "created": "created", + "modified": "modified", + "added": "added", + "title": "title", + "correspondent__name": "correspondent", + "document_type__name": "type", + "archive_serial_number": "asn", + } + + if field.startswith("-"): + field = field[1:] + reverse = True + else: + reverse = False + + if field not in sort_fields_map: + return None, False + else: + return sort_fields_map[field], reverse + + def __init__(self, searcher: Searcher, query_params, page_size): + self.searcher = searcher + self.query_params = query_params + self.page_size = page_size + self.saved_results = dict() + self.first_score = None + + def __len__(self): + page = self[0:1] + return len(page) + + def __getitem__(self, item): + if item.start in self.saved_results: + return self.saved_results[item.start] + + q, mask = self._get_query() + sortedby, reverse = self._get_query_sortedby() + + page: ResultsPage = self.searcher.search_page( + q, + mask=mask, + filter=self._get_query_filter(), + pagenum=math.floor(item.start / self.page_size) + 1, + pagelen=self.page_size, + sortedby=sortedby, + reverse=reverse, + ) + page.results.fragmenter = highlight.ContextFragmenter(surround=50) + page.results.formatter = HtmlFormatter(tagname="span", between=" ... ") + + if not self.first_score and len(page.results) > 0 and sortedby is None: + self.first_score = page.results[0].score + + page.results.top_n = list( + map( + lambda hit: ( + (hit[0] / self.first_score) if self.first_score else None, + hit[1], + ), + page.results.top_n, + ), + ) + + self.saved_results[item.start] = page + + return page + + +class DelayedFullTextQuery(DelayedQuery): + def _get_query(self): + q_str = self.query_params["query"] + qp = MultifieldParser( + ["content", "title", "correspondent", "tag", "type", "notes", "metadatas"], + self.searcher.ixreader.schema, + ) + qp.add_plugin(DateParserPlugin(basedate=timezone.now())) + q = qp.parse(q_str) + + corrected = self.searcher.correct_query(q, q_str) + if corrected.query != q: + corrected.query = corrected.string + + return q, None + + +class DelayedMoreLikeThisQuery(DelayedQuery): + def _get_query(self): + more_like_doc_id = int(self.query_params["more_like_id"]) + content = Document.objects.get(id=more_like_doc_id).content + + docnum = self.searcher.document_number(id=more_like_doc_id) + kts = self.searcher.key_terms_from_text( + "content", + content, + numterms=20, + model=classify.Bo1Model, + normalize=False, + ) + q = query.Or( + [query.Term("content", word, boost=weight) for word, weight in kts], + ) + mask = {docnum} + + return q, mask + + +def autocomplete(ix, term, limit=10): + with ix.reader() as reader: + terms = [] + for (score, t) in reader.most_distinctive_terms( + "content", + number=limit, + prefix=term.lower(), + ): + terms.append(t) + return terms diff --git a/src/documents/migrations/1036_add_metadata.py b/src/documents/migrations/1036_add_metadata.py index b23221ea7..44998f2b9 100644 --- a/src/documents/migrations/1036_add_metadata.py +++ b/src/documents/migrations/1036_add_metadata.py @@ -1,69 +1,69 @@ -from django.db import migrations, models -import django.utils.timezone -from django.conf import settings - - -class Migration(migrations.Migration): - dependencies = [ - ("documents", "1035_rename_comment_note"), - ] - - operations = [ - migrations.CreateModel( - name="Metadata", - fields=[ - ( - "id", - models.AutoField( - 
auto_created=True, - primary_key=True, - serialize=False, - verbose_name="ID", - ), - ), - ( - "data", - models.JSONField( - blank=True, - help_text="JSON metadata", - verbose_name="data" - ), - ), - ( - "created", - models.DateTimeField( - db_index=True, - default=django.utils.timezone.now, - verbose_name="created", - ), - ), - ( - "document", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.CASCADE, - related_name="metadatas", - to="documents.document", - verbose_name="document", - ), - ), - ( - "user", - models.ForeignKey( - blank=True, - null=True, - on_delete=django.db.models.deletion.SET_NULL, - related_name="metadatas", - to=settings.AUTH_USER_MODEL, - verbose_name="user", - ), - ), - ], - options={ - "verbose_name": "metadata", - "verbose_name_plural": "metadatas", - "ordering": ("created",), - }, - ), - ] +from django.db import migrations, models +import django.utils.timezone +from django.conf import settings + + +class Migration(migrations.Migration): + dependencies = [ + ("documents", "1035_rename_comment_note"), + ] + + operations = [ + migrations.CreateModel( + name="Metadata", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "data", + models.JSONField( + blank=True, + help_text="JSON metadata", + verbose_name="data" + ), + ), + ( + "created", + models.DateTimeField( + db_index=True, + default=django.utils.timezone.now, + verbose_name="created", + ), + ), + ( + "document", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.CASCADE, + related_name="metadatas", + to="documents.document", + verbose_name="document", + ), + ), + ( + "user", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="metadatas", + to=settings.AUTH_USER_MODEL, + verbose_name="user", + ), + ), + ], + options={ + "verbose_name": "metadata", + "verbose_name_plural": "metadatas", + "ordering": ("created",), + }, + ), + ] diff --git a/src/documents/migrations/1037_alter_documenttype_add_default_metadata.py b/src/documents/migrations/1037_alter_documenttype_add_default_metadata.py index 5cc298913..f2ee75e08 100644 --- a/src/documents/migrations/1037_alter_documenttype_add_default_metadata.py +++ b/src/documents/migrations/1037_alter_documenttype_add_default_metadata.py @@ -1,24 +1,24 @@ -# Generated by Django 4.1.7 on 2023-07-23 17:36 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('documents', '1036_add_metadata'), - ] - - operations = [ - migrations.AddField( - model_name='documenttype', - name='default_metadata', - field=models.JSONField(blank=True, help_text='Default JSON metadata', null=True, verbose_name='default_metadata'), - ), - migrations.AlterField( - model_name='metadata', - name='document', - field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='document', to='documents.document', verbose_name='document'), - ), +# Generated by Django 4.1.7 on 2023-07-23 17:36 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1036_add_metadata'), + ] + + operations = [ + migrations.AddField( + model_name='documenttype', + name='default_metadata', + field=models.JSONField(blank=True, help_text='Default JSON metadata', null=True, 
verbose_name='default_metadata'), + ), + migrations.AlterField( + model_name='metadata', + name='document', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='document', to='documents.document', verbose_name='document'), + ), ] \ No newline at end of file diff --git a/src/documents/migrations/1038_alter_metadata_document.py b/src/documents/migrations/1038_alter_metadata_document.py index 4d4a112b1..3e710fd88 100644 --- a/src/documents/migrations/1038_alter_metadata_document.py +++ b/src/documents/migrations/1038_alter_metadata_document.py @@ -1,19 +1,19 @@ -# Generated by Django 4.1.7 on 2023-07-27 02:44 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('documents', '1037_alter_documenttype_add_default_metadata'), - ] - - operations = [ - migrations.AlterField( - model_name='metadata', - name='document', - field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='metadatas', to='documents.document', verbose_name='document'), - ), +# Generated by Django 4.1.7 on 2023-07-27 02:44 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '1037_alter_documenttype_add_default_metadata'), + ] + + operations = [ + migrations.AlterField( + model_name='metadata', + name='document', + field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='metadatas', to='documents.document', verbose_name='document'), + ), ] \ No newline at end of file diff --git a/src/documents/models.py b/src/documents/models.py index 56921a0c2..b6525618f 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -1,728 +1,728 @@ -import datetime -import logging -import os -import re -from collections import OrderedDict -from pathlib import Path -from typing import Final -from typing import Optional - -import dateutil.parser -import pathvalidate -from celery import states -from django.conf import settings -from django.contrib.auth.models import User -from django.core.validators import MaxValueValidator -from django.core.validators import MinValueValidator -from django.db import models -from django.utils import timezone -from django.utils.translation import gettext_lazy as _ -from documents.parsers import get_default_file_extension - -ALL_STATES = sorted(states.ALL_STATES) -TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES)) - - -class ModelWithOwner(models.Model): - owner = models.ForeignKey( - User, - blank=True, - null=True, - on_delete=models.SET_NULL, - verbose_name=_("owner"), - ) - - class Meta: - abstract = True - - -class MatchingModel(ModelWithOwner): - - MATCH_NONE = 0 - MATCH_ANY = 1 - MATCH_ALL = 2 - MATCH_LITERAL = 3 - MATCH_REGEX = 4 - MATCH_FUZZY = 5 - MATCH_AUTO = 6 - - MATCHING_ALGORITHMS = ( - (MATCH_NONE, _("None")), - (MATCH_ANY, _("Any word")), - (MATCH_ALL, _("All words")), - (MATCH_LITERAL, _("Exact match")), - (MATCH_REGEX, _("Regular expression")), - (MATCH_FUZZY, _("Fuzzy word")), - (MATCH_AUTO, _("Automatic")), - ) - - name = models.CharField(_("name"), max_length=128) - - match = models.CharField(_("match"), max_length=256, blank=True) - - matching_algorithm = models.PositiveIntegerField( - _("matching algorithm"), - choices=MATCHING_ALGORITHMS, - default=MATCH_ANY, - ) - - is_insensitive = models.BooleanField(_("is insensitive"), 
default=True) - - class Meta: - abstract = True - ordering = ("name",) - constraints = [ - models.UniqueConstraint( - fields=["name", "owner"], - name="%(app_label)s_%(class)s_unique_name_owner", - ), - models.UniqueConstraint( - name="%(app_label)s_%(class)s_name_uniq", - fields=["name"], - condition=models.Q(owner__isnull=True), - ), - ] - - def __str__(self): - return self.name - - -class Correspondent(MatchingModel): - class Meta(MatchingModel.Meta): - verbose_name = _("correspondent") - verbose_name_plural = _("correspondents") - - -class Tag(MatchingModel): - - color = models.CharField(_("color"), max_length=7, default="#a6cee3") - - is_inbox_tag = models.BooleanField( - _("is inbox tag"), - default=False, - help_text=_( - "Marks this tag as an inbox tag: All newly consumed " - "documents will be tagged with inbox tags.", - ), - ) - - class Meta(MatchingModel.Meta): - verbose_name = _("tag") - verbose_name_plural = _("tags") - - -class DocumentType(MatchingModel): - default_metadata = models.JSONField( - _("default_metadata"), - # Source: https://stackoverflow.com/a/47590145/5575610 - null=True, - blank=True, - help_text=_("Default JSON metadata"), - ) - - class Meta(MatchingModel.Meta): - verbose_name = _("document type") - verbose_name_plural = _("document types") - - -class StoragePath(MatchingModel): - path = models.CharField( - _("path"), - max_length=512, - ) - - class Meta(MatchingModel.Meta): - verbose_name = _("storage path") - verbose_name_plural = _("storage paths") - - -class Document(ModelWithOwner): - - STORAGE_TYPE_UNENCRYPTED = "unencrypted" - STORAGE_TYPE_GPG = "gpg" - STORAGE_TYPES = ( - (STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")), - (STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard")), - ) - - correspondent = models.ForeignKey( - Correspondent, - blank=True, - null=True, - related_name="documents", - on_delete=models.SET_NULL, - verbose_name=_("correspondent"), - ) - - storage_path = models.ForeignKey( - StoragePath, - blank=True, - null=True, - related_name="documents", - on_delete=models.SET_NULL, - verbose_name=_("storage path"), - ) - - title = models.CharField(_("title"), max_length=128, blank=True, db_index=True) - - document_type = models.ForeignKey( - DocumentType, - blank=True, - null=True, - related_name="documents", - on_delete=models.SET_NULL, - verbose_name=_("document type"), - ) - - content = models.TextField( - _("content"), - blank=True, - help_text=_( - "The raw, text-only data of the document. 
This field is " - "primarily used for searching.", - ), - ) - - mime_type = models.CharField(_("mime type"), max_length=256, editable=False) - - tags = models.ManyToManyField( - Tag, - related_name="documents", - blank=True, - verbose_name=_("tags"), - ) - - checksum = models.CharField( - _("checksum"), - max_length=32, - editable=False, - unique=True, - help_text=_("The checksum of the original document."), - ) - - archive_checksum = models.CharField( - _("archive checksum"), - max_length=32, - editable=False, - blank=True, - null=True, - help_text=_("The checksum of the archived document."), - ) - - created = models.DateTimeField(_("created"), default=timezone.now, db_index=True) - - modified = models.DateTimeField( - _("modified"), - auto_now=True, - editable=False, - db_index=True, - ) - - storage_type = models.CharField( - _("storage type"), - max_length=11, - choices=STORAGE_TYPES, - default=STORAGE_TYPE_UNENCRYPTED, - editable=False, - ) - - added = models.DateTimeField( - _("added"), - default=timezone.now, - editable=False, - db_index=True, - ) - - filename = models.FilePathField( - _("filename"), - max_length=1024, - editable=False, - default=None, - unique=True, - null=True, - help_text=_("Current filename in storage"), - ) - - archive_filename = models.FilePathField( - _("archive filename"), - max_length=1024, - editable=False, - default=None, - unique=True, - null=True, - help_text=_("Current archive filename in storage"), - ) - - original_filename = models.CharField( - _("original filename"), - max_length=1024, - editable=False, - default=None, - unique=False, - null=True, - help_text=_("The original name of the file when it was uploaded"), - ) - - ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0 - ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF - - archive_serial_number = models.PositiveIntegerField( - _("archive serial number"), - blank=True, - null=True, - unique=True, - db_index=True, - validators=[ - MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX), - MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN), - ], - help_text=_( - "The position of this document in your physical document archive.", - ), - ) - - class Meta: - ordering = ("-created",) - verbose_name = _("document") - verbose_name_plural = _("documents") - - def __str__(self) -> str: - - # Convert UTC database time to local time - created = datetime.date.isoformat(timezone.localdate(self.created)) - - res = f"{created}" - - if self.correspondent: - res += f" {self.correspondent}" - if self.title: - res += f" {self.title}" - return res - - @property - def source_path(self) -> Path: - if self.filename: - fname = str(self.filename) - else: - fname = f"{self.pk:07}{self.file_type}" - if self.storage_type == self.STORAGE_TYPE_GPG: - fname += ".gpg" # pragma: no cover - - return (settings.ORIGINALS_DIR / Path(fname)).resolve() - - @property - def source_file(self): - return open(self.source_path, "rb") - - @property - def has_archive_version(self) -> bool: - return self.archive_filename is not None - - @property - def archive_path(self) -> Optional[Path]: - if self.has_archive_version: - return (settings.ARCHIVE_DIR / Path(str(self.archive_filename))).resolve() - else: - return None - - @property - def archive_file(self): - return open(self.archive_path, "rb") - - def get_public_filename(self, archive=False, counter=0, suffix=None) -> str: - """ - Returns a sanitized filename for the document, not including any paths. 
- """ - result = str(self) - - if counter: - result += f"_{counter:02}" - - if suffix: - result += suffix - - if archive: - result += ".pdf" - else: - result += self.file_type - - return pathvalidate.sanitize_filename(result, replacement_text="-") - - @property - def file_type(self): - return get_default_file_extension(self.mime_type) - - @property - def thumbnail_path(self) -> Path: - webp_file_name = f"{self.pk:07}.webp" - if self.storage_type == self.STORAGE_TYPE_GPG: - webp_file_name += ".gpg" - - webp_file_path = settings.THUMBNAIL_DIR / Path(webp_file_name) - - return webp_file_path.resolve() - - @property - def thumbnail_file(self): - return open(self.thumbnail_path, "rb") - - @property - def created_date(self): - return timezone.localdate(self.created) - - -class Log(models.Model): - - LEVELS = ( - (logging.DEBUG, _("debug")), - (logging.INFO, _("information")), - (logging.WARNING, _("warning")), - (logging.ERROR, _("error")), - (logging.CRITICAL, _("critical")), - ) - - group = models.UUIDField(_("group"), blank=True, null=True) - - message = models.TextField(_("message")) - - level = models.PositiveIntegerField( - _("level"), - choices=LEVELS, - default=logging.INFO, - ) - - created = models.DateTimeField(_("created"), auto_now_add=True) - - class Meta: - ordering = ("-created",) - verbose_name = _("log") - verbose_name_plural = _("logs") - - def __str__(self): - return self.message - - -class SavedView(ModelWithOwner): - class Meta: - - ordering = ("name",) - verbose_name = _("saved view") - verbose_name_plural = _("saved views") - - name = models.CharField(_("name"), max_length=128) - - show_on_dashboard = models.BooleanField( - _("show on dashboard"), - ) - show_in_sidebar = models.BooleanField( - _("show in sidebar"), - ) - - sort_field = models.CharField( - _("sort field"), - max_length=128, - null=True, - blank=True, - ) - sort_reverse = models.BooleanField(_("sort reverse"), default=False) - - -class SavedViewFilterRule(models.Model): - RULE_TYPES = [ - (0, _("title contains")), - (1, _("content contains")), - (2, _("ASN is")), - (3, _("correspondent is")), - (4, _("document type is")), - (5, _("is in inbox")), - (6, _("has tag")), - (7, _("has any tag")), - (8, _("created before")), - (9, _("created after")), - (10, _("created year is")), - (11, _("created month is")), - (12, _("created day is")), - (13, _("added before")), - (14, _("added after")), - (15, _("modified before")), - (16, _("modified after")), - (17, _("does not have tag")), - (18, _("does not have ASN")), - (19, _("title or content contains")), - (20, _("fulltext query")), - (21, _("more like this")), - (22, _("has tags in")), - (23, _("ASN greater than")), - (24, _("ASN less than")), - (25, _("storage path is")), - (26, _("has correspondent in")), - (27, _("does not have correspondent in")), - (28, _("has document type in")), - (29, _("does not have document type in")), - (30, _("has storage path in")), - (31, _("does not have storage path in")), - ] - - saved_view = models.ForeignKey( - SavedView, - on_delete=models.CASCADE, - related_name="filter_rules", - verbose_name=_("saved view"), - ) - - rule_type = models.PositiveIntegerField(_("rule type"), choices=RULE_TYPES) - - value = models.CharField(_("value"), max_length=255, blank=True, null=True) - - class Meta: - verbose_name = _("filter rule") - verbose_name_plural = _("filter rules") - - def __str__(self) -> str: - return f"SavedViewFilterRule: {self.rule_type} : {self.value}" - - -# TODO: why is this in the models file? 
-# TODO: how about, what is this and where is it documented? -# It appears to parsing JSON from an environment variable to get a title and date from -# the filename, if possible, as a higher priority than either document filename or -# content parsing -class FileInfo: - - REGEXES = OrderedDict( - [ - ( - "created-title", - re.compile( - r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$", - flags=re.IGNORECASE, - ), - ), - ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)), - ], - ) - - def __init__( - self, - created=None, - correspondent=None, - title=None, - tags=(), - extension=None, - ): - - self.created = created - self.title = title - self.extension = extension - self.correspondent = correspondent - self.tags = tags - - @classmethod - def _get_created(cls, created): - try: - return dateutil.parser.parse(f"{created[:-1]:0<14}Z") - except ValueError: - return None - - @classmethod - def _get_title(cls, title): - return title - - @classmethod - def _mangle_property(cls, properties, name): - if name in properties: - properties[name] = getattr(cls, f"_get_{name}")(properties[name]) - - @classmethod - def from_filename(cls, filename) -> "FileInfo": - # Mutate filename in-place before parsing its components - # by applying at most one of the configured transformations. - for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: - (filename, count) = pattern.subn(repl, filename) - if count: - break - - # do this after the transforms so that the transforms can do whatever - # with the file extension. - filename_no_ext = os.path.splitext(filename)[0] - - if filename_no_ext == filename and filename.startswith("."): - # This is a very special case where there is no text before the - # file type. - # TODO: this should be handled better. The ext is not removed - # because usually, files like '.pdf' are just hidden files - # with the name pdf, but in our case, its more likely that - # there's just no name to begin with. - filename = "" - # This isn't too bad either, since we'll just not match anything - # and return an empty title. TODO: actually, this is kinda bad. - else: - filename = filename_no_ext - - # Parse filename components. 
- for regex in cls.REGEXES.values(): - m = regex.match(filename) - if m: - properties = m.groupdict() - cls._mangle_property(properties, "created") - cls._mangle_property(properties, "title") - return cls(**properties) - - -# Extending User Model Using a One-To-One Link -class UiSettings(models.Model): - - user = models.OneToOneField( - User, - on_delete=models.CASCADE, - related_name="ui_settings", - ) - settings = models.JSONField(null=True) - - def __str__(self): - return self.user.username - - -class PaperlessTask(models.Model): - task_id = models.CharField( - max_length=255, - unique=True, - verbose_name=_("Task ID"), - help_text=_("Celery ID for the Task that was run"), - ) - - acknowledged = models.BooleanField( - default=False, - verbose_name=_("Acknowledged"), - help_text=_("If the task is acknowledged via the frontend or API"), - ) - - task_file_name = models.CharField( - null=True, - max_length=255, - verbose_name=_("Task Filename"), - help_text=_("Name of the file which the Task was run for"), - ) - - task_name = models.CharField( - null=True, - max_length=255, - verbose_name=_("Task Name"), - help_text=_("Name of the Task which was run"), - ) - - status = models.CharField( - max_length=30, - default=states.PENDING, - choices=TASK_STATE_CHOICES, - verbose_name=_("Task State"), - help_text=_("Current state of the task being run"), - ) - date_created = models.DateTimeField( - null=True, - default=timezone.now, - verbose_name=_("Created DateTime"), - help_text=_("Datetime field when the task result was created in UTC"), - ) - date_started = models.DateTimeField( - null=True, - default=None, - verbose_name=_("Started DateTime"), - help_text=_("Datetime field when the task was started in UTC"), - ) - date_done = models.DateTimeField( - null=True, - default=None, - verbose_name=_("Completed DateTime"), - help_text=_("Datetime field when the task was completed in UTC"), - ) - result = models.TextField( - null=True, - default=None, - verbose_name=_("Result Data"), - help_text=_( - "The data returned by the task", - ), - ) - - def __str__(self) -> str: - return f"Task {self.task_id}" - - -class Note(models.Model): - note = models.TextField( - _("content"), - blank=True, - help_text=_("Note for the document"), - ) - - created = models.DateTimeField( - _("created"), - default=timezone.now, - db_index=True, - ) - - document = models.ForeignKey( - Document, - blank=True, - null=True, - related_name="notes", - on_delete=models.CASCADE, - verbose_name=_("document"), - ) - - user = models.ForeignKey( - User, - blank=True, - null=True, - related_name="notes", - on_delete=models.SET_NULL, - verbose_name=_("user"), - ) - - class Meta: - ordering = ("created",) - verbose_name = _("note") - verbose_name_plural = _("notes") - - def __str__(self): - return self.note - -class Metadata(models.Model): - data = models.JSONField( - _("data"), - blank=True, - help_text=_("JSON metadata"), - ) - - created = models.DateTimeField( - _("created"), - default=timezone.now, - db_index=True, - ) - - document = models.ForeignKey( - Document, - blank=True, - null=True, - related_name="metadatas", - on_delete=models.CASCADE, - verbose_name=_("document"), - ) - - user = models.ForeignKey( - User, - blank=True, - null=True, - related_name="metadatas", - on_delete=models.SET_NULL, - verbose_name=_("user"), - ) - - class Meta: - ordering = ("created",) - verbose_name = _("metadata") - verbose_name_plural = _("metadatas") - - def __str__(self): +import datetime +import logging +import os +import re +from collections 
import OrderedDict +from pathlib import Path +from typing import Final +from typing import Optional + +import dateutil.parser +import pathvalidate +from celery import states +from django.conf import settings +from django.contrib.auth.models import User +from django.core.validators import MaxValueValidator +from django.core.validators import MinValueValidator +from django.db import models +from django.utils import timezone +from django.utils.translation import gettext_lazy as _ +from documents.parsers import get_default_file_extension + +ALL_STATES = sorted(states.ALL_STATES) +TASK_STATE_CHOICES = sorted(zip(ALL_STATES, ALL_STATES)) + + +class ModelWithOwner(models.Model): + owner = models.ForeignKey( + User, + blank=True, + null=True, + on_delete=models.SET_NULL, + verbose_name=_("owner"), + ) + + class Meta: + abstract = True + + +class MatchingModel(ModelWithOwner): + + MATCH_NONE = 0 + MATCH_ANY = 1 + MATCH_ALL = 2 + MATCH_LITERAL = 3 + MATCH_REGEX = 4 + MATCH_FUZZY = 5 + MATCH_AUTO = 6 + + MATCHING_ALGORITHMS = ( + (MATCH_NONE, _("None")), + (MATCH_ANY, _("Any word")), + (MATCH_ALL, _("All words")), + (MATCH_LITERAL, _("Exact match")), + (MATCH_REGEX, _("Regular expression")), + (MATCH_FUZZY, _("Fuzzy word")), + (MATCH_AUTO, _("Automatic")), + ) + + name = models.CharField(_("name"), max_length=128) + + match = models.CharField(_("match"), max_length=256, blank=True) + + matching_algorithm = models.PositiveIntegerField( + _("matching algorithm"), + choices=MATCHING_ALGORITHMS, + default=MATCH_ANY, + ) + + is_insensitive = models.BooleanField(_("is insensitive"), default=True) + + class Meta: + abstract = True + ordering = ("name",) + constraints = [ + models.UniqueConstraint( + fields=["name", "owner"], + name="%(app_label)s_%(class)s_unique_name_owner", + ), + models.UniqueConstraint( + name="%(app_label)s_%(class)s_name_uniq", + fields=["name"], + condition=models.Q(owner__isnull=True), + ), + ] + + def __str__(self): + return self.name + + +class Correspondent(MatchingModel): + class Meta(MatchingModel.Meta): + verbose_name = _("correspondent") + verbose_name_plural = _("correspondents") + + +class Tag(MatchingModel): + + color = models.CharField(_("color"), max_length=7, default="#a6cee3") + + is_inbox_tag = models.BooleanField( + _("is inbox tag"), + default=False, + help_text=_( + "Marks this tag as an inbox tag: All newly consumed " + "documents will be tagged with inbox tags.", + ), + ) + + class Meta(MatchingModel.Meta): + verbose_name = _("tag") + verbose_name_plural = _("tags") + + +class DocumentType(MatchingModel): + default_metadata = models.JSONField( + _("default_metadata"), + # Source: https://stackoverflow.com/a/47590145/5575610 + null=True, + blank=True, + help_text=_("Default JSON metadata"), + ) + + class Meta(MatchingModel.Meta): + verbose_name = _("document type") + verbose_name_plural = _("document types") + + +class StoragePath(MatchingModel): + path = models.CharField( + _("path"), + max_length=512, + ) + + class Meta(MatchingModel.Meta): + verbose_name = _("storage path") + verbose_name_plural = _("storage paths") + + +class Document(ModelWithOwner): + + STORAGE_TYPE_UNENCRYPTED = "unencrypted" + STORAGE_TYPE_GPG = "gpg" + STORAGE_TYPES = ( + (STORAGE_TYPE_UNENCRYPTED, _("Unencrypted")), + (STORAGE_TYPE_GPG, _("Encrypted with GNU Privacy Guard")), + ) + + correspondent = models.ForeignKey( + Correspondent, + blank=True, + null=True, + related_name="documents", + on_delete=models.SET_NULL, + verbose_name=_("correspondent"), + ) + + storage_path = 
models.ForeignKey( + StoragePath, + blank=True, + null=True, + related_name="documents", + on_delete=models.SET_NULL, + verbose_name=_("storage path"), + ) + + title = models.CharField(_("title"), max_length=128, blank=True, db_index=True) + + document_type = models.ForeignKey( + DocumentType, + blank=True, + null=True, + related_name="documents", + on_delete=models.SET_NULL, + verbose_name=_("document type"), + ) + + content = models.TextField( + _("content"), + blank=True, + help_text=_( + "The raw, text-only data of the document. This field is " + "primarily used for searching.", + ), + ) + + mime_type = models.CharField(_("mime type"), max_length=256, editable=False) + + tags = models.ManyToManyField( + Tag, + related_name="documents", + blank=True, + verbose_name=_("tags"), + ) + + checksum = models.CharField( + _("checksum"), + max_length=32, + editable=False, + unique=True, + help_text=_("The checksum of the original document."), + ) + + archive_checksum = models.CharField( + _("archive checksum"), + max_length=32, + editable=False, + blank=True, + null=True, + help_text=_("The checksum of the archived document."), + ) + + created = models.DateTimeField(_("created"), default=timezone.now, db_index=True) + + modified = models.DateTimeField( + _("modified"), + auto_now=True, + editable=False, + db_index=True, + ) + + storage_type = models.CharField( + _("storage type"), + max_length=11, + choices=STORAGE_TYPES, + default=STORAGE_TYPE_UNENCRYPTED, + editable=False, + ) + + added = models.DateTimeField( + _("added"), + default=timezone.now, + editable=False, + db_index=True, + ) + + filename = models.FilePathField( + _("filename"), + max_length=1024, + editable=False, + default=None, + unique=True, + null=True, + help_text=_("Current filename in storage"), + ) + + archive_filename = models.FilePathField( + _("archive filename"), + max_length=1024, + editable=False, + default=None, + unique=True, + null=True, + help_text=_("Current archive filename in storage"), + ) + + original_filename = models.CharField( + _("original filename"), + max_length=1024, + editable=False, + default=None, + unique=False, + null=True, + help_text=_("The original name of the file when it was uploaded"), + ) + + ARCHIVE_SERIAL_NUMBER_MIN: Final[int] = 0 + ARCHIVE_SERIAL_NUMBER_MAX: Final[int] = 0xFF_FF_FF_FF + + archive_serial_number = models.PositiveIntegerField( + _("archive serial number"), + blank=True, + null=True, + unique=True, + db_index=True, + validators=[ + MaxValueValidator(ARCHIVE_SERIAL_NUMBER_MAX), + MinValueValidator(ARCHIVE_SERIAL_NUMBER_MIN), + ], + help_text=_( + "The position of this document in your physical document archive.", + ), + ) + + class Meta: + ordering = ("-created",) + verbose_name = _("document") + verbose_name_plural = _("documents") + + def __str__(self) -> str: + + # Convert UTC database time to local time + created = datetime.date.isoformat(timezone.localdate(self.created)) + + res = f"{created}" + + if self.correspondent: + res += f" {self.correspondent}" + if self.title: + res += f" {self.title}" + return res + + @property + def source_path(self) -> Path: + if self.filename: + fname = str(self.filename) + else: + fname = f"{self.pk:07}{self.file_type}" + if self.storage_type == self.STORAGE_TYPE_GPG: + fname += ".gpg" # pragma: no cover + + return (settings.ORIGINALS_DIR / Path(fname)).resolve() + + @property + def source_file(self): + return open(self.source_path, "rb") + + @property + def has_archive_version(self) -> bool: + return self.archive_filename is not None + + 
@property + def archive_path(self) -> Optional[Path]: + if self.has_archive_version: + return (settings.ARCHIVE_DIR / Path(str(self.archive_filename))).resolve() + else: + return None + + @property + def archive_file(self): + return open(self.archive_path, "rb") + + def get_public_filename(self, archive=False, counter=0, suffix=None) -> str: + """ + Returns a sanitized filename for the document, not including any paths. + """ + result = str(self) + + if counter: + result += f"_{counter:02}" + + if suffix: + result += suffix + + if archive: + result += ".pdf" + else: + result += self.file_type + + return pathvalidate.sanitize_filename(result, replacement_text="-") + + @property + def file_type(self): + return get_default_file_extension(self.mime_type) + + @property + def thumbnail_path(self) -> Path: + webp_file_name = f"{self.pk:07}.webp" + if self.storage_type == self.STORAGE_TYPE_GPG: + webp_file_name += ".gpg" + + webp_file_path = settings.THUMBNAIL_DIR / Path(webp_file_name) + + return webp_file_path.resolve() + + @property + def thumbnail_file(self): + return open(self.thumbnail_path, "rb") + + @property + def created_date(self): + return timezone.localdate(self.created) + + +class Log(models.Model): + + LEVELS = ( + (logging.DEBUG, _("debug")), + (logging.INFO, _("information")), + (logging.WARNING, _("warning")), + (logging.ERROR, _("error")), + (logging.CRITICAL, _("critical")), + ) + + group = models.UUIDField(_("group"), blank=True, null=True) + + message = models.TextField(_("message")) + + level = models.PositiveIntegerField( + _("level"), + choices=LEVELS, + default=logging.INFO, + ) + + created = models.DateTimeField(_("created"), auto_now_add=True) + + class Meta: + ordering = ("-created",) + verbose_name = _("log") + verbose_name_plural = _("logs") + + def __str__(self): + return self.message + + +class SavedView(ModelWithOwner): + class Meta: + + ordering = ("name",) + verbose_name = _("saved view") + verbose_name_plural = _("saved views") + + name = models.CharField(_("name"), max_length=128) + + show_on_dashboard = models.BooleanField( + _("show on dashboard"), + ) + show_in_sidebar = models.BooleanField( + _("show in sidebar"), + ) + + sort_field = models.CharField( + _("sort field"), + max_length=128, + null=True, + blank=True, + ) + sort_reverse = models.BooleanField(_("sort reverse"), default=False) + + +class SavedViewFilterRule(models.Model): + RULE_TYPES = [ + (0, _("title contains")), + (1, _("content contains")), + (2, _("ASN is")), + (3, _("correspondent is")), + (4, _("document type is")), + (5, _("is in inbox")), + (6, _("has tag")), + (7, _("has any tag")), + (8, _("created before")), + (9, _("created after")), + (10, _("created year is")), + (11, _("created month is")), + (12, _("created day is")), + (13, _("added before")), + (14, _("added after")), + (15, _("modified before")), + (16, _("modified after")), + (17, _("does not have tag")), + (18, _("does not have ASN")), + (19, _("title or content contains")), + (20, _("fulltext query")), + (21, _("more like this")), + (22, _("has tags in")), + (23, _("ASN greater than")), + (24, _("ASN less than")), + (25, _("storage path is")), + (26, _("has correspondent in")), + (27, _("does not have correspondent in")), + (28, _("has document type in")), + (29, _("does not have document type in")), + (30, _("has storage path in")), + (31, _("does not have storage path in")), + ] + + saved_view = models.ForeignKey( + SavedView, + on_delete=models.CASCADE, + related_name="filter_rules", + verbose_name=_("saved view"), + 
) + + rule_type = models.PositiveIntegerField(_("rule type"), choices=RULE_TYPES) + + value = models.CharField(_("value"), max_length=255, blank=True, null=True) + + class Meta: + verbose_name = _("filter rule") + verbose_name_plural = _("filter rules") + + def __str__(self) -> str: + return f"SavedViewFilterRule: {self.rule_type} : {self.value}" + + +# TODO: why is this in the models file? +# TODO: how about, what is this and where is it documented? +# It appears to parsing JSON from an environment variable to get a title and date from +# the filename, if possible, as a higher priority than either document filename or +# content parsing +class FileInfo: + + REGEXES = OrderedDict( + [ + ( + "created-title", + re.compile( + r"^(?P<created>\d{8}(\d{6})?Z) - (?P<title>.*)$", + flags=re.IGNORECASE, + ), + ), + ("title", re.compile(r"(?P<title>.*)$", flags=re.IGNORECASE)), + ], + ) + + def __init__( + self, + created=None, + correspondent=None, + title=None, + tags=(), + extension=None, + ): + + self.created = created + self.title = title + self.extension = extension + self.correspondent = correspondent + self.tags = tags + + @classmethod + def _get_created(cls, created): + try: + return dateutil.parser.parse(f"{created[:-1]:0<14}Z") + except ValueError: + return None + + @classmethod + def _get_title(cls, title): + return title + + @classmethod + def _mangle_property(cls, properties, name): + if name in properties: + properties[name] = getattr(cls, f"_get_{name}")(properties[name]) + + @classmethod + def from_filename(cls, filename) -> "FileInfo": + # Mutate filename in-place before parsing its components + # by applying at most one of the configured transformations. + for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: + (filename, count) = pattern.subn(repl, filename) + if count: + break + + # do this after the transforms so that the transforms can do whatever + # with the file extension. + filename_no_ext = os.path.splitext(filename)[0] + + if filename_no_ext == filename and filename.startswith("."): + # This is a very special case where there is no text before the + # file type. + # TODO: this should be handled better. The ext is not removed + # because usually, files like '.pdf' are just hidden files + # with the name pdf, but in our case, its more likely that + # there's just no name to begin with. + filename = "" + # This isn't too bad either, since we'll just not match anything + # and return an empty title. TODO: actually, this is kinda bad. + else: + filename = filename_no_ext + + # Parse filename components. 
+ for regex in cls.REGEXES.values(): + m = regex.match(filename) + if m: + properties = m.groupdict() + cls._mangle_property(properties, "created") + cls._mangle_property(properties, "title") + return cls(**properties) + + +# Extending User Model Using a One-To-One Link +class UiSettings(models.Model): + + user = models.OneToOneField( + User, + on_delete=models.CASCADE, + related_name="ui_settings", + ) + settings = models.JSONField(null=True) + + def __str__(self): + return self.user.username + + +class PaperlessTask(models.Model): + task_id = models.CharField( + max_length=255, + unique=True, + verbose_name=_("Task ID"), + help_text=_("Celery ID for the Task that was run"), + ) + + acknowledged = models.BooleanField( + default=False, + verbose_name=_("Acknowledged"), + help_text=_("If the task is acknowledged via the frontend or API"), + ) + + task_file_name = models.CharField( + null=True, + max_length=255, + verbose_name=_("Task Filename"), + help_text=_("Name of the file which the Task was run for"), + ) + + task_name = models.CharField( + null=True, + max_length=255, + verbose_name=_("Task Name"), + help_text=_("Name of the Task which was run"), + ) + + status = models.CharField( + max_length=30, + default=states.PENDING, + choices=TASK_STATE_CHOICES, + verbose_name=_("Task State"), + help_text=_("Current state of the task being run"), + ) + date_created = models.DateTimeField( + null=True, + default=timezone.now, + verbose_name=_("Created DateTime"), + help_text=_("Datetime field when the task result was created in UTC"), + ) + date_started = models.DateTimeField( + null=True, + default=None, + verbose_name=_("Started DateTime"), + help_text=_("Datetime field when the task was started in UTC"), + ) + date_done = models.DateTimeField( + null=True, + default=None, + verbose_name=_("Completed DateTime"), + help_text=_("Datetime field when the task was completed in UTC"), + ) + result = models.TextField( + null=True, + default=None, + verbose_name=_("Result Data"), + help_text=_( + "The data returned by the task", + ), + ) + + def __str__(self) -> str: + return f"Task {self.task_id}" + + +class Note(models.Model): + note = models.TextField( + _("content"), + blank=True, + help_text=_("Note for the document"), + ) + + created = models.DateTimeField( + _("created"), + default=timezone.now, + db_index=True, + ) + + document = models.ForeignKey( + Document, + blank=True, + null=True, + related_name="notes", + on_delete=models.CASCADE, + verbose_name=_("document"), + ) + + user = models.ForeignKey( + User, + blank=True, + null=True, + related_name="notes", + on_delete=models.SET_NULL, + verbose_name=_("user"), + ) + + class Meta: + ordering = ("created",) + verbose_name = _("note") + verbose_name_plural = _("notes") + + def __str__(self): + return self.note + +class Metadata(models.Model): + data = models.JSONField( + _("data"), + blank=True, + help_text=_("JSON metadata"), + ) + + created = models.DateTimeField( + _("created"), + default=timezone.now, + db_index=True, + ) + + document = models.ForeignKey( + Document, + blank=True, + null=True, + related_name="metadatas", + on_delete=models.CASCADE, + verbose_name=_("document"), + ) + + user = models.ForeignKey( + User, + blank=True, + null=True, + related_name="metadatas", + on_delete=models.SET_NULL, + verbose_name=_("user"), + ) + + class Meta: + ordering = ("created",) + verbose_name = _("metadata") + verbose_name_plural = _("metadatas") + + def __str__(self): return str(self.data) \ No newline at end of file diff --git 
a/src/documents/serialisers.py b/src/documents/serialisers.py index 788dc2531..1217410a0 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1,948 +1,948 @@ -import datetime -import math -import re - -from celery import states - -try: - import zoneinfo -except ImportError: - from backports import zoneinfo -import magic -from django.conf import settings -from django.utils.text import slugify -from django.utils.translation import gettext as _ -from rest_framework import serializers -from rest_framework.fields import SerializerMethodField - -from . import bulk_edit -from .models import Correspondent -from .models import Document -from .models import DocumentType -from .models import MatchingModel -from .models import SavedView -from .models import SavedViewFilterRule -from .models import StoragePath -from .models import Tag -from .models import UiSettings -from .models import PaperlessTask -from .parsers import is_mime_type_supported - -from guardian.shortcuts import get_users_with_perms - -from django.contrib.auth.models import User -from django.contrib.auth.models import Group - -from documents.permissions import get_groups_with_only_permission -from documents.permissions import set_permissions_for_object - - -# https://www.django-rest-framework.org/api-guide/serializers/#example -class DynamicFieldsModelSerializer(serializers.ModelSerializer): - """ - A ModelSerializer that takes an additional `fields` argument that - controls which fields should be displayed. - """ - - def __init__(self, *args, **kwargs): - # Don't pass the 'fields' arg up to the superclass - fields = kwargs.pop("fields", None) - - # Instantiate the superclass normally - super().__init__(*args, **kwargs) - - if fields is not None: - # Drop any fields that are not specified in the `fields` argument. 
- allowed = set(fields) - existing = set(self.fields) - for field_name in existing - allowed: - self.fields.pop(field_name) - - -class MatchingModelSerializer(serializers.ModelSerializer): - - document_count = serializers.IntegerField(read_only=True) - - def get_slug(self, obj): - return slugify(obj.name) - - slug = SerializerMethodField() - - def validate(self, data): - # see https://github.com/encode/django-rest-framework/issues/7173 - name = data["name"] if "name" in data else self.instance.name - owner = ( - data["owner"] - if "owner" in data - else self.user - if hasattr(self, "user") - else None - ) - pk = self.instance.pk if hasattr(self.instance, "pk") else None - if ("name" in data or "owner" in data) and self.Meta.model.objects.filter( - name=name, - owner=owner, - ).exclude(pk=pk).exists(): - raise serializers.ValidationError( - {"error": "Object violates owner / name unique constraint"}, - ) - return data - - def validate_match(self, match): - if ( - "matching_algorithm" in self.initial_data - and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX - ): - try: - re.compile(match) - except re.error as e: - raise serializers.ValidationError( - _("Invalid regular expression: %(error)s") % {"error": str(e.msg)}, - ) - return match - - -class SetPermissionsMixin: - def _validate_user_ids(self, user_ids): - users = User.objects.none() - if user_ids is not None: - users = User.objects.filter(id__in=user_ids) - if not users.count() == len(user_ids): - raise serializers.ValidationError( - "Some users in don't exist or were specified twice.", - ) - return users - - def _validate_group_ids(self, group_ids): - groups = Group.objects.none() - if group_ids is not None: - groups = Group.objects.filter(id__in=group_ids) - if not groups.count() == len(group_ids): - raise serializers.ValidationError( - "Some groups in don't exist or were specified twice.", - ) - return groups - - def validate_set_permissions(self, set_permissions=None): - permissions_dict = { - "view": { - "users": User.objects.none(), - "groups": Group.objects.none(), - }, - "change": { - "users": User.objects.none(), - "groups": Group.objects.none(), - }, - } - if set_permissions is not None: - for action in permissions_dict: - if action in set_permissions: - users = set_permissions[action]["users"] - permissions_dict[action]["users"] = self._validate_user_ids(users) - groups = set_permissions[action]["groups"] - permissions_dict[action]["groups"] = self._validate_group_ids( - groups, - ) - return permissions_dict - - def _set_permissions(self, permissions, object): - set_permissions_for_object(permissions, object) - - -class OwnedObjectSerializer(serializers.ModelSerializer, SetPermissionsMixin): - def __init__(self, *args, **kwargs): - self.user = kwargs.pop("user", None) - super().__init__(*args, **kwargs) - - def get_permissions(self, obj): - view_codename = f"view_{obj.__class__.__name__.lower()}" - change_codename = f"change_{obj.__class__.__name__.lower()}" - return { - "view": { - "users": get_users_with_perms( - obj, - only_with_perms_in=[view_codename], - with_group_users=False, - ).values_list("id", flat=True), - "groups": get_groups_with_only_permission( - obj, - codename=view_codename, - ).values_list("id", flat=True), - }, - "change": { - "users": get_users_with_perms( - obj, - only_with_perms_in=[change_codename], - with_group_users=False, - ).values_list("id", flat=True), - "groups": get_groups_with_only_permission( - obj, - codename=change_codename, - ).values_list("id", flat=True), - }, - } - 
- permissions = SerializerMethodField(read_only=True) - - set_permissions = serializers.DictField( - label="Set permissions", - allow_empty=True, - required=False, - write_only=True, - ) - # other methods in mixin - - def create(self, validated_data): - if self.user and ( - "owner" not in validated_data or validated_data["owner"] is None - ): - validated_data["owner"] = self.user - permissions = None - if "set_permissions" in validated_data: - permissions = validated_data.pop("set_permissions") - instance = super().create(validated_data) - if permissions is not None: - self._set_permissions(permissions, instance) - return instance - - def update(self, instance, validated_data): - if "set_permissions" in validated_data: - self._set_permissions(validated_data["set_permissions"], instance) - if "owner" in validated_data and "name" in self.Meta.fields: - name = validated_data["name"] if "name" in validated_data else instance.name - not_unique = ( - self.Meta.model.objects.exclude(pk=instance.pk) - .filter(owner=validated_data["owner"], name=name) - .exists() - ) - if not_unique: - raise serializers.ValidationError( - {"error": "Object violates owner / name unique constraint"}, - ) - return super().update(instance, validated_data) - - -class CorrespondentSerializer(MatchingModelSerializer, OwnedObjectSerializer): - - last_correspondence = serializers.DateTimeField(read_only=True) - - class Meta: - model = Correspondent - fields = ( - "id", - "slug", - "name", - "match", - "matching_algorithm", - "is_insensitive", - "document_count", - "last_correspondence", - "owner", - "permissions", - "set_permissions", - ) - - -class DocumentTypeSerializer(MatchingModelSerializer, OwnedObjectSerializer): - class Meta: - model = DocumentType - fields = ( - "id", - "slug", - "name", - "match", - "matching_algorithm", - "is_insensitive", - "document_count", - "owner", - "permissions", - "set_permissions", - "default_metadata", - ) - - -class ColorField(serializers.Field): - - COLOURS = ( - (1, "#a6cee3"), - (2, "#1f78b4"), - (3, "#b2df8a"), - (4, "#33a02c"), - (5, "#fb9a99"), - (6, "#e31a1c"), - (7, "#fdbf6f"), - (8, "#ff7f00"), - (9, "#cab2d6"), - (10, "#6a3d9a"), - (11, "#b15928"), - (12, "#000000"), - (13, "#cccccc"), - ) - - def to_internal_value(self, data): - for id, color in self.COLOURS: - if id == data: - return color - raise serializers.ValidationError - - def to_representation(self, value): - for id, color in self.COLOURS: - if color == value: - return id - return 1 - - -class TagSerializerVersion1(MatchingModelSerializer, OwnedObjectSerializer): - - colour = ColorField(source="color", default="#a6cee3") - - class Meta: - model = Tag - fields = ( - "id", - "slug", - "name", - "colour", - "match", - "matching_algorithm", - "is_insensitive", - "is_inbox_tag", - "document_count", - "owner", - "permissions", - "set_permissions", - ) - - -class TagSerializer(MatchingModelSerializer, OwnedObjectSerializer): - def get_text_color(self, obj): - try: - h = obj.color.lstrip("#") - rgb = tuple(int(h[i : i + 2], 16) / 256 for i in (0, 2, 4)) - luminance = math.sqrt( - 0.299 * math.pow(rgb[0], 2) - + 0.587 * math.pow(rgb[1], 2) - + 0.114 * math.pow(rgb[2], 2), - ) - return "#ffffff" if luminance < 0.53 else "#000000" - except ValueError: - return "#000000" - - text_color = serializers.SerializerMethodField() - - class Meta: - model = Tag - fields = ( - "id", - "slug", - "name", - "color", - "text_color", - "match", - "matching_algorithm", - "is_insensitive", - "is_inbox_tag", - "document_count", - "owner", - 
"permissions", - "set_permissions", - ) - - def validate_color(self, color): - regex = r"#[0-9a-fA-F]{6}" - if not re.match(regex, color): - raise serializers.ValidationError(_("Invalid color.")) - return color - - -class CorrespondentField(serializers.PrimaryKeyRelatedField): - def get_queryset(self): - return Correspondent.objects.all() - - -class TagsField(serializers.PrimaryKeyRelatedField): - def get_queryset(self): - return Tag.objects.all() - - -class DocumentTypeField(serializers.PrimaryKeyRelatedField): - def get_queryset(self): - return DocumentType.objects.all() - - -class StoragePathField(serializers.PrimaryKeyRelatedField): - def get_queryset(self): - return StoragePath.objects.all() - - -class DocumentSerializer(OwnedObjectSerializer, DynamicFieldsModelSerializer): - - correspondent = CorrespondentField(allow_null=True) - tags = TagsField(many=True) - document_type = DocumentTypeField(allow_null=True) - storage_path = StoragePathField(allow_null=True) - - original_file_name = SerializerMethodField() - archived_file_name = SerializerMethodField() - created_date = serializers.DateField(required=False) - - owner = serializers.PrimaryKeyRelatedField( - queryset=User.objects.all(), - required=False, - allow_null=True, - ) - - def get_original_file_name(self, obj): - return obj.get_public_filename() - - def get_archived_file_name(self, obj): - if obj.has_archive_version: - return obj.get_public_filename(archive=True) - else: - return None - - def to_representation(self, instance): - doc = super().to_representation(instance) - if self.truncate_content: - doc["content"] = doc.get("content")[0:550] - return doc - - def update(self, instance, validated_data): - if "created_date" in validated_data and "created" not in validated_data: - new_datetime = datetime.datetime.combine( - validated_data.get("created_date"), - datetime.time(0, 0, 0, 0, zoneinfo.ZoneInfo(settings.TIME_ZONE)), - ) - instance.created = new_datetime - instance.save() - if "created_date" in validated_data: - validated_data.pop("created_date") - super().update(instance, validated_data) - return instance - - def __init__(self, *args, **kwargs): - self.truncate_content = kwargs.pop("truncate_content", False) - - super().__init__(*args, **kwargs) - - class Meta: - model = Document - depth = 1 - fields = ( - "id", - "correspondent", - "document_type", - "storage_path", - "title", - "content", - "tags", - "created", - "created_date", - "modified", - "added", - "archive_serial_number", - "original_file_name", - "archived_file_name", - "owner", - "permissions", - "set_permissions", - "notes", - "metadatas", - ) - - -class SavedViewFilterRuleSerializer(serializers.ModelSerializer): - class Meta: - model = SavedViewFilterRule - fields = ["rule_type", "value"] - - -class SavedViewSerializer(OwnedObjectSerializer): - - filter_rules = SavedViewFilterRuleSerializer(many=True) - - class Meta: - model = SavedView - depth = 1 - fields = [ - "id", - "name", - "show_on_dashboard", - "show_in_sidebar", - "sort_field", - "sort_reverse", - "filter_rules", - "owner", - "permissions", - "set_permissions", - ] - - def update(self, instance, validated_data): - if "filter_rules" in validated_data: - rules_data = validated_data.pop("filter_rules") - else: - rules_data = None - if "user" in validated_data: - # backwards compatibility - validated_data["owner"] = validated_data.pop("user") - super().update(instance, validated_data) - if rules_data is not None: - SavedViewFilterRule.objects.filter(saved_view=instance).delete() - for rule_data in 
rules_data: - SavedViewFilterRule.objects.create(saved_view=instance, **rule_data) - return instance - - def create(self, validated_data): - rules_data = validated_data.pop("filter_rules") - if "user" in validated_data: - # backwards compatibility - validated_data["owner"] = validated_data.pop("user") - saved_view = SavedView.objects.create(**validated_data) - for rule_data in rules_data: - SavedViewFilterRule.objects.create(saved_view=saved_view, **rule_data) - return saved_view - - -class DocumentListSerializer(serializers.Serializer): - - documents = serializers.ListField( - required=True, - label="Documents", - write_only=True, - child=serializers.IntegerField(), - ) - - def _validate_document_id_list(self, documents, name="documents"): - if not type(documents) == list: - raise serializers.ValidationError(f"{name} must be a list") - if not all(type(i) == int for i in documents): - raise serializers.ValidationError(f"{name} must be a list of integers") - count = Document.objects.filter(id__in=documents).count() - if not count == len(documents): - raise serializers.ValidationError( - f"Some documents in {name} don't exist or were specified twice.", - ) - - def validate_documents(self, documents): - self._validate_document_id_list(documents) - return documents - - -class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin): - - method = serializers.ChoiceField( - choices=[ - "set_correspondent", - "set_document_type", - "set_storage_path", - "add_tag", - "remove_tag", - "modify_tags", - "delete", - "redo_ocr", - "set_permissions", - ], - label="Method", - write_only=True, - ) - - parameters = serializers.DictField(allow_empty=True) - - def _validate_tag_id_list(self, tags, name="tags"): - if not type(tags) == list: - raise serializers.ValidationError(f"{name} must be a list") - if not all(type(i) == int for i in tags): - raise serializers.ValidationError(f"{name} must be a list of integers") - count = Tag.objects.filter(id__in=tags).count() - if not count == len(tags): - raise serializers.ValidationError( - f"Some tags in {name} don't exist or were specified twice.", - ) - - def validate_method(self, method): - if method == "set_correspondent": - return bulk_edit.set_correspondent - elif method == "set_document_type": - return bulk_edit.set_document_type - elif method == "set_storage_path": - return bulk_edit.set_storage_path - elif method == "add_tag": - return bulk_edit.add_tag - elif method == "remove_tag": - return bulk_edit.remove_tag - elif method == "modify_tags": - return bulk_edit.modify_tags - elif method == "delete": - return bulk_edit.delete - elif method == "redo_ocr": - return bulk_edit.redo_ocr - elif method == "set_permissions": - return bulk_edit.set_permissions - else: - raise serializers.ValidationError("Unsupported method.") - - def _validate_parameters_tags(self, parameters): - if "tag" in parameters: - tag_id = parameters["tag"] - try: - Tag.objects.get(id=tag_id) - except Tag.DoesNotExist: - raise serializers.ValidationError("Tag does not exist") - else: - raise serializers.ValidationError("tag not specified") - - def _validate_parameters_document_type(self, parameters): - if "document_type" in parameters: - document_type_id = parameters["document_type"] - if document_type_id is None: - # None is ok - return - try: - DocumentType.objects.get(id=document_type_id) - except DocumentType.DoesNotExist: - raise serializers.ValidationError("Document type does not exist") - else: - raise serializers.ValidationError("document_type not specified") - - def 
_validate_parameters_correspondent(self, parameters): - if "correspondent" in parameters: - correspondent_id = parameters["correspondent"] - if correspondent_id is None: - return - try: - Correspondent.objects.get(id=correspondent_id) - except Correspondent.DoesNotExist: - raise serializers.ValidationError("Correspondent does not exist") - else: - raise serializers.ValidationError("correspondent not specified") - - def _validate_storage_path(self, parameters): - if "storage_path" in parameters: - storage_path_id = parameters["storage_path"] - if storage_path_id is None: - return - try: - StoragePath.objects.get(id=storage_path_id) - except StoragePath.DoesNotExist: - raise serializers.ValidationError( - "Storage path does not exist", - ) - else: - raise serializers.ValidationError("storage path not specified") - - def _validate_parameters_modify_tags(self, parameters): - if "add_tags" in parameters: - self._validate_tag_id_list(parameters["add_tags"], "add_tags") - else: - raise serializers.ValidationError("add_tags not specified") - - if "remove_tags" in parameters: - self._validate_tag_id_list(parameters["remove_tags"], "remove_tags") - else: - raise serializers.ValidationError("remove_tags not specified") - - def _validate_owner(self, owner): - ownerUser = User.objects.get(pk=owner) - if ownerUser is None: - raise serializers.ValidationError("Specified owner cannot be found") - return ownerUser - - def _validate_parameters_set_permissions(self, parameters): - parameters["set_permissions"] = self.validate_set_permissions( - parameters["set_permissions"], - ) - if "owner" in parameters and parameters["owner"] is not None: - self._validate_owner(parameters["owner"]) - - def validate(self, attrs): - - method = attrs["method"] - parameters = attrs["parameters"] - - if method == bulk_edit.set_correspondent: - self._validate_parameters_correspondent(parameters) - elif method == bulk_edit.set_document_type: - self._validate_parameters_document_type(parameters) - elif method == bulk_edit.add_tag or method == bulk_edit.remove_tag: - self._validate_parameters_tags(parameters) - elif method == bulk_edit.modify_tags: - self._validate_parameters_modify_tags(parameters) - elif method == bulk_edit.set_storage_path: - self._validate_storage_path(parameters) - elif method == bulk_edit.set_permissions: - self._validate_parameters_set_permissions(parameters) - - return attrs - - -class PostDocumentSerializer(serializers.Serializer): - - created = serializers.DateTimeField( - label="Created", - allow_null=True, - write_only=True, - required=False, - ) - - document = serializers.FileField( - label="Document", - write_only=True, - ) - - title = serializers.CharField( - label="Title", - write_only=True, - required=False, - ) - - correspondent = serializers.PrimaryKeyRelatedField( - queryset=Correspondent.objects.all(), - label="Correspondent", - allow_null=True, - write_only=True, - required=False, - ) - - document_type = serializers.PrimaryKeyRelatedField( - queryset=DocumentType.objects.all(), - label="Document type", - allow_null=True, - write_only=True, - required=False, - ) - - tags = serializers.PrimaryKeyRelatedField( - many=True, - queryset=Tag.objects.all(), - label="Tags", - write_only=True, - required=False, - ) - - archive_serial_number = serializers.IntegerField( - label="ASN", - write_only=True, - required=False, - min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN, - max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX, - ) - - storage_path_id = serializers.IntegerField( - label="Storage path ID", - 
allow_null=True, - write_only=True, - required=False, - ) - - full_path = serializers.CharField( - label="Full Path", - allow_null=True, - write_only=True, - required=False, - ) - - def validate_document(self, document): - document_data = document.file.read() - mime_type = magic.from_buffer(document_data, mime=True) - - if not is_mime_type_supported(mime_type): - raise serializers.ValidationError( - _("File type %(type)s not supported") % {"type": mime_type}, - ) - - return document.name, document_data - - def validate_correspondent(self, correspondent): - if correspondent: - return correspondent.id - else: - return None - - def validate_document_type(self, document_type): - if document_type: - return document_type.id - else: - return None - - def validate_tags(self, tags): - if tags: - return [tag.id for tag in tags] - else: - return None - - -class BulkDownloadSerializer(DocumentListSerializer): - - content = serializers.ChoiceField( - choices=["archive", "originals", "both"], - default="archive", - ) - - compression = serializers.ChoiceField( - choices=["none", "deflated", "bzip2", "lzma"], - default="none", - ) - - follow_formatting = serializers.BooleanField( - default=False, - ) - - def validate_compression(self, compression): - import zipfile - - return { - "none": zipfile.ZIP_STORED, - "deflated": zipfile.ZIP_DEFLATED, - "bzip2": zipfile.ZIP_BZIP2, - "lzma": zipfile.ZIP_LZMA, - }[compression] - - -class StoragePathSerializer(MatchingModelSerializer, OwnedObjectSerializer): - class Meta: - model = StoragePath - fields = ( - "id", - "slug", - "name", - "path", - "match", - "matching_algorithm", - "is_insensitive", - "document_count", - "owner", - "permissions", - "set_permissions", - ) - - def validate_path(self, path): - try: - path.format( - title="title", - correspondent="correspondent", - document_type="document_type", - created="created", - created_year="created_year", - created_year_short="created_year_short", - created_month="created_month", - created_month_name="created_month_name", - created_month_name_short="created_month_name_short", - created_day="created_day", - added="added", - added_year="added_year", - added_year_short="added_year_short", - added_month="added_month", - added_month_name="added_month_name", - added_month_name_short="added_month_name_short", - added_day="added_day", - asn="asn", - tags="tags", - tag_list="tag_list", - owner_username="someone", - original_name="testfile", - ) - - except KeyError as err: - raise serializers.ValidationError(_("Invalid variable detected.")) from err - - return path - - def update(self, instance, validated_data): - """ - When a storage path is updated, see if documents - using it require a rename/move - """ - doc_ids = [doc.id for doc in instance.documents.all()] - if len(doc_ids): - bulk_edit.bulk_update_documents.delay(doc_ids) - - return super().update(instance, validated_data) - - -class UiSettingsViewSerializer(serializers.ModelSerializer): - class Meta: - model = UiSettings - depth = 1 - fields = [ - "id", - "settings", - ] - - def validate_settings(self, settings): - # we never save update checking backend setting - if "update_checking" in settings: - try: - settings["update_checking"].pop("backend_setting") - except KeyError: - pass - return settings - - def create(self, validated_data): - ui_settings = UiSettings.objects.update_or_create( - user=validated_data.get("user"), - defaults={"settings": validated_data.get("settings", None)}, - ) - return ui_settings - - -class 
TasksViewSerializer(serializers.ModelSerializer): - class Meta: - model = PaperlessTask - depth = 1 - fields = ( - "id", - "task_id", - "task_file_name", - "date_created", - "date_done", - "type", - "status", - "result", - "acknowledged", - "related_document", - ) - - type = serializers.SerializerMethodField() - - def get_type(self, obj): - # just file tasks, for now - return "file" - - related_document = serializers.SerializerMethodField() - related_doc_re = re.compile(r"New document id (\d+) created") - - def get_related_document(self, obj): - result = None - if obj.status is not None and obj.status == states.SUCCESS: - try: - result = self.related_doc_re.search(obj.result).group(1) - except Exception: - pass - - return result - - -class AcknowledgeTasksViewSerializer(serializers.Serializer): - - tasks = serializers.ListField( - required=True, - label="Tasks", - write_only=True, - child=serializers.IntegerField(), - ) - - def _validate_task_id_list(self, tasks, name="tasks"): - pass - if not type(tasks) == list: - raise serializers.ValidationError(f"{name} must be a list") - if not all(type(i) == int for i in tasks): - raise serializers.ValidationError(f"{name} must be a list of integers") - count = PaperlessTask.objects.filter(id__in=tasks).count() - if not count == len(tasks): - raise serializers.ValidationError( - f"Some tasks in {name} don't exist or were specified twice.", - ) - - def validate_tasks(self, tasks): - self._validate_task_id_list(tasks) - return tasks +import datetime +import math +import re + +from celery import states + +try: + import zoneinfo +except ImportError: + from backports import zoneinfo +import magic +from django.conf import settings +from django.utils.text import slugify +from django.utils.translation import gettext as _ +from rest_framework import serializers +from rest_framework.fields import SerializerMethodField + +from . import bulk_edit +from .models import Correspondent +from .models import Document +from .models import DocumentType +from .models import MatchingModel +from .models import SavedView +from .models import SavedViewFilterRule +from .models import StoragePath +from .models import Tag +from .models import UiSettings +from .models import PaperlessTask +from .parsers import is_mime_type_supported + +from guardian.shortcuts import get_users_with_perms + +from django.contrib.auth.models import User +from django.contrib.auth.models import Group + +from documents.permissions import get_groups_with_only_permission +from documents.permissions import set_permissions_for_object + + +# https://www.django-rest-framework.org/api-guide/serializers/#example +class DynamicFieldsModelSerializer(serializers.ModelSerializer): + """ + A ModelSerializer that takes an additional `fields` argument that + controls which fields should be displayed. + """ + + def __init__(self, *args, **kwargs): + # Don't pass the 'fields' arg up to the superclass + fields = kwargs.pop("fields", None) + + # Instantiate the superclass normally + super().__init__(*args, **kwargs) + + if fields is not None: + # Drop any fields that are not specified in the `fields` argument. 
+ allowed = set(fields) + existing = set(self.fields) + for field_name in existing - allowed: + self.fields.pop(field_name) + + +class MatchingModelSerializer(serializers.ModelSerializer): + + document_count = serializers.IntegerField(read_only=True) + + def get_slug(self, obj): + return slugify(obj.name) + + slug = SerializerMethodField() + + def validate(self, data): + # see https://github.com/encode/django-rest-framework/issues/7173 + name = data["name"] if "name" in data else self.instance.name + owner = ( + data["owner"] + if "owner" in data + else self.user + if hasattr(self, "user") + else None + ) + pk = self.instance.pk if hasattr(self.instance, "pk") else None + if ("name" in data or "owner" in data) and self.Meta.model.objects.filter( + name=name, + owner=owner, + ).exclude(pk=pk).exists(): + raise serializers.ValidationError( + {"error": "Object violates owner / name unique constraint"}, + ) + return data + + def validate_match(self, match): + if ( + "matching_algorithm" in self.initial_data + and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX + ): + try: + re.compile(match) + except re.error as e: + raise serializers.ValidationError( + _("Invalid regular expression: %(error)s") % {"error": str(e.msg)}, + ) + return match + + +class SetPermissionsMixin: + def _validate_user_ids(self, user_ids): + users = User.objects.none() + if user_ids is not None: + users = User.objects.filter(id__in=user_ids) + if not users.count() == len(user_ids): + raise serializers.ValidationError( + "Some users in don't exist or were specified twice.", + ) + return users + + def _validate_group_ids(self, group_ids): + groups = Group.objects.none() + if group_ids is not None: + groups = Group.objects.filter(id__in=group_ids) + if not groups.count() == len(group_ids): + raise serializers.ValidationError( + "Some groups in don't exist or were specified twice.", + ) + return groups + + def validate_set_permissions(self, set_permissions=None): + permissions_dict = { + "view": { + "users": User.objects.none(), + "groups": Group.objects.none(), + }, + "change": { + "users": User.objects.none(), + "groups": Group.objects.none(), + }, + } + if set_permissions is not None: + for action in permissions_dict: + if action in set_permissions: + users = set_permissions[action]["users"] + permissions_dict[action]["users"] = self._validate_user_ids(users) + groups = set_permissions[action]["groups"] + permissions_dict[action]["groups"] = self._validate_group_ids( + groups, + ) + return permissions_dict + + def _set_permissions(self, permissions, object): + set_permissions_for_object(permissions, object) + + +class OwnedObjectSerializer(serializers.ModelSerializer, SetPermissionsMixin): + def __init__(self, *args, **kwargs): + self.user = kwargs.pop("user", None) + super().__init__(*args, **kwargs) + + def get_permissions(self, obj): + view_codename = f"view_{obj.__class__.__name__.lower()}" + change_codename = f"change_{obj.__class__.__name__.lower()}" + return { + "view": { + "users": get_users_with_perms( + obj, + only_with_perms_in=[view_codename], + with_group_users=False, + ).values_list("id", flat=True), + "groups": get_groups_with_only_permission( + obj, + codename=view_codename, + ).values_list("id", flat=True), + }, + "change": { + "users": get_users_with_perms( + obj, + only_with_perms_in=[change_codename], + with_group_users=False, + ).values_list("id", flat=True), + "groups": get_groups_with_only_permission( + obj, + codename=change_codename, + ).values_list("id", flat=True), + }, + } + 
+ permissions = SerializerMethodField(read_only=True) + + set_permissions = serializers.DictField( + label="Set permissions", + allow_empty=True, + required=False, + write_only=True, + ) + # other methods in mixin + + def create(self, validated_data): + if self.user and ( + "owner" not in validated_data or validated_data["owner"] is None + ): + validated_data["owner"] = self.user + permissions = None + if "set_permissions" in validated_data: + permissions = validated_data.pop("set_permissions") + instance = super().create(validated_data) + if permissions is not None: + self._set_permissions(permissions, instance) + return instance + + def update(self, instance, validated_data): + if "set_permissions" in validated_data: + self._set_permissions(validated_data["set_permissions"], instance) + if "owner" in validated_data and "name" in self.Meta.fields: + name = validated_data["name"] if "name" in validated_data else instance.name + not_unique = ( + self.Meta.model.objects.exclude(pk=instance.pk) + .filter(owner=validated_data["owner"], name=name) + .exists() + ) + if not_unique: + raise serializers.ValidationError( + {"error": "Object violates owner / name unique constraint"}, + ) + return super().update(instance, validated_data) + + +class CorrespondentSerializer(MatchingModelSerializer, OwnedObjectSerializer): + + last_correspondence = serializers.DateTimeField(read_only=True) + + class Meta: + model = Correspondent + fields = ( + "id", + "slug", + "name", + "match", + "matching_algorithm", + "is_insensitive", + "document_count", + "last_correspondence", + "owner", + "permissions", + "set_permissions", + ) + + +class DocumentTypeSerializer(MatchingModelSerializer, OwnedObjectSerializer): + class Meta: + model = DocumentType + fields = ( + "id", + "slug", + "name", + "match", + "matching_algorithm", + "is_insensitive", + "document_count", + "owner", + "permissions", + "set_permissions", + "default_metadata", + ) + + +class ColorField(serializers.Field): + + COLOURS = ( + (1, "#a6cee3"), + (2, "#1f78b4"), + (3, "#b2df8a"), + (4, "#33a02c"), + (5, "#fb9a99"), + (6, "#e31a1c"), + (7, "#fdbf6f"), + (8, "#ff7f00"), + (9, "#cab2d6"), + (10, "#6a3d9a"), + (11, "#b15928"), + (12, "#000000"), + (13, "#cccccc"), + ) + + def to_internal_value(self, data): + for id, color in self.COLOURS: + if id == data: + return color + raise serializers.ValidationError + + def to_representation(self, value): + for id, color in self.COLOURS: + if color == value: + return id + return 1 + + +class TagSerializerVersion1(MatchingModelSerializer, OwnedObjectSerializer): + + colour = ColorField(source="color", default="#a6cee3") + + class Meta: + model = Tag + fields = ( + "id", + "slug", + "name", + "colour", + "match", + "matching_algorithm", + "is_insensitive", + "is_inbox_tag", + "document_count", + "owner", + "permissions", + "set_permissions", + ) + + +class TagSerializer(MatchingModelSerializer, OwnedObjectSerializer): + def get_text_color(self, obj): + try: + h = obj.color.lstrip("#") + rgb = tuple(int(h[i : i + 2], 16) / 256 for i in (0, 2, 4)) + luminance = math.sqrt( + 0.299 * math.pow(rgb[0], 2) + + 0.587 * math.pow(rgb[1], 2) + + 0.114 * math.pow(rgb[2], 2), + ) + return "#ffffff" if luminance < 0.53 else "#000000" + except ValueError: + return "#000000" + + text_color = serializers.SerializerMethodField() + + class Meta: + model = Tag + fields = ( + "id", + "slug", + "name", + "color", + "text_color", + "match", + "matching_algorithm", + "is_insensitive", + "is_inbox_tag", + "document_count", + "owner", + 
"permissions", + "set_permissions", + ) + + def validate_color(self, color): + regex = r"#[0-9a-fA-F]{6}" + if not re.match(regex, color): + raise serializers.ValidationError(_("Invalid color.")) + return color + + +class CorrespondentField(serializers.PrimaryKeyRelatedField): + def get_queryset(self): + return Correspondent.objects.all() + + +class TagsField(serializers.PrimaryKeyRelatedField): + def get_queryset(self): + return Tag.objects.all() + + +class DocumentTypeField(serializers.PrimaryKeyRelatedField): + def get_queryset(self): + return DocumentType.objects.all() + + +class StoragePathField(serializers.PrimaryKeyRelatedField): + def get_queryset(self): + return StoragePath.objects.all() + + +class DocumentSerializer(OwnedObjectSerializer, DynamicFieldsModelSerializer): + + correspondent = CorrespondentField(allow_null=True) + tags = TagsField(many=True) + document_type = DocumentTypeField(allow_null=True) + storage_path = StoragePathField(allow_null=True) + + original_file_name = SerializerMethodField() + archived_file_name = SerializerMethodField() + created_date = serializers.DateField(required=False) + + owner = serializers.PrimaryKeyRelatedField( + queryset=User.objects.all(), + required=False, + allow_null=True, + ) + + def get_original_file_name(self, obj): + return obj.get_public_filename() + + def get_archived_file_name(self, obj): + if obj.has_archive_version: + return obj.get_public_filename(archive=True) + else: + return None + + def to_representation(self, instance): + doc = super().to_representation(instance) + if self.truncate_content: + doc["content"] = doc.get("content")[0:550] + return doc + + def update(self, instance, validated_data): + if "created_date" in validated_data and "created" not in validated_data: + new_datetime = datetime.datetime.combine( + validated_data.get("created_date"), + datetime.time(0, 0, 0, 0, zoneinfo.ZoneInfo(settings.TIME_ZONE)), + ) + instance.created = new_datetime + instance.save() + if "created_date" in validated_data: + validated_data.pop("created_date") + super().update(instance, validated_data) + return instance + + def __init__(self, *args, **kwargs): + self.truncate_content = kwargs.pop("truncate_content", False) + + super().__init__(*args, **kwargs) + + class Meta: + model = Document + depth = 1 + fields = ( + "id", + "correspondent", + "document_type", + "storage_path", + "title", + "content", + "tags", + "created", + "created_date", + "modified", + "added", + "archive_serial_number", + "original_file_name", + "archived_file_name", + "owner", + "permissions", + "set_permissions", + "notes", + "metadatas", + ) + + +class SavedViewFilterRuleSerializer(serializers.ModelSerializer): + class Meta: + model = SavedViewFilterRule + fields = ["rule_type", "value"] + + +class SavedViewSerializer(OwnedObjectSerializer): + + filter_rules = SavedViewFilterRuleSerializer(many=True) + + class Meta: + model = SavedView + depth = 1 + fields = [ + "id", + "name", + "show_on_dashboard", + "show_in_sidebar", + "sort_field", + "sort_reverse", + "filter_rules", + "owner", + "permissions", + "set_permissions", + ] + + def update(self, instance, validated_data): + if "filter_rules" in validated_data: + rules_data = validated_data.pop("filter_rules") + else: + rules_data = None + if "user" in validated_data: + # backwards compatibility + validated_data["owner"] = validated_data.pop("user") + super().update(instance, validated_data) + if rules_data is not None: + SavedViewFilterRule.objects.filter(saved_view=instance).delete() + for rule_data in 
rules_data: + SavedViewFilterRule.objects.create(saved_view=instance, **rule_data) + return instance + + def create(self, validated_data): + rules_data = validated_data.pop("filter_rules") + if "user" in validated_data: + # backwards compatibility + validated_data["owner"] = validated_data.pop("user") + saved_view = SavedView.objects.create(**validated_data) + for rule_data in rules_data: + SavedViewFilterRule.objects.create(saved_view=saved_view, **rule_data) + return saved_view + + +class DocumentListSerializer(serializers.Serializer): + + documents = serializers.ListField( + required=True, + label="Documents", + write_only=True, + child=serializers.IntegerField(), + ) + + def _validate_document_id_list(self, documents, name="documents"): + if not type(documents) == list: + raise serializers.ValidationError(f"{name} must be a list") + if not all(type(i) == int for i in documents): + raise serializers.ValidationError(f"{name} must be a list of integers") + count = Document.objects.filter(id__in=documents).count() + if not count == len(documents): + raise serializers.ValidationError( + f"Some documents in {name} don't exist or were specified twice.", + ) + + def validate_documents(self, documents): + self._validate_document_id_list(documents) + return documents + + +class BulkEditSerializer(DocumentListSerializer, SetPermissionsMixin): + + method = serializers.ChoiceField( + choices=[ + "set_correspondent", + "set_document_type", + "set_storage_path", + "add_tag", + "remove_tag", + "modify_tags", + "delete", + "redo_ocr", + "set_permissions", + ], + label="Method", + write_only=True, + ) + + parameters = serializers.DictField(allow_empty=True) + + def _validate_tag_id_list(self, tags, name="tags"): + if not type(tags) == list: + raise serializers.ValidationError(f"{name} must be a list") + if not all(type(i) == int for i in tags): + raise serializers.ValidationError(f"{name} must be a list of integers") + count = Tag.objects.filter(id__in=tags).count() + if not count == len(tags): + raise serializers.ValidationError( + f"Some tags in {name} don't exist or were specified twice.", + ) + + def validate_method(self, method): + if method == "set_correspondent": + return bulk_edit.set_correspondent + elif method == "set_document_type": + return bulk_edit.set_document_type + elif method == "set_storage_path": + return bulk_edit.set_storage_path + elif method == "add_tag": + return bulk_edit.add_tag + elif method == "remove_tag": + return bulk_edit.remove_tag + elif method == "modify_tags": + return bulk_edit.modify_tags + elif method == "delete": + return bulk_edit.delete + elif method == "redo_ocr": + return bulk_edit.redo_ocr + elif method == "set_permissions": + return bulk_edit.set_permissions + else: + raise serializers.ValidationError("Unsupported method.") + + def _validate_parameters_tags(self, parameters): + if "tag" in parameters: + tag_id = parameters["tag"] + try: + Tag.objects.get(id=tag_id) + except Tag.DoesNotExist: + raise serializers.ValidationError("Tag does not exist") + else: + raise serializers.ValidationError("tag not specified") + + def _validate_parameters_document_type(self, parameters): + if "document_type" in parameters: + document_type_id = parameters["document_type"] + if document_type_id is None: + # None is ok + return + try: + DocumentType.objects.get(id=document_type_id) + except DocumentType.DoesNotExist: + raise serializers.ValidationError("Document type does not exist") + else: + raise serializers.ValidationError("document_type not specified") + + def 
_validate_parameters_correspondent(self, parameters): + if "correspondent" in parameters: + correspondent_id = parameters["correspondent"] + if correspondent_id is None: + return + try: + Correspondent.objects.get(id=correspondent_id) + except Correspondent.DoesNotExist: + raise serializers.ValidationError("Correspondent does not exist") + else: + raise serializers.ValidationError("correspondent not specified") + + def _validate_storage_path(self, parameters): + if "storage_path" in parameters: + storage_path_id = parameters["storage_path"] + if storage_path_id is None: + return + try: + StoragePath.objects.get(id=storage_path_id) + except StoragePath.DoesNotExist: + raise serializers.ValidationError( + "Storage path does not exist", + ) + else: + raise serializers.ValidationError("storage path not specified") + + def _validate_parameters_modify_tags(self, parameters): + if "add_tags" in parameters: + self._validate_tag_id_list(parameters["add_tags"], "add_tags") + else: + raise serializers.ValidationError("add_tags not specified") + + if "remove_tags" in parameters: + self._validate_tag_id_list(parameters["remove_tags"], "remove_tags") + else: + raise serializers.ValidationError("remove_tags not specified") + + def _validate_owner(self, owner): + ownerUser = User.objects.get(pk=owner) + if ownerUser is None: + raise serializers.ValidationError("Specified owner cannot be found") + return ownerUser + + def _validate_parameters_set_permissions(self, parameters): + parameters["set_permissions"] = self.validate_set_permissions( + parameters["set_permissions"], + ) + if "owner" in parameters and parameters["owner"] is not None: + self._validate_owner(parameters["owner"]) + + def validate(self, attrs): + + method = attrs["method"] + parameters = attrs["parameters"] + + if method == bulk_edit.set_correspondent: + self._validate_parameters_correspondent(parameters) + elif method == bulk_edit.set_document_type: + self._validate_parameters_document_type(parameters) + elif method == bulk_edit.add_tag or method == bulk_edit.remove_tag: + self._validate_parameters_tags(parameters) + elif method == bulk_edit.modify_tags: + self._validate_parameters_modify_tags(parameters) + elif method == bulk_edit.set_storage_path: + self._validate_storage_path(parameters) + elif method == bulk_edit.set_permissions: + self._validate_parameters_set_permissions(parameters) + + return attrs + + +class PostDocumentSerializer(serializers.Serializer): + + created = serializers.DateTimeField( + label="Created", + allow_null=True, + write_only=True, + required=False, + ) + + document = serializers.FileField( + label="Document", + write_only=True, + ) + + title = serializers.CharField( + label="Title", + write_only=True, + required=False, + ) + + correspondent = serializers.PrimaryKeyRelatedField( + queryset=Correspondent.objects.all(), + label="Correspondent", + allow_null=True, + write_only=True, + required=False, + ) + + document_type = serializers.PrimaryKeyRelatedField( + queryset=DocumentType.objects.all(), + label="Document type", + allow_null=True, + write_only=True, + required=False, + ) + + tags = serializers.PrimaryKeyRelatedField( + many=True, + queryset=Tag.objects.all(), + label="Tags", + write_only=True, + required=False, + ) + + archive_serial_number = serializers.IntegerField( + label="ASN", + write_only=True, + required=False, + min_value=Document.ARCHIVE_SERIAL_NUMBER_MIN, + max_value=Document.ARCHIVE_SERIAL_NUMBER_MAX, + ) + + storage_path_id = serializers.IntegerField( + label="Storage path ID", + 
allow_null=True, + write_only=True, + required=False, + ) + + full_path = serializers.CharField( + label="Full Path", + allow_null=True, + write_only=True, + required=False, + ) + + def validate_document(self, document): + document_data = document.file.read() + mime_type = magic.from_buffer(document_data, mime=True) + + if not is_mime_type_supported(mime_type): + raise serializers.ValidationError( + _("File type %(type)s not supported") % {"type": mime_type}, + ) + + return document.name, document_data + + def validate_correspondent(self, correspondent): + if correspondent: + return correspondent.id + else: + return None + + def validate_document_type(self, document_type): + if document_type: + return document_type.id + else: + return None + + def validate_tags(self, tags): + if tags: + return [tag.id for tag in tags] + else: + return None + + +class BulkDownloadSerializer(DocumentListSerializer): + + content = serializers.ChoiceField( + choices=["archive", "originals", "both"], + default="archive", + ) + + compression = serializers.ChoiceField( + choices=["none", "deflated", "bzip2", "lzma"], + default="none", + ) + + follow_formatting = serializers.BooleanField( + default=False, + ) + + def validate_compression(self, compression): + import zipfile + + return { + "none": zipfile.ZIP_STORED, + "deflated": zipfile.ZIP_DEFLATED, + "bzip2": zipfile.ZIP_BZIP2, + "lzma": zipfile.ZIP_LZMA, + }[compression] + + +class StoragePathSerializer(MatchingModelSerializer, OwnedObjectSerializer): + class Meta: + model = StoragePath + fields = ( + "id", + "slug", + "name", + "path", + "match", + "matching_algorithm", + "is_insensitive", + "document_count", + "owner", + "permissions", + "set_permissions", + ) + + def validate_path(self, path): + try: + path.format( + title="title", + correspondent="correspondent", + document_type="document_type", + created="created", + created_year="created_year", + created_year_short="created_year_short", + created_month="created_month", + created_month_name="created_month_name", + created_month_name_short="created_month_name_short", + created_day="created_day", + added="added", + added_year="added_year", + added_year_short="added_year_short", + added_month="added_month", + added_month_name="added_month_name", + added_month_name_short="added_month_name_short", + added_day="added_day", + asn="asn", + tags="tags", + tag_list="tag_list", + owner_username="someone", + original_name="testfile", + ) + + except KeyError as err: + raise serializers.ValidationError(_("Invalid variable detected.")) from err + + return path + + def update(self, instance, validated_data): + """ + When a storage path is updated, see if documents + using it require a rename/move + """ + doc_ids = [doc.id for doc in instance.documents.all()] + if len(doc_ids): + bulk_edit.bulk_update_documents.delay(doc_ids) + + return super().update(instance, validated_data) + + +class UiSettingsViewSerializer(serializers.ModelSerializer): + class Meta: + model = UiSettings + depth = 1 + fields = [ + "id", + "settings", + ] + + def validate_settings(self, settings): + # we never save update checking backend setting + if "update_checking" in settings: + try: + settings["update_checking"].pop("backend_setting") + except KeyError: + pass + return settings + + def create(self, validated_data): + ui_settings = UiSettings.objects.update_or_create( + user=validated_data.get("user"), + defaults={"settings": validated_data.get("settings", None)}, + ) + return ui_settings + + +class 
TasksViewSerializer(serializers.ModelSerializer): + class Meta: + model = PaperlessTask + depth = 1 + fields = ( + "id", + "task_id", + "task_file_name", + "date_created", + "date_done", + "type", + "status", + "result", + "acknowledged", + "related_document", + ) + + type = serializers.SerializerMethodField() + + def get_type(self, obj): + # just file tasks, for now + return "file" + + related_document = serializers.SerializerMethodField() + related_doc_re = re.compile(r"New document id (\d+) created") + + def get_related_document(self, obj): + result = None + if obj.status is not None and obj.status == states.SUCCESS: + try: + result = self.related_doc_re.search(obj.result).group(1) + except Exception: + pass + + return result + + +class AcknowledgeTasksViewSerializer(serializers.Serializer): + + tasks = serializers.ListField( + required=True, + label="Tasks", + write_only=True, + child=serializers.IntegerField(), + ) + + def _validate_task_id_list(self, tasks, name="tasks"): + pass + if not type(tasks) == list: + raise serializers.ValidationError(f"{name} must be a list") + if not all(type(i) == int for i in tasks): + raise serializers.ValidationError(f"{name} must be a list of integers") + count = PaperlessTask.objects.filter(id__in=tasks).count() + if not count == len(tasks): + raise serializers.ValidationError( + f"Some tasks in {name} don't exist or were specified twice.", + ) + + def validate_tasks(self, tasks): + self._validate_task_id_list(tasks) + return tasks diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 5f7c212a9..33e93b703 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -1,305 +1,305 @@ -import hashlib -import logging -import shutil -import uuid -from typing import Optional -from typing import Type - -import tqdm -from asgiref.sync import async_to_sync -from celery import shared_task -from channels.layers import get_channel_layer -from django.conf import settings -from django.db import transaction -from django.db.models.signals import post_save -from documents import barcodes -from documents import index -from documents import sanity_checker -from documents.classifier import DocumentClassifier -from documents.classifier import load_classifier -from documents.consumer import Consumer -from documents.consumer import ConsumerError -from documents.data_models import ConsumableDocument -from documents.data_models import DocumentMetadataOverrides -from documents.data_models import DocumentSource -from documents.file_handling import create_source_path_directory -from documents.file_handling import generate_unique_filename -from documents.models import Correspondent -from documents.models import Document -from documents.models import DocumentType -from documents.models import StoragePath -from documents.models import Tag -from documents.parsers import DocumentParser -from documents.parsers import get_parser_class_for_mime_type -from documents.sanity_checker import SanityCheckFailedException -from filelock import FileLock -from redis.exceptions import ConnectionError -from whoosh.writing import AsyncWriter - - -logger = logging.getLogger("paperless.tasks") - - -@shared_task -def index_optimize(): - ix = index.open_index() - writer = AsyncWriter(ix) - writer.commit(optimize=True) - - -def index_reindex(progress_bar_disable=False): - documents = Document.objects.all() - - ix = index.open_index(recreate=True) - - with AsyncWriter(ix) as writer: - for document in tqdm.tqdm(documents, disable=progress_bar_disable): - index.update_document(writer, 
document) - - -@shared_task -def train_classifier(): - if ( - not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() - and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() - and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() - and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() - ): - - return - - classifier = load_classifier() - - if not classifier: - classifier = DocumentClassifier() - - try: - if classifier.train(): - logger.info( - f"Saving updated classifier model to {settings.MODEL_FILE}...", - ) - classifier.save() - else: - logger.debug("Training data unchanged.") - - except Exception as e: - logger.warning("Classifier error: " + str(e)) - - -@shared_task -def consume_file( - input_doc: ConsumableDocument, - overrides: Optional[DocumentMetadataOverrides] = None, -): - - # Default no overrides - if overrides is None: - overrides = DocumentMetadataOverrides() - - # read all barcodes in the current document - if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: - doc_barcode_info = barcodes.scan_file_for_barcodes( - input_doc.original_file, - input_doc.mime_type, - ) - - # split document by separator pages, if enabled - if settings.CONSUMER_ENABLE_BARCODES: - separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes) - - if len(separators) > 0: - logger.debug( - f"Pages with separators found in: {input_doc.original_file}", - ) - document_list = barcodes.separate_pages( - doc_barcode_info.pdf_path, - separators, - ) - - if document_list: - - # If the file is an upload, it's in the scratch directory - # Move it to consume directory to be picked up - # Otherwise, use the current parent to keep possible tags - # from subdirectories - if input_doc.source != DocumentSource.ConsumeFolder: - save_to_dir = settings.CONSUMPTION_DIR - else: - # Note this uses the original file, because it's in the - # consume folder already and may include additional path - # components for tagging - # the .path is somewhere in scratch in this case - save_to_dir = input_doc.original_file.parent - - for n, document in enumerate(document_list): - # save to consumption dir - # rename it to the original filename with number prefix - if overrides.filename is not None: - newname = f"{str(n)}_{overrides.filename}" - else: - newname = None - - barcodes.save_to_dir( - document, - newname=newname, - target_dir=save_to_dir, - ) - - # Split file has been copied safely, remove it - document.unlink() - - # And clean up the directory as well, now it's empty - shutil.rmtree(document_list[0].parent) - - # This file has been split into multiple files without issue - # remove the original and working copy - input_doc.original_file.unlink() - - # If the original file was a TIFF, remove the PDF generated from it - if input_doc.mime_type == "image/tiff": - logger.debug( - f"Deleting file {doc_barcode_info.pdf_path}", - ) - doc_barcode_info.pdf_path.unlink() - - # notify the sender, otherwise the progress bar - # in the UI stays stuck - payload = { - "filename": overrides.filename or input_doc.original_file.name, - "task_id": None, - "current_progress": 100, - "max_progress": 100, - "status": "SUCCESS", - "message": "finished", - } - try: - async_to_sync(get_channel_layer().group_send)( - "status_updates", - {"type": "status_update", "data": payload}, - ) - except ConnectionError as e: - logger.warning(f"ConnectionError on status send: {str(e)}") - # consuming stops here, since the original document 
with - # the barcodes has been split and will be consumed separately - return "File successfully split" - - # try reading the ASN from barcode - if settings.CONSUMER_ENABLE_ASN_BARCODE: - overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) - if overrides.asn: - logger.info(f"Found ASN in barcode: {overrides.asn}") - - # continue with consumption if no barcode was found - document = Consumer().try_consume_file( - input_doc.original_file, - override_filename=overrides.filename, - override_title=overrides.title, - override_correspondent_id=overrides.correspondent_id, - override_document_type_id=overrides.document_type_id, - override_tag_ids=overrides.tag_ids, - override_created=overrides.created, - override_asn=overrides.asn, - override_owner_id=overrides.owner_id, - override_storage_path_id=overrides.storage_path_id, - full_path=overrides.full_path - ) - - if document: - return f"Success. New document id {document.pk} created" - else: - raise ConsumerError( - "Unknown error: Returned document was null, but " - "no error message was given.", - ) - - -@shared_task -def sanity_check(): - messages = sanity_checker.check_sanity() - - messages.log_messages() - - if messages.has_error: - raise SanityCheckFailedException("Sanity check failed with errors. See log.") - elif messages.has_warning: - return "Sanity check exited with warnings. See log." - elif len(messages) > 0: - return "Sanity check exited with infos. See log." - else: - return "No issues detected." - - -@shared_task -def bulk_update_documents(document_ids): - documents = Document.objects.filter(id__in=document_ids) - - ix = index.open_index() - - for doc in documents: - post_save.send(Document, instance=doc, created=False) - - with AsyncWriter(ix) as writer: - for doc in documents: - index.update_document(writer, doc) - - -@shared_task -def update_document_archive_file(document_id): - """ - Re-creates the archive file of a document, including new OCR content and thumbnail - """ - document = Document.objects.get(id=document_id) - - mime_type = document.mime_type - - parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type) - - if not parser_class: - logger.error( - f"No parser found for mime type {mime_type}, cannot " - f"archive document {document} (ID: {document_id})", - ) - return - - parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) - - try: - parser.parse(document.source_path, mime_type, document.get_public_filename()) - - thumbnail = parser.get_thumbnail( - document.source_path, - mime_type, - document.get_public_filename(), - ) - - if parser.get_archive_path(): - with transaction.atomic(): - with open(parser.get_archive_path(), "rb") as f: - checksum = hashlib.md5(f.read()).hexdigest() - # I'm going to save first so that in case the file move - # fails, the database is rolled back. 
- # We also don't use save() since that triggers the filehandling - # logic, and we don't want that yet (file not yet in place) - document.archive_filename = generate_unique_filename( - document, - archive_filename=True, - ) - Document.objects.filter(pk=document.pk).update( - archive_checksum=checksum, - content=parser.get_text(), - archive_filename=document.archive_filename, - ) - with FileLock(settings.MEDIA_LOCK): - create_source_path_directory(document.archive_path) - shutil.move(parser.get_archive_path(), document.archive_path) - shutil.move(thumbnail, document.thumbnail_path) - - with index.open_index_writer() as writer: - index.update_document(writer, document) - - except Exception: - logger.exception( - f"Error while parsing document {document} (ID: {document_id})", - ) - finally: - parser.cleanup() +import hashlib +import logging +import shutil +import uuid +from typing import Optional +from typing import Type + +import tqdm +from asgiref.sync import async_to_sync +from celery import shared_task +from channels.layers import get_channel_layer +from django.conf import settings +from django.db import transaction +from django.db.models.signals import post_save +from documents import barcodes +from documents import index +from documents import sanity_checker +from documents.classifier import DocumentClassifier +from documents.classifier import load_classifier +from documents.consumer import Consumer +from documents.consumer import ConsumerError +from documents.data_models import ConsumableDocument +from documents.data_models import DocumentMetadataOverrides +from documents.data_models import DocumentSource +from documents.file_handling import create_source_path_directory +from documents.file_handling import generate_unique_filename +from documents.models import Correspondent +from documents.models import Document +from documents.models import DocumentType +from documents.models import StoragePath +from documents.models import Tag +from documents.parsers import DocumentParser +from documents.parsers import get_parser_class_for_mime_type +from documents.sanity_checker import SanityCheckFailedException +from filelock import FileLock +from redis.exceptions import ConnectionError +from whoosh.writing import AsyncWriter + + +logger = logging.getLogger("paperless.tasks") + + +@shared_task +def index_optimize(): + ix = index.open_index() + writer = AsyncWriter(ix) + writer.commit(optimize=True) + + +def index_reindex(progress_bar_disable=False): + documents = Document.objects.all() + + ix = index.open_index(recreate=True) + + with AsyncWriter(ix) as writer: + for document in tqdm.tqdm(documents, disable=progress_bar_disable): + index.update_document(writer, document) + + +@shared_task +def train_classifier(): + if ( + not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() + and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() + and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() + and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists() + ): + + return + + classifier = load_classifier() + + if not classifier: + classifier = DocumentClassifier() + + try: + if classifier.train(): + logger.info( + f"Saving updated classifier model to {settings.MODEL_FILE}...", + ) + classifier.save() + else: + logger.debug("Training data unchanged.") + + except Exception as e: + logger.warning("Classifier error: " + str(e)) + + +@shared_task +def consume_file( + input_doc: ConsumableDocument, + overrides: 
Optional[DocumentMetadataOverrides] = None, +): + + # Default no overrides + if overrides is None: + overrides = DocumentMetadataOverrides() + + # read all barcodes in the current document + if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: + doc_barcode_info = barcodes.scan_file_for_barcodes( + input_doc.original_file, + input_doc.mime_type, + ) + + # split document by separator pages, if enabled + if settings.CONSUMER_ENABLE_BARCODES: + separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes) + + if len(separators) > 0: + logger.debug( + f"Pages with separators found in: {input_doc.original_file}", + ) + document_list = barcodes.separate_pages( + doc_barcode_info.pdf_path, + separators, + ) + + if document_list: + + # If the file is an upload, it's in the scratch directory + # Move it to consume directory to be picked up + # Otherwise, use the current parent to keep possible tags + # from subdirectories + if input_doc.source != DocumentSource.ConsumeFolder: + save_to_dir = settings.CONSUMPTION_DIR + else: + # Note this uses the original file, because it's in the + # consume folder already and may include additional path + # components for tagging + # the .path is somewhere in scratch in this case + save_to_dir = input_doc.original_file.parent + + for n, document in enumerate(document_list): + # save to consumption dir + # rename it to the original filename with number prefix + if overrides.filename is not None: + newname = f"{str(n)}_{overrides.filename}" + else: + newname = None + + barcodes.save_to_dir( + document, + newname=newname, + target_dir=save_to_dir, + ) + + # Split file has been copied safely, remove it + document.unlink() + + # And clean up the directory as well, now it's empty + shutil.rmtree(document_list[0].parent) + + # This file has been split into multiple files without issue + # remove the original and working copy + input_doc.original_file.unlink() + + # If the original file was a TIFF, remove the PDF generated from it + if input_doc.mime_type == "image/tiff": + logger.debug( + f"Deleting file {doc_barcode_info.pdf_path}", + ) + doc_barcode_info.pdf_path.unlink() + + # notify the sender, otherwise the progress bar + # in the UI stays stuck + payload = { + "filename": overrides.filename or input_doc.original_file.name, + "task_id": None, + "current_progress": 100, + "max_progress": 100, + "status": "SUCCESS", + "message": "finished", + } + try: + async_to_sync(get_channel_layer().group_send)( + "status_updates", + {"type": "status_update", "data": payload}, + ) + except ConnectionError as e: + logger.warning(f"ConnectionError on status send: {str(e)}") + # consuming stops here, since the original document with + # the barcodes has been split and will be consumed separately + return "File successfully split" + + # try reading the ASN from barcode + if settings.CONSUMER_ENABLE_ASN_BARCODE: + overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes) + if overrides.asn: + logger.info(f"Found ASN in barcode: {overrides.asn}") + + # continue with consumption if no barcode was found + document = Consumer().try_consume_file( + input_doc.original_file, + override_filename=overrides.filename, + override_title=overrides.title, + override_correspondent_id=overrides.correspondent_id, + override_document_type_id=overrides.document_type_id, + override_tag_ids=overrides.tag_ids, + override_created=overrides.created, + override_asn=overrides.asn, + override_owner_id=overrides.owner_id, + 
override_storage_path_id=overrides.storage_path_id, + full_path=overrides.full_path + ) + + if document: + return f"Success. New document id {document.pk} created" + else: + raise ConsumerError( + "Unknown error: Returned document was null, but " + "no error message was given.", + ) + + +@shared_task +def sanity_check(): + messages = sanity_checker.check_sanity() + + messages.log_messages() + + if messages.has_error: + raise SanityCheckFailedException("Sanity check failed with errors. See log.") + elif messages.has_warning: + return "Sanity check exited with warnings. See log." + elif len(messages) > 0: + return "Sanity check exited with infos. See log." + else: + return "No issues detected." + + +@shared_task +def bulk_update_documents(document_ids): + documents = Document.objects.filter(id__in=document_ids) + + ix = index.open_index() + + for doc in documents: + post_save.send(Document, instance=doc, created=False) + + with AsyncWriter(ix) as writer: + for doc in documents: + index.update_document(writer, doc) + + +@shared_task +def update_document_archive_file(document_id): + """ + Re-creates the archive file of a document, including new OCR content and thumbnail + """ + document = Document.objects.get(id=document_id) + + mime_type = document.mime_type + + parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type) + + if not parser_class: + logger.error( + f"No parser found for mime type {mime_type}, cannot " + f"archive document {document} (ID: {document_id})", + ) + return + + parser: DocumentParser = parser_class(logging_group=uuid.uuid4()) + + try: + parser.parse(document.source_path, mime_type, document.get_public_filename()) + + thumbnail = parser.get_thumbnail( + document.source_path, + mime_type, + document.get_public_filename(), + ) + + if parser.get_archive_path(): + with transaction.atomic(): + with open(parser.get_archive_path(), "rb") as f: + checksum = hashlib.md5(f.read()).hexdigest() + # I'm going to save first so that in case the file move + # fails, the database is rolled back. 
+ # We also don't use save() since that triggers the filehandling + # logic, and we don't want that yet (file not yet in place) + document.archive_filename = generate_unique_filename( + document, + archive_filename=True, + ) + Document.objects.filter(pk=document.pk).update( + archive_checksum=checksum, + content=parser.get_text(), + archive_filename=document.archive_filename, + ) + with FileLock(settings.MEDIA_LOCK): + create_source_path_directory(document.archive_path) + shutil.move(parser.get_archive_path(), document.archive_path) + shutil.move(thumbnail, document.thumbnail_path) + + with index.open_index_writer() as writer: + index.update_document(writer, document) + + except Exception: + logger.exception( + f"Error while parsing document {document} (ID: {document_id})", + ) + finally: + parser.cleanup() diff --git a/src/documents/templates/registration/logged_out.html b/src/documents/templates/registration/logged_out.html index 5fe2144fd..9d1f5bf35 100644 --- a/src/documents/templates/registration/logged_out.html +++ b/src/documents/templates/registration/logged_out.html @@ -1,48 +1,48 @@ -<!DOCTYPE html> - -{% load static %} {% load i18n %} - -<html lang="en"> - <head> - <meta charset="utf-8" /> - <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no" /> - <meta name="description" content="" /> - <meta name="author" content="Mark Otto, Jacob Thornton, and Bootstrap contributors" /> - <meta name="generator" content="Jekyll v4.1.1" /> - <meta name="robots" content="noindex,nofollow" /> - <title>{% translate "Paperless-ngx signed out" %} - - - - - - - - - - -
- - - - -

{% translate "You have been successfully logged out. Bye!" %}

- {% translate "Sign in again" %} -
- - + + +{% load static %} {% load i18n %} + + + + + + + + + + {% translate "Paperless-ngx signed out" %} + + + + + + + + + + +
+ + + + +

{% translate "You have been successfully logged out. Bye!" %}

+ {% translate "Sign in again" %} +
+ + diff --git a/src/documents/templates/registration/login.html b/src/documents/templates/registration/login.html index f13d3a4b4..5a414c16f 100644 --- a/src/documents/templates/registration/login.html +++ b/src/documents/templates/registration/login.html @@ -1,57 +1,57 @@ - - -{% load static %} {% load i18n %} - - - - - - - - - - - {% translate "LBC Finance sign in" %} - - - - - - - - - - -
- {% csrf_token %} - - - - -

{% translate "Sign in to LBC Finance" %}

- {% if form.errors %} - - {% endif %} {% translate "Username" as i18n_username %} {% translate "Password" as i18n_password %} - - - - - -
- - + + +{% load static %} {% load i18n %} + + + + + + + + + + + {% translate "LBC Finance sign in" %} + + + + + + + + + + +
+ {% csrf_token %} + + + + +

{% translate "Sign in to LBC Finance" %}

+ {% if form.errors %} + + {% endif %} {% translate "Username" as i18n_username %} {% translate "Password" as i18n_password %} + + + + + +
+ + diff --git a/src/documents/views.py b/src/documents/views.py index 7b70e96b1..2cc09dbf5 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1,1072 +1,1072 @@ -import itertools -import json -import logging -import os -import re -import tempfile -import urllib -import zipfile -from datetime import datetime -from pathlib import Path -from time import mktime -from unicodedata import normalize -from urllib.parse import quote - -import pathvalidate -from django.conf import settings -from django.contrib.auth.models import User -from django.db.models import Case -from django.db.models import Count -from django.db.models import IntegerField -from django.db.models import Max -from django.db.models import Sum -from django.db.models import When -from django.db.models.functions import Length -from django.db.models.functions import Lower -from django.http import Http404 -from django.http import HttpResponse -from django.http import HttpResponseBadRequest -from django.shortcuts import get_object_or_404 -from django.utils.decorators import method_decorator -from django.utils.translation import get_language -from django.views.decorators.cache import cache_control -from django.views.generic import TemplateView -from django_filters.rest_framework import DjangoFilterBackend -from documents.filters import ObjectOwnedOrGrantedPermissionsFilter -from documents.permissions import PaperlessAdminPermissions -from documents.permissions import PaperlessObjectPermissions -from documents.tasks import consume_file -from langdetect import detect -from packaging import version as packaging_version -from paperless import version -from paperless.db import GnuPG -from paperless.views import StandardPagination -from rest_framework import parsers -from rest_framework.decorators import action -from rest_framework.exceptions import NotFound -from rest_framework.filters import OrderingFilter -from rest_framework.filters import SearchFilter -from rest_framework.generics import GenericAPIView -from rest_framework.mixins import CreateModelMixin -from rest_framework.mixins import DestroyModelMixin -from rest_framework.mixins import ListModelMixin -from rest_framework.mixins import RetrieveModelMixin -from rest_framework.mixins import UpdateModelMixin -from rest_framework.permissions import IsAuthenticated -from rest_framework.response import Response -from rest_framework.views import APIView -from rest_framework.viewsets import GenericViewSet -from rest_framework.viewsets import ModelViewSet -from rest_framework.viewsets import ReadOnlyModelViewSet -from rest_framework.viewsets import ViewSet - -from .bulk_download import ArchiveOnlyStrategy -from .bulk_download import OriginalAndArchiveStrategy -from .bulk_download import OriginalsOnlyStrategy -from .classifier import load_classifier -from .data_models import ConsumableDocument -from .data_models import DocumentMetadataOverrides -from .data_models import DocumentSource -from .filters import CorrespondentFilterSet -from .filters import DocumentFilterSet -from .filters import DocumentTypeFilterSet -from .filters import StoragePathFilterSet -from .filters import TagFilterSet -from .matching import match_correspondents -from .matching import match_document_types -from .matching import match_storage_paths -from .matching import match_tags -from .models import Correspondent, Metadata -from .models import Document -from .models import DocumentType -from .models import Note -from .models import PaperlessTask -from .models import SavedView -from .models import 
StoragePath -from .models import Tag -from .parsers import get_parser_class_for_mime_type -from .parsers import parse_date_generator -from .serialisers import AcknowledgeTasksViewSerializer -from .serialisers import BulkDownloadSerializer -from .serialisers import BulkEditSerializer -from .serialisers import CorrespondentSerializer -from .serialisers import DocumentListSerializer -from .serialisers import DocumentSerializer -from .serialisers import DocumentTypeSerializer -from .serialisers import PostDocumentSerializer -from .serialisers import SavedViewSerializer -from .serialisers import StoragePathSerializer -from .serialisers import TagSerializer -from .serialisers import TagSerializerVersion1 -from .serialisers import TasksViewSerializer -from .serialisers import UiSettingsViewSerializer - -logger = logging.getLogger("paperless.api") - - -class IndexView(TemplateView): - template_name = "index.html" - - def get_frontend_language(self): - if hasattr( - self.request.user, - "ui_settings", - ) and self.request.user.ui_settings.settings.get("language"): - lang = self.request.user.ui_settings.settings.get("language") - else: - lang = get_language() - # This is here for the following reason: - # Django identifies languages in the form "en-us" - # However, angular generates locales as "en-US". - # this translates between these two forms. - if "-" in lang: - first = lang[: lang.index("-")] - second = lang[lang.index("-") + 1 :] - return f"{first}-{second.upper()}" - else: - return lang - - def get_context_data(self, **kwargs): - context = super().get_context_data(**kwargs) - context["cookie_prefix"] = settings.COOKIE_PREFIX - context["username"] = self.request.user.username - context["full_name"] = self.request.user.get_full_name() - context["styles_css"] = f"frontend/{self.get_frontend_language()}/styles.css" - context["runtime_js"] = f"frontend/{self.get_frontend_language()}/runtime.js" - context[ - "polyfills_js" - ] = f"frontend/{self.get_frontend_language()}/polyfills.js" - context["main_js"] = f"frontend/{self.get_frontend_language()}/main.js" - context[ - "webmanifest" - ] = f"frontend/{self.get_frontend_language()}/manifest.webmanifest" # noqa: E501 - context[ - "apple_touch_icon" - ] = f"frontend/{self.get_frontend_language()}/apple-touch-icon.png" # noqa: E501 - return context - - -class PassUserMixin(CreateModelMixin): - """ - Pass a user object to serializer - """ - - def get_serializer(self, *args, **kwargs): - kwargs.setdefault("user", self.request.user) - return super().get_serializer(*args, **kwargs) - - -class CorrespondentViewSet(ModelViewSet, PassUserMixin): - model = Correspondent - - queryset = Correspondent.objects.annotate( - document_count=Count("documents"), - last_correspondence=Max("documents__created"), - ).order_by(Lower("name")) - - serializer_class = CorrespondentSerializer - pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - filter_backends = ( - DjangoFilterBackend, - OrderingFilter, - ObjectOwnedOrGrantedPermissionsFilter, - ) - filterset_class = CorrespondentFilterSet - ordering_fields = ( - "name", - "matching_algorithm", - "match", - "document_count", - "last_correspondence", - ) - - -class TagViewSet(ModelViewSet, PassUserMixin): - model = Tag - - queryset = Tag.objects.annotate(document_count=Count("documents")).order_by( - Lower("name"), - ) - - def get_serializer_class(self, *args, **kwargs): - print(self.request.version) - if int(self.request.version) == 1: - return TagSerializerVersion1 - 
else: - return TagSerializer - - pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - filter_backends = ( - DjangoFilterBackend, - OrderingFilter, - ObjectOwnedOrGrantedPermissionsFilter, - ) - filterset_class = TagFilterSet - ordering_fields = ("color", "name", "matching_algorithm", "match", "document_count") - - -class DocumentTypeViewSet(ModelViewSet, PassUserMixin): - model = DocumentType - - queryset = DocumentType.objects.annotate( - document_count=Count("documents"), - ).order_by(Lower("name")) - - serializer_class = DocumentTypeSerializer - pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - filter_backends = ( - DjangoFilterBackend, - OrderingFilter, - ObjectOwnedOrGrantedPermissionsFilter, - ) - filterset_class = DocumentTypeFilterSet - ordering_fields = ("name", "matching_algorithm", "match", "document_count") - - -class DocumentViewSet( - PassUserMixin, - RetrieveModelMixin, - UpdateModelMixin, - DestroyModelMixin, - ListModelMixin, - GenericViewSet, -): - model = Document - queryset = Document.objects.annotate(num_notes=Count("notes")) - serializer_class = DocumentSerializer - pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - filter_backends = ( - DjangoFilterBackend, - SearchFilter, - OrderingFilter, - ObjectOwnedOrGrantedPermissionsFilter, - ) - filterset_class = DocumentFilterSet - search_fields = ("title", "correspondent__name", "content") - ordering_fields = ( - "id", - "title", - "correspondent__name", - "document_type__name", - "created", - "modified", - "added", - "archive_serial_number", - "num_notes", - ) - - def get_queryset(self): - return Document.objects.distinct().annotate(num_notes=Count("notes")) - - def get_serializer(self, *args, **kwargs): - super().get_serializer(*args, **kwargs) - fields_param = self.request.query_params.get("fields", None) - fields = fields_param.split(",") if fields_param else None - truncate_content = self.request.query_params.get("truncate_content", "False") - serializer_class = self.get_serializer_class() - kwargs.setdefault("context", self.get_serializer_context()) - kwargs.setdefault("fields", fields) - kwargs.setdefault("truncate_content", truncate_content.lower() in ["true", "1"]) - return serializer_class(*args, **kwargs) - - def update(self, request, *args, **kwargs): - response = super().update(request, *args, **kwargs) - from documents import index - - index.add_or_update_document(self.get_object()) - return response - - def destroy(self, request, *args, **kwargs): - from documents import index - - index.remove_document_from_index(self.get_object()) - return super().destroy(request, *args, **kwargs) - - @staticmethod - def original_requested(request): - return ( - "original" in request.query_params - and request.query_params["original"] == "true" - ) - - def file_response(self, pk, request, disposition): - doc = Document.objects.get(id=pk) - if not self.original_requested(request) and doc.has_archive_version: - file_handle = doc.archive_file - filename = doc.get_public_filename(archive=True) - mime_type = "application/pdf" - else: - file_handle = doc.source_file - filename = doc.get_public_filename() - mime_type = doc.mime_type - # Support browser previewing csv files by using text mime type - if mime_type in {"application/csv", "text/csv"} and disposition == "inline": - mime_type = "text/plain" - - if doc.storage_type == Document.STORAGE_TYPE_GPG: - 
file_handle = GnuPG.decrypted(file_handle) - - response = HttpResponse(file_handle, content_type=mime_type) - # Firefox is not able to handle unicode characters in filename field - # RFC 5987 addresses this issue - # see https://datatracker.ietf.org/doc/html/rfc5987#section-4.2 - # Chromium cannot handle commas in the filename - filename_normalized = normalize("NFKD", filename.replace(",", "_")).encode( - "ascii", - "ignore", - ) - filename_encoded = quote(filename) - content_disposition = ( - f"{disposition}; " - f'filename="{filename_normalized}"; ' - f"filename*=utf-8''{filename_encoded}" - ) - response["Content-Disposition"] = content_disposition - return response - - def get_metadata(self, file, mime_type): - if not os.path.isfile(file): - return None - - parser_class = get_parser_class_for_mime_type(mime_type) - if parser_class: - parser = parser_class(progress_callback=None, logging_group=None) - - try: - return parser.extract_metadata(file, mime_type) - except Exception: - # TODO: cover GPG errors, remove later. - return [] - else: - return [] - - def get_filesize(self, filename): - if os.path.isfile(filename): - return os.stat(filename).st_size - else: - return None - - @action(methods=["get"], detail=True) - def metadata(self, request, pk=None): - try: - doc = Document.objects.get(pk=pk) - except Document.DoesNotExist: - raise Http404 - - meta = { - "original_checksum": doc.checksum, - "original_size": self.get_filesize(doc.source_path), - "original_mime_type": doc.mime_type, - "media_filename": doc.filename, - "has_archive_version": doc.has_archive_version, - "original_metadata": self.get_metadata(doc.source_path, doc.mime_type), - "archive_checksum": doc.archive_checksum, - "archive_media_filename": doc.archive_filename, - "original_filename": doc.original_filename, - } - - lang = "en" - try: - lang = detect(doc.content) - except Exception: - pass - meta["lang"] = lang - - if doc.has_archive_version: - meta["archive_size"] = self.get_filesize(doc.archive_path) - meta["archive_metadata"] = self.get_metadata( - doc.archive_path, - "application/pdf", - ) - else: - meta["archive_size"] = None - meta["archive_metadata"] = None - - return Response(meta) - - @action(methods=["get"], detail=True) - def suggestions(self, request, pk=None): - doc = get_object_or_404(Document, pk=pk) - - classifier = load_classifier() - - gen = parse_date_generator(doc.filename, doc.content) - dates = sorted( - {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, - ) - - return Response( - { - "correspondents": [c.id for c in match_correspondents(doc, classifier)], - "tags": [t.id for t in match_tags(doc, classifier)], - "document_types": [ - dt.id for dt in match_document_types(doc, classifier) - ], - "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], - "dates": [ - date.strftime("%Y-%m-%d") for date in dates if date is not None - ], - }, - ) - - @action(methods=["get"], detail=True) - def preview(self, request, pk=None): - try: - response = self.file_response(pk, request, "inline") - return response - except (FileNotFoundError, Document.DoesNotExist): - raise Http404 - - @action(methods=["get"], detail=True) - @method_decorator(cache_control(public=False, max_age=315360000)) - def thumb(self, request, pk=None): - try: - doc = Document.objects.get(id=pk) - if doc.storage_type == Document.STORAGE_TYPE_GPG: - handle = GnuPG.decrypted(doc.thumbnail_file) - else: - handle = doc.thumbnail_file - # TODO: Send ETag information and use that to send new thumbnails - # if 
available - - return HttpResponse(handle, content_type="image/webp") - except (FileNotFoundError, Document.DoesNotExist): - raise Http404 - - @action(methods=["get"], detail=True) - def download(self, request, pk=None): - try: - return self.file_response(pk, request, "attachment") - except (FileNotFoundError, Document.DoesNotExist): - raise Http404 - - def getNotes(self, doc): - return [ - { - "id": c.id, - "note": c.note, - "created": c.created, - "user": { - "id": c.user.id, - "username": c.user.username, - "first_name": c.user.first_name, - "last_name": c.user.last_name, - }, - } - for c in Note.objects.filter(document=doc).order_by("-created") - ] - - @action(methods=["get", "post", "delete"], detail=True) - def notes(self, request, pk=None): - try: - doc = Document.objects.get(pk=pk) - except Document.DoesNotExist: - raise Http404 - - currentUser = request.user - - if request.method == "GET": - try: - return Response(self.getNotes(doc)) - except Exception as e: - logger.warning(f"An error occurred retrieving notes: {str(e)}") - return Response( - {"error": "Error retreiving notes, check logs for more detail."}, - ) - elif request.method == "POST": - try: - c = Note.objects.create( - document=doc, - note=request.data["note"], - user=currentUser, - ) - c.save() - - from documents import index - - index.add_or_update_document(self.get_object()) - - return Response(self.getNotes(doc)) - except Exception as e: - logger.warning(f"An error occurred saving note: {str(e)}") - return Response( - { - "error": "Error saving note, check logs for more detail.", - }, - ) - elif request.method == "DELETE": - note = Note.objects.get(id=int(request.GET.get("id"))) - note.delete() - - from documents import index - - index.add_or_update_document(self.get_object()) - - return Response(self.getNotes(doc)) - - return Response( - { - "error": "error", - }, - ) - - @action(methods=["get", "post"], detail=True) - def index_field_metadata(self, request, pk=None): - try: - doc = Document.objects.get(pk=pk) - except Document.DoesNotExist: - raise Http404 - - currentUser = request.user - - if request.method == "GET": - try: - return Response(Metadata.objects.filter(document=doc).order_by("-created")) - except Exception as e: - logger.warning(f"An error occurred retrieving metadatas: {str(e)}") - return Response( - {"error": "Error retreiving metadatas, check logs for more detail."}, - ) - elif request.method == "POST": - try: - c = Metadata.objects.create( - document=doc, - data=request.data["metadata"], - user=currentUser, - ) - c.save() - - from documents import index - - index.add_or_update_document(self.get_object()) - - return Response(str(c.data)) - except Exception as e: - logger.warning(f"An error occurred saving metadata: {str(e)}") - return Response( - { - "error": "Error saving metadata, check logs for more detail.", - }, - ) - - -class SearchResultSerializer(DocumentSerializer, PassUserMixin): - def to_representation(self, instance): - doc = Document.objects.get(id=instance["id"]) - notes = ",".join( - [str(c.note) for c in Note.objects.filter(document=instance["id"])], - ) - r = super().to_representation(doc) - r["__search_hit__"] = { - "score": instance.score, - "highlights": instance.highlights("content", text=doc.content), - "note_highlights": instance.highlights("notes", text=notes) - if doc - else None, - "rank": instance.rank, - } - - return r - - -class UnifiedSearchViewSet(DocumentViewSet): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.searcher = None - - 
def get_serializer_class(self): - if self._is_search_request(): - return SearchResultSerializer - else: - return DocumentSerializer - - def _is_search_request(self): - return ( - "query" in self.request.query_params - or "more_like_id" in self.request.query_params - ) - - def filter_queryset(self, queryset): - if self._is_search_request(): - from documents import index - - if hasattr(self.request, "user"): - # pass user to query for perms - self.request.query_params._mutable = True - self.request.query_params["user"] = self.request.user.id - self.request.query_params._mutable = False - - if "query" in self.request.query_params: - query_class = index.DelayedFullTextQuery - elif "more_like_id" in self.request.query_params: - query_class = index.DelayedMoreLikeThisQuery - else: - raise ValueError - - return query_class( - self.searcher, - self.request.query_params, - self.paginator.get_page_size(self.request), - ) - else: - return super().filter_queryset(queryset) - - def list(self, request, *args, **kwargs): - if self._is_search_request(): - from documents import index - - try: - with index.open_index_searcher() as s: - self.searcher = s - return super().list(request) - except NotFound: - raise - except Exception as e: - return HttpResponseBadRequest(str(e)) - else: - return super().list(request) - - -class LogViewSet(ViewSet): - - permission_classes = (IsAuthenticated, PaperlessAdminPermissions) - - log_files = ["paperless", "mail"] - - def get_log_filename(self, log): - return os.path.join(settings.LOGGING_DIR, f"{log}.log") - - def retrieve(self, request, pk=None, *args, **kwargs): - if pk not in self.log_files: - raise Http404 - - filename = self.get_log_filename(pk) - - if not os.path.isfile(filename): - raise Http404 - - with open(filename) as f: - lines = [line.rstrip() for line in f.readlines()] - - return Response(lines) - - def list(self, request, *args, **kwargs): - exist = [ - log for log in self.log_files if os.path.isfile(self.get_log_filename(log)) - ] - return Response(exist) - - -class SavedViewViewSet(ModelViewSet, PassUserMixin): - model = SavedView - - queryset = SavedView.objects.all() - serializer_class = SavedViewSerializer - pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - - def get_queryset(self): - user = self.request.user - return SavedView.objects.filter(owner=user) - - def perform_create(self, serializer): - serializer.save(owner=self.request.user) - - -class BulkEditView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = BulkEditSerializer - parser_classes = (parsers.JSONParser,) - - def post(self, request, *args, **kwargs): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - method = serializer.validated_data.get("method") - parameters = serializer.validated_data.get("parameters") - documents = serializer.validated_data.get("documents") - - try: - # TODO: parameter validation - result = method(documents, **parameters) - return Response({"result": result}) - except Exception as e: - return HttpResponseBadRequest(str(e)) - - -class PostDocumentView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = PostDocumentSerializer - parser_classes = (parsers.MultiPartParser,) - - def post(self, request, *args, **kwargs): - - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - doc_name, doc_data = serializer.validated_data.get("document") - correspondent_id 
= serializer.validated_data.get("correspondent") - document_type_id = serializer.validated_data.get("document_type") - tag_ids = serializer.validated_data.get("tags") - title = serializer.validated_data.get("title") - created = serializer.validated_data.get("created") - archive_serial_number = serializer.validated_data.get("archive_serial_number") - storage_path_id = serializer.validated_data.get("storage_path_id") - full_path = serializer.validated_data.get("full_path") - - logger.debug(f"storage_path_id: {storage_path_id}") - - t = int(mktime(datetime.now().timetuple())) - - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - - temp_file_path = Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / Path( - pathvalidate.sanitize_filename(doc_name), - ) - - temp_file_path.write_bytes(doc_data) - - os.utime(temp_file_path, times=(t, t)) - - input_doc = ConsumableDocument( - source=DocumentSource.ApiUpload, - original_file=temp_file_path, - ) - input_doc_overrides = DocumentMetadataOverrides( - filename=doc_name, - title=title, - correspondent_id=correspondent_id, - document_type_id=document_type_id, - tag_ids=tag_ids, - created=created, - asn=archive_serial_number, - owner_id=request.user.id, - storage_path_id=storage_path_id, - full_path=full_path, - ) - - async_task = consume_file.delay( - input_doc, - input_doc_overrides, - ) - - return Response(async_task.id) - - -class SelectionDataView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = DocumentListSerializer - parser_classes = (parsers.MultiPartParser, parsers.JSONParser) - - def post(self, request, format=None): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - ids = serializer.validated_data.get("documents") - - correspondents = Correspondent.objects.annotate( - document_count=Count( - Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), - ), - ) - - tags = Tag.objects.annotate( - document_count=Count( - Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), - ), - ) - - types = DocumentType.objects.annotate( - document_count=Count( - Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), - ), - ) - - storage_paths = StoragePath.objects.annotate( - document_count=Count( - Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), - ), - ) - - r = Response( - { - "selected_correspondents": [ - {"id": t.id, "document_count": t.document_count} - for t in correspondents - ], - "selected_tags": [ - {"id": t.id, "document_count": t.document_count} for t in tags - ], - "selected_document_types": [ - {"id": t.id, "document_count": t.document_count} for t in types - ], - "selected_storage_paths": [ - {"id": t.id, "document_count": t.document_count} - for t in storage_paths - ], - }, - ) - - return r - - -class SearchAutoCompleteView(APIView): - - permission_classes = (IsAuthenticated,) - - def get(self, request, format=None): - if "term" in request.query_params: - term = request.query_params["term"] - else: - return HttpResponseBadRequest("Term required") - - if "limit" in request.query_params: - limit = int(request.query_params["limit"]) - if limit <= 0: - return HttpResponseBadRequest("Invalid limit") - else: - limit = 10 - - from documents import index - - ix = index.open_index() - - return Response(index.autocomplete(ix, term, limit)) - - -class StatisticsView(APIView): - - permission_classes = (IsAuthenticated,) - - def get(self, request, format=None): - documents_total = 
Document.objects.all().count() - - inbox_tag = Tag.objects.filter(is_inbox_tag=True) - - documents_inbox = ( - Document.objects.filter(tags__is_inbox_tag=True).distinct().count() - if inbox_tag.exists() - else None - ) - - document_file_type_counts = ( - Document.objects.values("mime_type") - .annotate(mime_type_count=Count("mime_type")) - .order_by("-mime_type_count") - if documents_total > 0 - else 0 - ) - - character_count = ( - Document.objects.annotate( - characters=Length("content"), - ) - .aggregate(Sum("characters")) - .get("characters__sum") - ) - - return Response( - { - "documents_total": documents_total, - "documents_inbox": documents_inbox, - "inbox_tag": inbox_tag.first().pk if inbox_tag.exists() else None, - "document_file_type_counts": document_file_type_counts, - "character_count": character_count, - }, - ) - - -class BulkDownloadView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = BulkDownloadSerializer - parser_classes = (parsers.JSONParser,) - - def post(self, request, format=None): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - ids = serializer.validated_data.get("documents") - compression = serializer.validated_data.get("compression") - content = serializer.validated_data.get("content") - follow_filename_format = serializer.validated_data.get("follow_formatting") - - os.makedirs(settings.SCRATCH_DIR, exist_ok=True) - temp = tempfile.NamedTemporaryFile( - dir=settings.SCRATCH_DIR, - suffix="-compressed-archive", - delete=False, - ) - - if content == "both": - strategy_class = OriginalAndArchiveStrategy - elif content == "originals": - strategy_class = OriginalsOnlyStrategy - else: - strategy_class = ArchiveOnlyStrategy - - with zipfile.ZipFile(temp.name, "w", compression) as zipf: - strategy = strategy_class(zipf, follow_filename_format) - for id in ids: - doc = Document.objects.get(id=id) - strategy.add_document(doc) - - with open(temp.name, "rb") as f: - response = HttpResponse(f, content_type="application/zip") - response["Content-Disposition"] = '{}; filename="{}"'.format( - "attachment", - "documents.zip", - ) - - return response - - -class RemoteVersionView(GenericAPIView): - def get(self, request, format=None): - remote_version = "0.0.0" - is_greater_than_current = False - current_version = packaging_version.parse(version.__full_version_str__) - try: - req = urllib.request.Request( - "https://api.github.com/repos/paperless-ngx/" - "paperless-ngx/releases/latest", - ) - # Ensure a JSON response - req.add_header("Accept", "application/json") - - with urllib.request.urlopen(req) as response: - remote = response.read().decode("utf-8") - try: - remote_json = json.loads(remote) - remote_version = remote_json["tag_name"] - # Basically PEP 616 but that only went in 3.9 - if remote_version.startswith("ngx-"): - remote_version = remote_version[len("ngx-") :] - except ValueError: - logger.debug("An error occurred parsing remote version json") - except urllib.error.URLError: - logger.debug("An error occurred checking for available updates") - - is_greater_than_current = ( - packaging_version.parse( - remote_version, - ) - > current_version - ) - - return Response( - { - "version": remote_version, - "update_available": is_greater_than_current, - }, - ) - - -class StoragePathViewSet(ModelViewSet, PassUserMixin): - model = StoragePath - - queryset = StoragePath.objects.annotate(document_count=Count("documents")).order_by( - Lower("name"), - ) - - serializer_class = StoragePathSerializer - 
pagination_class = StandardPagination - permission_classes = (IsAuthenticated, PaperlessObjectPermissions) - filter_backends = (DjangoFilterBackend, OrderingFilter) - filterset_class = StoragePathFilterSet - ordering_fields = ("name", "path", "matching_algorithm", "match", "document_count") - - -class UiSettingsView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = UiSettingsViewSerializer - - def get(self, request, format=None): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - user = User.objects.get(pk=request.user.id) - ui_settings = {} - if hasattr(user, "ui_settings"): - ui_settings = user.ui_settings.settings - if "update_checking" in ui_settings: - ui_settings["update_checking"][ - "backend_setting" - ] = settings.ENABLE_UPDATE_CHECK - else: - ui_settings["update_checking"] = { - "backend_setting": settings.ENABLE_UPDATE_CHECK, - } - # strip . - roles = map(lambda perm: re.sub(r"^\w+.", "", perm), user.get_all_permissions()) - return Response( - { - "user": { - "id": user.id, - "username": user.username, - "is_superuser": user.is_superuser, - "groups": user.groups.values_list("id", flat=True), - }, - "settings": ui_settings, - "permissions": roles, - }, - ) - - def post(self, request, format=None): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - serializer.save(user=self.request.user) - - return Response( - { - "success": True, - }, - ) - - -class TasksViewSet(ReadOnlyModelViewSet): - - permission_classes = (IsAuthenticated,) - serializer_class = TasksViewSerializer - - def get_queryset(self): - queryset = ( - PaperlessTask.objects.filter( - acknowledged=False, - ) - .order_by("date_created") - .reverse() - ) - task_id = self.request.query_params.get("task_id") - if task_id is not None: - queryset = PaperlessTask.objects.filter(task_id=task_id) - return queryset - - -class AcknowledgeTasksView(GenericAPIView): - - permission_classes = (IsAuthenticated,) - serializer_class = AcknowledgeTasksViewSerializer - - def post(self, request, *args, **kwargs): - serializer = self.get_serializer(data=request.data) - serializer.is_valid(raise_exception=True) - - tasks = serializer.validated_data.get("tasks") - - try: - result = PaperlessTask.objects.filter(id__in=tasks).update( - acknowledged=True, - ) - return Response({"result": result}) - except Exception: - return HttpResponseBadRequest() +import itertools +import json +import logging +import os +import re +import tempfile +import urllib +import zipfile +from datetime import datetime +from pathlib import Path +from time import mktime +from unicodedata import normalize +from urllib.parse import quote + +import pathvalidate +from django.conf import settings +from django.contrib.auth.models import User +from django.db.models import Case +from django.db.models import Count +from django.db.models import IntegerField +from django.db.models import Max +from django.db.models import Sum +from django.db.models import When +from django.db.models.functions import Length +from django.db.models.functions import Lower +from django.http import Http404 +from django.http import HttpResponse +from django.http import HttpResponseBadRequest +from django.shortcuts import get_object_or_404 +from django.utils.decorators import method_decorator +from django.utils.translation import get_language +from django.views.decorators.cache import cache_control +from django.views.generic import TemplateView +from 
django_filters.rest_framework import DjangoFilterBackend +from documents.filters import ObjectOwnedOrGrantedPermissionsFilter +from documents.permissions import PaperlessAdminPermissions +from documents.permissions import PaperlessObjectPermissions +from documents.tasks import consume_file +from langdetect import detect +from packaging import version as packaging_version +from paperless import version +from paperless.db import GnuPG +from paperless.views import StandardPagination +from rest_framework import parsers +from rest_framework.decorators import action +from rest_framework.exceptions import NotFound +from rest_framework.filters import OrderingFilter +from rest_framework.filters import SearchFilter +from rest_framework.generics import GenericAPIView +from rest_framework.mixins import CreateModelMixin +from rest_framework.mixins import DestroyModelMixin +from rest_framework.mixins import ListModelMixin +from rest_framework.mixins import RetrieveModelMixin +from rest_framework.mixins import UpdateModelMixin +from rest_framework.permissions import IsAuthenticated +from rest_framework.response import Response +from rest_framework.views import APIView +from rest_framework.viewsets import GenericViewSet +from rest_framework.viewsets import ModelViewSet +from rest_framework.viewsets import ReadOnlyModelViewSet +from rest_framework.viewsets import ViewSet + +from .bulk_download import ArchiveOnlyStrategy +from .bulk_download import OriginalAndArchiveStrategy +from .bulk_download import OriginalsOnlyStrategy +from .classifier import load_classifier +from .data_models import ConsumableDocument +from .data_models import DocumentMetadataOverrides +from .data_models import DocumentSource +from .filters import CorrespondentFilterSet +from .filters import DocumentFilterSet +from .filters import DocumentTypeFilterSet +from .filters import StoragePathFilterSet +from .filters import TagFilterSet +from .matching import match_correspondents +from .matching import match_document_types +from .matching import match_storage_paths +from .matching import match_tags +from .models import Correspondent, Metadata +from .models import Document +from .models import DocumentType +from .models import Note +from .models import PaperlessTask +from .models import SavedView +from .models import StoragePath +from .models import Tag +from .parsers import get_parser_class_for_mime_type +from .parsers import parse_date_generator +from .serialisers import AcknowledgeTasksViewSerializer +from .serialisers import BulkDownloadSerializer +from .serialisers import BulkEditSerializer +from .serialisers import CorrespondentSerializer +from .serialisers import DocumentListSerializer +from .serialisers import DocumentSerializer +from .serialisers import DocumentTypeSerializer +from .serialisers import PostDocumentSerializer +from .serialisers import SavedViewSerializer +from .serialisers import StoragePathSerializer +from .serialisers import TagSerializer +from .serialisers import TagSerializerVersion1 +from .serialisers import TasksViewSerializer +from .serialisers import UiSettingsViewSerializer + +logger = logging.getLogger("paperless.api") + + +class IndexView(TemplateView): + template_name = "index.html" + + def get_frontend_language(self): + if hasattr( + self.request.user, + "ui_settings", + ) and self.request.user.ui_settings.settings.get("language"): + lang = self.request.user.ui_settings.settings.get("language") + else: + lang = get_language() + # This is here for the following reason: + # Django identifies languages in the 
form "en-us" + # However, angular generates locales as "en-US". + # this translates between these two forms. + if "-" in lang: + first = lang[: lang.index("-")] + second = lang[lang.index("-") + 1 :] + return f"{first}-{second.upper()}" + else: + return lang + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + context["cookie_prefix"] = settings.COOKIE_PREFIX + context["username"] = self.request.user.username + context["full_name"] = self.request.user.get_full_name() + context["styles_css"] = f"frontend/{self.get_frontend_language()}/styles.css" + context["runtime_js"] = f"frontend/{self.get_frontend_language()}/runtime.js" + context[ + "polyfills_js" + ] = f"frontend/{self.get_frontend_language()}/polyfills.js" + context["main_js"] = f"frontend/{self.get_frontend_language()}/main.js" + context[ + "webmanifest" + ] = f"frontend/{self.get_frontend_language()}/manifest.webmanifest" # noqa: E501 + context[ + "apple_touch_icon" + ] = f"frontend/{self.get_frontend_language()}/apple-touch-icon.png" # noqa: E501 + return context + + +class PassUserMixin(CreateModelMixin): + """ + Pass a user object to serializer + """ + + def get_serializer(self, *args, **kwargs): + kwargs.setdefault("user", self.request.user) + return super().get_serializer(*args, **kwargs) + + +class CorrespondentViewSet(ModelViewSet, PassUserMixin): + model = Correspondent + + queryset = Correspondent.objects.annotate( + document_count=Count("documents"), + last_correspondence=Max("documents__created"), + ).order_by(Lower("name")) + + serializer_class = CorrespondentSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated, PaperlessObjectPermissions) + filter_backends = ( + DjangoFilterBackend, + OrderingFilter, + ObjectOwnedOrGrantedPermissionsFilter, + ) + filterset_class = CorrespondentFilterSet + ordering_fields = ( + "name", + "matching_algorithm", + "match", + "document_count", + "last_correspondence", + ) + + +class TagViewSet(ModelViewSet, PassUserMixin): + model = Tag + + queryset = Tag.objects.annotate(document_count=Count("documents")).order_by( + Lower("name"), + ) + + def get_serializer_class(self, *args, **kwargs): + print(self.request.version) + if int(self.request.version) == 1: + return TagSerializerVersion1 + else: + return TagSerializer + + pagination_class = StandardPagination + permission_classes = (IsAuthenticated, PaperlessObjectPermissions) + filter_backends = ( + DjangoFilterBackend, + OrderingFilter, + ObjectOwnedOrGrantedPermissionsFilter, + ) + filterset_class = TagFilterSet + ordering_fields = ("color", "name", "matching_algorithm", "match", "document_count") + + +class DocumentTypeViewSet(ModelViewSet, PassUserMixin): + model = DocumentType + + queryset = DocumentType.objects.annotate( + document_count=Count("documents"), + ).order_by(Lower("name")) + + serializer_class = DocumentTypeSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated, PaperlessObjectPermissions) + filter_backends = ( + DjangoFilterBackend, + OrderingFilter, + ObjectOwnedOrGrantedPermissionsFilter, + ) + filterset_class = DocumentTypeFilterSet + ordering_fields = ("name", "matching_algorithm", "match", "document_count") + + +class DocumentViewSet( + PassUserMixin, + RetrieveModelMixin, + UpdateModelMixin, + DestroyModelMixin, + ListModelMixin, + GenericViewSet, +): + model = Document + queryset = Document.objects.annotate(num_notes=Count("notes")) + serializer_class = DocumentSerializer + pagination_class = 
StandardPagination + permission_classes = (IsAuthenticated, PaperlessObjectPermissions) + filter_backends = ( + DjangoFilterBackend, + SearchFilter, + OrderingFilter, + ObjectOwnedOrGrantedPermissionsFilter, + ) + filterset_class = DocumentFilterSet + search_fields = ("title", "correspondent__name", "content") + ordering_fields = ( + "id", + "title", + "correspondent__name", + "document_type__name", + "created", + "modified", + "added", + "archive_serial_number", + "num_notes", + ) + + def get_queryset(self): + return Document.objects.distinct().annotate(num_notes=Count("notes")) + + def get_serializer(self, *args, **kwargs): + super().get_serializer(*args, **kwargs) + fields_param = self.request.query_params.get("fields", None) + fields = fields_param.split(",") if fields_param else None + truncate_content = self.request.query_params.get("truncate_content", "False") + serializer_class = self.get_serializer_class() + kwargs.setdefault("context", self.get_serializer_context()) + kwargs.setdefault("fields", fields) + kwargs.setdefault("truncate_content", truncate_content.lower() in ["true", "1"]) + return serializer_class(*args, **kwargs) + + def update(self, request, *args, **kwargs): + response = super().update(request, *args, **kwargs) + from documents import index + + index.add_or_update_document(self.get_object()) + return response + + def destroy(self, request, *args, **kwargs): + from documents import index + + index.remove_document_from_index(self.get_object()) + return super().destroy(request, *args, **kwargs) + + @staticmethod + def original_requested(request): + return ( + "original" in request.query_params + and request.query_params["original"] == "true" + ) + + def file_response(self, pk, request, disposition): + doc = Document.objects.get(id=pk) + if not self.original_requested(request) and doc.has_archive_version: + file_handle = doc.archive_file + filename = doc.get_public_filename(archive=True) + mime_type = "application/pdf" + else: + file_handle = doc.source_file + filename = doc.get_public_filename() + mime_type = doc.mime_type + # Support browser previewing csv files by using text mime type + if mime_type in {"application/csv", "text/csv"} and disposition == "inline": + mime_type = "text/plain" + + if doc.storage_type == Document.STORAGE_TYPE_GPG: + file_handle = GnuPG.decrypted(file_handle) + + response = HttpResponse(file_handle, content_type=mime_type) + # Firefox is not able to handle unicode characters in filename field + # RFC 5987 addresses this issue + # see https://datatracker.ietf.org/doc/html/rfc5987#section-4.2 + # Chromium cannot handle commas in the filename + filename_normalized = normalize("NFKD", filename.replace(",", "_")).encode( + "ascii", + "ignore", + ) + filename_encoded = quote(filename) + content_disposition = ( + f"{disposition}; " + f'filename="{filename_normalized}"; ' + f"filename*=utf-8''{filename_encoded}" + ) + response["Content-Disposition"] = content_disposition + return response + + def get_metadata(self, file, mime_type): + if not os.path.isfile(file): + return None + + parser_class = get_parser_class_for_mime_type(mime_type) + if parser_class: + parser = parser_class(progress_callback=None, logging_group=None) + + try: + return parser.extract_metadata(file, mime_type) + except Exception: + # TODO: cover GPG errors, remove later. 
+ return [] + else: + return [] + + def get_filesize(self, filename): + if os.path.isfile(filename): + return os.stat(filename).st_size + else: + return None + + @action(methods=["get"], detail=True) + def metadata(self, request, pk=None): + try: + doc = Document.objects.get(pk=pk) + except Document.DoesNotExist: + raise Http404 + + meta = { + "original_checksum": doc.checksum, + "original_size": self.get_filesize(doc.source_path), + "original_mime_type": doc.mime_type, + "media_filename": doc.filename, + "has_archive_version": doc.has_archive_version, + "original_metadata": self.get_metadata(doc.source_path, doc.mime_type), + "archive_checksum": doc.archive_checksum, + "archive_media_filename": doc.archive_filename, + "original_filename": doc.original_filename, + } + + lang = "en" + try: + lang = detect(doc.content) + except Exception: + pass + meta["lang"] = lang + + if doc.has_archive_version: + meta["archive_size"] = self.get_filesize(doc.archive_path) + meta["archive_metadata"] = self.get_metadata( + doc.archive_path, + "application/pdf", + ) + else: + meta["archive_size"] = None + meta["archive_metadata"] = None + + return Response(meta) + + @action(methods=["get"], detail=True) + def suggestions(self, request, pk=None): + doc = get_object_or_404(Document, pk=pk) + + classifier = load_classifier() + + gen = parse_date_generator(doc.filename, doc.content) + dates = sorted( + {i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)}, + ) + + return Response( + { + "correspondents": [c.id for c in match_correspondents(doc, classifier)], + "tags": [t.id for t in match_tags(doc, classifier)], + "document_types": [ + dt.id for dt in match_document_types(doc, classifier) + ], + "storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)], + "dates": [ + date.strftime("%Y-%m-%d") for date in dates if date is not None + ], + }, + ) + + @action(methods=["get"], detail=True) + def preview(self, request, pk=None): + try: + response = self.file_response(pk, request, "inline") + return response + except (FileNotFoundError, Document.DoesNotExist): + raise Http404 + + @action(methods=["get"], detail=True) + @method_decorator(cache_control(public=False, max_age=315360000)) + def thumb(self, request, pk=None): + try: + doc = Document.objects.get(id=pk) + if doc.storage_type == Document.STORAGE_TYPE_GPG: + handle = GnuPG.decrypted(doc.thumbnail_file) + else: + handle = doc.thumbnail_file + # TODO: Send ETag information and use that to send new thumbnails + # if available + + return HttpResponse(handle, content_type="image/webp") + except (FileNotFoundError, Document.DoesNotExist): + raise Http404 + + @action(methods=["get"], detail=True) + def download(self, request, pk=None): + try: + return self.file_response(pk, request, "attachment") + except (FileNotFoundError, Document.DoesNotExist): + raise Http404 + + def getNotes(self, doc): + return [ + { + "id": c.id, + "note": c.note, + "created": c.created, + "user": { + "id": c.user.id, + "username": c.user.username, + "first_name": c.user.first_name, + "last_name": c.user.last_name, + }, + } + for c in Note.objects.filter(document=doc).order_by("-created") + ] + + @action(methods=["get", "post", "delete"], detail=True) + def notes(self, request, pk=None): + try: + doc = Document.objects.get(pk=pk) + except Document.DoesNotExist: + raise Http404 + + currentUser = request.user + + if request.method == "GET": + try: + return Response(self.getNotes(doc)) + except Exception as e: + logger.warning(f"An error occurred retrieving notes: 
{str(e)}") + return Response( + {"error": "Error retreiving notes, check logs for more detail."}, + ) + elif request.method == "POST": + try: + c = Note.objects.create( + document=doc, + note=request.data["note"], + user=currentUser, + ) + c.save() + + from documents import index + + index.add_or_update_document(self.get_object()) + + return Response(self.getNotes(doc)) + except Exception as e: + logger.warning(f"An error occurred saving note: {str(e)}") + return Response( + { + "error": "Error saving note, check logs for more detail.", + }, + ) + elif request.method == "DELETE": + note = Note.objects.get(id=int(request.GET.get("id"))) + note.delete() + + from documents import index + + index.add_or_update_document(self.get_object()) + + return Response(self.getNotes(doc)) + + return Response( + { + "error": "error", + }, + ) + + @action(methods=["get", "post"], detail=True) + def index_field_metadata(self, request, pk=None): + try: + doc = Document.objects.get(pk=pk) + except Document.DoesNotExist: + raise Http404 + + currentUser = request.user + + if request.method == "GET": + try: + return Response(Metadata.objects.filter(document=doc).order_by("-created")) + except Exception as e: + logger.warning(f"An error occurred retrieving metadatas: {str(e)}") + return Response( + {"error": "Error retreiving metadatas, check logs for more detail."}, + ) + elif request.method == "POST": + try: + c = Metadata.objects.create( + document=doc, + data=request.data["metadata"], + user=currentUser, + ) + c.save() + + from documents import index + + index.add_or_update_document(self.get_object()) + + return Response(str(c.data)) + except Exception as e: + logger.warning(f"An error occurred saving metadata: {str(e)}") + return Response( + { + "error": "Error saving metadata, check logs for more detail.", + }, + ) + + +class SearchResultSerializer(DocumentSerializer, PassUserMixin): + def to_representation(self, instance): + doc = Document.objects.get(id=instance["id"]) + notes = ",".join( + [str(c.note) for c in Note.objects.filter(document=instance["id"])], + ) + r = super().to_representation(doc) + r["__search_hit__"] = { + "score": instance.score, + "highlights": instance.highlights("content", text=doc.content), + "note_highlights": instance.highlights("notes", text=notes) + if doc + else None, + "rank": instance.rank, + } + + return r + + +class UnifiedSearchViewSet(DocumentViewSet): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.searcher = None + + def get_serializer_class(self): + if self._is_search_request(): + return SearchResultSerializer + else: + return DocumentSerializer + + def _is_search_request(self): + return ( + "query" in self.request.query_params + or "more_like_id" in self.request.query_params + ) + + def filter_queryset(self, queryset): + if self._is_search_request(): + from documents import index + + if hasattr(self.request, "user"): + # pass user to query for perms + self.request.query_params._mutable = True + self.request.query_params["user"] = self.request.user.id + self.request.query_params._mutable = False + + if "query" in self.request.query_params: + query_class = index.DelayedFullTextQuery + elif "more_like_id" in self.request.query_params: + query_class = index.DelayedMoreLikeThisQuery + else: + raise ValueError + + return query_class( + self.searcher, + self.request.query_params, + self.paginator.get_page_size(self.request), + ) + else: + return super().filter_queryset(queryset) + + def list(self, request, *args, **kwargs): + if 
self._is_search_request(): + from documents import index + + try: + with index.open_index_searcher() as s: + self.searcher = s + return super().list(request) + except NotFound: + raise + except Exception as e: + return HttpResponseBadRequest(str(e)) + else: + return super().list(request) + + +class LogViewSet(ViewSet): + + permission_classes = (IsAuthenticated, PaperlessAdminPermissions) + + log_files = ["paperless", "mail"] + + def get_log_filename(self, log): + return os.path.join(settings.LOGGING_DIR, f"{log}.log") + + def retrieve(self, request, pk=None, *args, **kwargs): + if pk not in self.log_files: + raise Http404 + + filename = self.get_log_filename(pk) + + if not os.path.isfile(filename): + raise Http404 + + with open(filename) as f: + lines = [line.rstrip() for line in f.readlines()] + + return Response(lines) + + def list(self, request, *args, **kwargs): + exist = [ + log for log in self.log_files if os.path.isfile(self.get_log_filename(log)) + ] + return Response(exist) + + +class SavedViewViewSet(ModelViewSet, PassUserMixin): + model = SavedView + + queryset = SavedView.objects.all() + serializer_class = SavedViewSerializer + pagination_class = StandardPagination + permission_classes = (IsAuthenticated, PaperlessObjectPermissions) + + def get_queryset(self): + user = self.request.user + return SavedView.objects.filter(owner=user) + + def perform_create(self, serializer): + serializer.save(owner=self.request.user) + + +class BulkEditView(GenericAPIView): + + permission_classes = (IsAuthenticated,) + serializer_class = BulkEditSerializer + parser_classes = (parsers.JSONParser,) + + def post(self, request, *args, **kwargs): + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + method = serializer.validated_data.get("method") + parameters = serializer.validated_data.get("parameters") + documents = serializer.validated_data.get("documents") + + try: + # TODO: parameter validation + result = method(documents, **parameters) + return Response({"result": result}) + except Exception as e: + return HttpResponseBadRequest(str(e)) + + +class PostDocumentView(GenericAPIView): + + permission_classes = (IsAuthenticated,) + serializer_class = PostDocumentSerializer + parser_classes = (parsers.MultiPartParser,) + + def post(self, request, *args, **kwargs): + + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + doc_name, doc_data = serializer.validated_data.get("document") + correspondent_id = serializer.validated_data.get("correspondent") + document_type_id = serializer.validated_data.get("document_type") + tag_ids = serializer.validated_data.get("tags") + title = serializer.validated_data.get("title") + created = serializer.validated_data.get("created") + archive_serial_number = serializer.validated_data.get("archive_serial_number") + storage_path_id = serializer.validated_data.get("storage_path_id") + full_path = serializer.validated_data.get("full_path") + + logger.debug(f"storage_path_id: {storage_path_id}") + + t = int(mktime(datetime.now().timetuple())) + + os.makedirs(settings.SCRATCH_DIR, exist_ok=True) + + temp_file_path = Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / Path( + pathvalidate.sanitize_filename(doc_name), + ) + + temp_file_path.write_bytes(doc_data) + + os.utime(temp_file_path, times=(t, t)) + + input_doc = ConsumableDocument( + source=DocumentSource.ApiUpload, + original_file=temp_file_path, + ) + input_doc_overrides = DocumentMetadataOverrides( + filename=doc_name, + 
title=title, + correspondent_id=correspondent_id, + document_type_id=document_type_id, + tag_ids=tag_ids, + created=created, + asn=archive_serial_number, + owner_id=request.user.id, + storage_path_id=storage_path_id, + full_path=full_path, + ) + + async_task = consume_file.delay( + input_doc, + input_doc_overrides, + ) + + return Response(async_task.id) + + +class SelectionDataView(GenericAPIView): + + permission_classes = (IsAuthenticated,) + serializer_class = DocumentListSerializer + parser_classes = (parsers.MultiPartParser, parsers.JSONParser) + + def post(self, request, format=None): + serializer = self.get_serializer(data=request.data) + serializer.is_valid(raise_exception=True) + + ids = serializer.validated_data.get("documents") + + correspondents = Correspondent.objects.annotate( + document_count=Count( + Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), + ), + ) + + tags = Tag.objects.annotate( + document_count=Count( + Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), + ), + ) + + types = DocumentType.objects.annotate( + document_count=Count( + Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), + ), + ) + + storage_paths = StoragePath.objects.annotate( + document_count=Count( + Case(When(documents__id__in=ids, then=1), output_field=IntegerField()), + ), + ) + + r = Response( + { + "selected_correspondents": [ + {"id": t.id, "document_count": t.document_count} + for t in correspondents + ], + "selected_tags": [ + {"id": t.id, "document_count": t.document_count} for t in tags + ], + "selected_document_types": [ + {"id": t.id, "document_count": t.document_count} for t in types + ], + "selected_storage_paths": [ + {"id": t.id, "document_count": t.document_count} + for t in storage_paths + ], + }, + ) + + return r + + +class SearchAutoCompleteView(APIView): + + permission_classes = (IsAuthenticated,) + + def get(self, request, format=None): + if "term" in request.query_params: + term = request.query_params["term"] + else: + return HttpResponseBadRequest("Term required") + + if "limit" in request.query_params: + limit = int(request.query_params["limit"]) + if limit <= 0: + return HttpResponseBadRequest("Invalid limit") + else: + limit = 10 + + from documents import index + + ix = index.open_index() + + return Response(index.autocomplete(ix, term, limit)) + + +class StatisticsView(APIView): + + permission_classes = (IsAuthenticated,) + + def get(self, request, format=None): + documents_total = Document.objects.all().count() + + inbox_tag = Tag.objects.filter(is_inbox_tag=True) + + documents_inbox = ( + Document.objects.filter(tags__is_inbox_tag=True).distinct().count() + if inbox_tag.exists() + else None + ) + + document_file_type_counts = ( + Document.objects.values("mime_type") + .annotate(mime_type_count=Count("mime_type")) + .order_by("-mime_type_count") + if documents_total > 0 + else 0 + ) + + character_count = ( + Document.objects.annotate( + characters=Length("content"), + ) + .aggregate(Sum("characters")) + .get("characters__sum") + ) + + return Response( + { + "documents_total": documents_total, + "documents_inbox": documents_inbox, + "inbox_tag": inbox_tag.first().pk if inbox_tag.exists() else None, + "document_file_type_counts": document_file_type_counts, + "character_count": character_count, + }, + ) + + +class BulkDownloadView(GenericAPIView): + + permission_classes = (IsAuthenticated,) + serializer_class = BulkDownloadSerializer + parser_classes = (parsers.JSONParser,) + + def post(self, request, 
+
+
+class BulkDownloadView(GenericAPIView):
+
+    permission_classes = (IsAuthenticated,)
+    serializer_class = BulkDownloadSerializer
+    parser_classes = (parsers.JSONParser,)
+
+    def post(self, request, format=None):
+        serializer = self.get_serializer(data=request.data)
+        serializer.is_valid(raise_exception=True)
+
+        ids = serializer.validated_data.get("documents")
+        compression = serializer.validated_data.get("compression")
+        content = serializer.validated_data.get("content")
+        follow_filename_format = serializer.validated_data.get("follow_formatting")
+
+        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
+        temp = tempfile.NamedTemporaryFile(
+            dir=settings.SCRATCH_DIR,
+            suffix="-compressed-archive",
+            delete=False,
+        )
+
+        # Select the strategy that controls which file variants are written to the zip.
+        if content == "both":
+            strategy_class = OriginalAndArchiveStrategy
+        elif content == "originals":
+            strategy_class = OriginalsOnlyStrategy
+        else:
+            strategy_class = ArchiveOnlyStrategy
+
+        with zipfile.ZipFile(temp.name, "w", compression) as zipf:
+            strategy = strategy_class(zipf, follow_filename_format)
+            for id in ids:
+                doc = Document.objects.get(id=id)
+                strategy.add_document(doc)
+
+        with open(temp.name, "rb") as f:
+            response = HttpResponse(f, content_type="application/zip")
+            response["Content-Disposition"] = '{}; filename="{}"'.format(
+                "attachment",
+                "documents.zip",
+            )
+
+            return response
+
+
+class RemoteVersionView(GenericAPIView):
+    def get(self, request, format=None):
+        remote_version = "0.0.0"
+        is_greater_than_current = False
+        current_version = packaging_version.parse(version.__full_version_str__)
+        try:
+            req = urllib.request.Request(
+                "https://api.github.com/repos/paperless-ngx/"
+                "paperless-ngx/releases/latest",
+            )
+            # Ensure a JSON response
+            req.add_header("Accept", "application/json")
+
+            with urllib.request.urlopen(req) as response:
+                remote = response.read().decode("utf-8")
+            try:
+                remote_json = json.loads(remote)
+                remote_version = remote_json["tag_name"]
+                # Basically PEP 616 but that only went in 3.9
+                if remote_version.startswith("ngx-"):
+                    remote_version = remote_version[len("ngx-") :]
+            except ValueError:
+                logger.debug("An error occurred parsing remote version json")
+        except urllib.error.URLError:
+            logger.debug("An error occurred checking for available updates")
+
+        is_greater_than_current = (
+            packaging_version.parse(
+                remote_version,
+            )
+            > current_version
+        )
+
+        return Response(
+            {
+                "version": remote_version,
+                "update_available": is_greater_than_current,
+            },
+        )
+
+
+class StoragePathViewSet(ModelViewSet, PassUserMixin):
+    model = StoragePath
+
+    queryset = StoragePath.objects.annotate(document_count=Count("documents")).order_by(
+        Lower("name"),
+    )
+
+    serializer_class = StoragePathSerializer
+    pagination_class = StandardPagination
+    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
+    filter_backends = (DjangoFilterBackend, OrderingFilter)
+    filterset_class = StoragePathFilterSet
+    ordering_fields = ("name", "path", "matching_algorithm", "match", "document_count")
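+
+# Note on StoragePathViewSet above: document_count is supplied by the
+# Count("documents") annotation, which is why it can also be used as an ordering
+# field, and Lower("name") keeps the default ordering case-insensitive.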
+
+
+class UiSettingsView(GenericAPIView):
+
+    permission_classes = (IsAuthenticated,)
+    serializer_class = UiSettingsViewSerializer
+
+    def get(self, request, format=None):
+        serializer = self.get_serializer(data=request.data)
+        serializer.is_valid(raise_exception=True)
+
+        user = User.objects.get(pk=request.user.id)
+        ui_settings = {}
+        if hasattr(user, "ui_settings"):
+            ui_settings = user.ui_settings.settings
+        if "update_checking" in ui_settings:
+            ui_settings["update_checking"][
+                "backend_setting"
+            ] = settings.ENABLE_UPDATE_CHECK
+        else:
+            ui_settings["update_checking"] = {
+                "backend_setting": settings.ENABLE_UPDATE_CHECK,
+            }
+        # strip the app label prefix (e.g. "documents.") from the permission names
+        roles = map(lambda perm: re.sub(r"^\w+.", "", perm), user.get_all_permissions())
+        return Response(
+            {
+                "user": {
+                    "id": user.id,
+                    "username": user.username,
+                    "is_superuser": user.is_superuser,
+                    "groups": user.groups.values_list("id", flat=True),
+                },
+                "settings": ui_settings,
+                "permissions": roles,
+            },
+        )
+
+    def post(self, request, format=None):
+        serializer = self.get_serializer(data=request.data)
+        serializer.is_valid(raise_exception=True)
+
+        serializer.save(user=self.request.user)
+
+        return Response(
+            {
+                "success": True,
+            },
+        )
+
+
+class TasksViewSet(ReadOnlyModelViewSet):
+
+    permission_classes = (IsAuthenticated,)
+    serializer_class = TasksViewSerializer
+
+    def get_queryset(self):
+        # Default: unacknowledged tasks, newest first; a task_id query parameter
+        # instead restricts the result to that single task.
+        queryset = (
+            PaperlessTask.objects.filter(
+                acknowledged=False,
+            )
+            .order_by("date_created")
+            .reverse()
+        )
+        task_id = self.request.query_params.get("task_id")
+        if task_id is not None:
+            queryset = PaperlessTask.objects.filter(task_id=task_id)
+        return queryset
+
+
+class AcknowledgeTasksView(GenericAPIView):
+
+    permission_classes = (IsAuthenticated,)
+    serializer_class = AcknowledgeTasksViewSerializer
+
+    def post(self, request, *args, **kwargs):
+        serializer = self.get_serializer(data=request.data)
+        serializer.is_valid(raise_exception=True)
+
+        tasks = serializer.validated_data.get("tasks")
+
+        try:
+            result = PaperlessTask.objects.filter(id__in=tasks).update(
+                acknowledged=True,
+            )
+            return Response({"result": result})
+        except Exception:
+            return HttpResponseBadRequest()
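+
+
+# Illustrative request body for AcknowledgeTasksView above ("tasks" is the field
+# defined by AcknowledgeTasksViewSerializer and holds PaperlessTask primary keys):
+#
+#   {"tasks": [17, 18]}
+#
+# The response echoes the number of rows updated, e.g. {"result": 2}.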