# Generated by Django 3.1.6 on 2021-02-07 22:26
"""Data migration that moves archive files to the new archive naming scheme.

Old scheme: the archive name is the source file name with its extension
*replaced* by ``.pdf`` (``scan.jpg`` -> ``scan.pdf``).  Two documents whose
file names differ only in their extension therefore map to the same archive
path.

New scheme: ``.pdf`` is *appended* to the source file name unless it already
ends in ``.pdf`` (``scan.jpg`` -> ``scan.jpg.pdf``).

Documents whose old archive paths clash cannot be told apart on disk, so
their archive files are deleted and regenerated by parsing the original
document again.  All other archive files are simply moved.
"""
import hashlib
import logging
import os
import shutil

from django.conf import settings
from django.db import migrations


logger = logging.getLogger("paperless.migrations")


def archive_name_from_filename_old(filename):
    """Return the OLD-scheme archive name: extension replaced by ``.pdf``."""
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
    """Return the OLD-scheme archive path of ``doc`` below ARCHIVE_DIR.

    Documents without a file name use the zero-padded primary key.
    """
    if doc.filename:
        fname = archive_name_from_filename_old(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(settings.ARCHIVE_DIR, fname)


def archive_name_from_filename_new(filename):
    """Return the NEW-scheme archive name.

    The name is returned unchanged if its extension is exactly ``.pdf``
    (case-sensitive); otherwise ``.pdf`` is appended.
    """
    ext = os.path.splitext(filename)[1]
    if ext == ".pdf":
        return filename
    return filename + ".pdf"


def archive_path_new(doc):
    """Return the NEW-scheme archive path of ``doc`` below ARCHIVE_DIR.

    Documents without a file name use the zero-padded primary key.
    """
    if doc.filename:
        fname = archive_name_from_filename_new(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(settings.ARCHIVE_DIR, fname)


STORAGE_TYPE_GPG = "gpg"


def source_path(doc):
    """Return the path of the original file of ``doc`` below ORIGINALS_DIR.

    Falls back to ``<zero-padded pk><file_type>`` when the document has no
    file name, and appends ``.gpg`` for GPG-stored documents.
    """
    if doc.filename:
        fname = str(doc.filename)
    else:
        fname = "{:07}{}".format(doc.pk, doc.file_type)
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(settings.ORIGINALS_DIR, fname)


def _find_clashing_archive_ids(document_model):
    """Validate all archived documents and return the ids that clash.

    Returns the set of ids of documents that share their OLD archive path
    with at least one other document.

    Raises:
        ValueError: if an archive file is missing at its old path, or if a
            file already exists at the new path it would have to move to.
    """
    affected_document_ids = set()
    # Maps each old archive path to the first document id seen using it.
    old_archive_path_to_id = {}

    for doc in document_model.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")

        if old_path != new_path and os.path.isfile(new_path):
            raise ValueError(
                f"Need to move {old_path} to {new_path}, but target file "
                f"already exists")

        if old_path in old_archive_path_to_id:
            # Both this document and the earlier owner of the path are
            # affected: the file on disk belongs to at most one of them.
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    return affected_document_ids


def _ensure_parsers_available(document_model, document_ids):
    """Abort before touching any file if a document cannot be re-parsed.

    Raises:
        Exception: if no parser class is available for the mime type of one
            of the given documents.
    """
    if not document_ids:
        # Keep the application import lazy: nothing to check, nothing to load.
        return

    from documents.parsers import get_parser_class_for_mime_type

    for doc_id in document_ids:
        doc = document_model.objects.get(id=doc_id)
        if not get_parser_class_for_mime_type(doc.mime_type):
            raise Exception(
                f"document {doc.filename} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")


def _relocate_archives(document_model, affected_document_ids):
    """Delete archives of affected documents and move all others."""
    for doc in document_model.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if doc.id in affected_document_ids:
            # remove affected archive versions
            if os.path.isfile(old_path):
                os.unlink(old_path)
        elif (old_path != new_path
                and os.path.isfile(old_path)
                and not os.path.isfile(new_path)):
            # move unaffected archive versions
            logger.debug(
                f"Moving {old_path} to {new_path}"
            )
            shutil.move(old_path, new_path)


def _regenerate_archive(doc):
    """Re-parse the original of ``doc`` and store a fresh archive version.

    Updates ``doc.content`` and ``doc.archive_checksum`` and saves the
    document.  If the parser yields no archive file, or parsing fails with
    ``ParseError``, the checksum is cleared so that the database never refers
    to an archive file that does not exist.
    """
    from documents.parsers import (
        DocumentParser,
        ParseError,
        get_parser_class_for_mime_type,
    )

    logger.info(
        f"Regenerating archive document for {doc.filename}"
    )
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    parser: DocumentParser = parser_class(None, None)
    try:
        # source_path() copes with documents that have no file name, so the
        # name handed to the parser must as well: basename(None) would raise
        # TypeError and abort the migration after files were already deleted.
        file_name = os.path.basename(doc.filename or source_path(doc))
        parser.parse(source_path(doc), doc.mime_type, file_name)
        doc.content = parser.get_text()

        if parser.archive_path and os.path.isfile(parser.archive_path):
            with open(parser.archive_path, "rb") as f:
                doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
            shutil.copy2(parser.archive_path, archive_path_new(doc))
        else:
            doc.archive_checksum = None
            if os.path.isfile(archive_path_new(doc)):
                os.unlink(archive_path_new(doc))
        doc.save()
    except ParseError:
        logger.exception(
            f"Unable to regenerate archive document for {doc.filename}"
        )
        # The old archive file of this document has already been removed.
        # Clear the checksum so the document is not left pointing at a
        # missing archive file (same outcome as "parser produced no archive").
        doc.archive_checksum = None
        doc.save()
    finally:
        parser.cleanup()


def move_old_to_new_locations(apps, schema_editor):
    """Forward migration: switch archive files to the new naming scheme.

    Steps, in order:
      1. validate every archived document and collect those whose old
         archive paths clash,
      2. verify a parser exists for every clashing document,
      3. delete clashing archive files and move all others,
      4. regenerate the archive files of the clashing documents.

    Steps 1 and 2 raise before any file is modified.
    """
    Document = apps.get_model("documents", "Document")

    # check for documents that have incorrect archive versions
    affected_document_ids = _find_clashing_archive_ids(Document)

    # check that we can regenerate these archive versions
    _ensure_parsers_available(Document, affected_document_ids)

    # move files
    _relocate_archives(Document, affected_document_ids)

    # regenerate archive documents
    for doc_id in affected_document_ids:
        _regenerate_archive(Document.objects.get(id=doc_id))


def move_new_to_old_locations(apps, schema_editor):
    """Reverse migration: move archive files back to the old naming scheme.

    Raises:
        ValueError: before any file is moved, if two documents would end up
            with the same old archive path, or if a file already exists at
            an old path that another file has to be moved to.
    """
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    # First pass: make sure the move back is possible for every document.
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)

        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)

        if (new_archive_path != old_archive_path
                and os.path.isfile(old_archive_path)):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )

    # Second pass: actually move the files.
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1011_auto_20210101_2340'),
    ]

    operations = [
        migrations.RunPython(
            move_old_to_new_locations,
            move_new_to_old_locations
        )
    ]