paperless-ngx/src/documents/migrations/1012_fix_archive_files.py

# Generated by Django 3.1.6 on 2021-02-07 22:26
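"""
Fix the archive filename scheme.

The old scheme replaced the original file's extension with ".pdf", so two
originals differing only in extension (e.g. "scan.jpg" and "scan.png") were
assigned the same archive file. The new scheme appends ".pdf" unless the
filename already ends in it. This migration renames existing archive files
accordingly and regenerates the archive versions of documents that shared
an archive file.
"""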
import hashlib
import logging
import os
import shutil

from django.conf import settings
from django.db import migrations

logger = logging.getLogger("paperless.migrations")
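

# Copies of the old and new archive filename logic, inlined so that this
# migration does not depend on application code that may change later.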
def archive_name_from_filename_old(filename):
    return os.path.splitext(filename)[0] + ".pdf"


def archive_path_old(doc):
    if doc.filename:
        fname = archive_name_from_filename_old(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )


def archive_name_from_filename_new(filename):
    name, ext = os.path.splitext(filename)
    if ext == ".pdf":
        return filename
    else:
        return filename + ".pdf"


def archive_path_new(doc):
    if doc.filename:
        fname = archive_name_from_filename_new(doc.filename)
    else:
        fname = "{:07}.pdf".format(doc.pk)

    return os.path.join(
        settings.ARCHIVE_DIR,
        fname
    )


STORAGE_TYPE_GPG = "gpg"


def source_path(doc):
    if doc.filename:
        fname = str(doc.filename)
    else:
        # documents without a stored filename get a generated name;
        # encrypted originals additionally carry a .gpg suffix
        fname = "{:07}{}".format(doc.pk, doc.file_type)
        if doc.storage_type == STORAGE_TYPE_GPG:
            fname += ".gpg"  # pragma: no cover

    return os.path.join(
        settings.ORIGINALS_DIR,
        fname
    )
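

# Forward migration: find documents whose archive files clashed under the
# old naming scheme, verify that a parser is available to rebuild them,
# move the unaffected archive files to their new names, and regenerate the
# clashing archive versions from the original documents.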
def move_old_to_new_locations(apps, schema_editor):
    # deferred import: documents.parsers pulls in application code, which
    # must not be imported while the migration module itself is loaded
    from documents.parsers import (
        get_parser_class_for_mime_type,
        DocumentParser,
        ParseError,
    )

    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()
    old_archive_path_to_id = {}

    # check for documents that have incorrect archive versions
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")

        if old_path != new_path and os.path.isfile(new_path):
            raise ValueError(
                f"Need to move {old_path} to {new_path}, but target file "
                f"already exists")

        if old_path in old_archive_path_to_id:
            # two documents map to the same old archive file: both are
            # affected and need their archive versions regenerated
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # check that we can regenerate these archive versions
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise Exception(
                f"document {doc.filename} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")

    # move files
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if doc.id in affected_document_ids:
            # remove affected archive versions
            if os.path.isfile(old_path):
                os.unlink(old_path)
        else:
            # move unaffected archive versions
            if (old_path != new_path and os.path.isfile(old_path)
                    and not os.path.isfile(new_path)):
                logger.debug(f"Moving {old_path} to {new_path}")
                shutil.move(old_path, new_path)

    # regenerate archive documents
    for doc_id in affected_document_ids:
        doc = Document.objects.get(id=doc_id)
        logger.info(f"Regenerating archive document for {doc.filename}")
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        parser: DocumentParser = parser_class(None, None)
        try:
            parser.parse(source_path(doc), doc.mime_type,
                         os.path.basename(doc.filename))
            doc.content = parser.get_text()

            if parser.archive_path and os.path.isfile(parser.archive_path):
                with open(parser.archive_path, "rb") as f:
                    doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
                shutil.copy2(parser.archive_path, archive_path_new(doc))
            else:
                # the parser did not produce an archive version: clear the
                # checksum and remove any stale archive file
                doc.archive_checksum = None
                if os.path.isfile(archive_path_new(doc)):
                    os.unlink(archive_path_new(doc))
            doc.save()
        except ParseError:
            logger.exception(
                f"Unable to regenerate archive document for {doc.filename}"
            )
        finally:
            parser.cleanup()
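

# Reverse migration: move archive files back to their old names. This fails
# if two documents would map to the same old archive filename, since the
# clash resolved by the forward migration cannot be restored.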
def move_new_to_old_locations(apps, schema_editor):
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()

    # check that moving the archive files back will neither clash nor
    # overwrite existing files
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)

        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)

        if (new_archive_path != old_archive_path
                and os.path.isfile(old_archive_path)):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)

        if new_archive_path != old_archive_path:
            logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
            shutil.move(new_archive_path, old_archive_path)


class Migration(migrations.Migration):

    dependencies = [
        ('documents', '1011_auto_20210101_2340'),
    ]

    operations = [
        migrations.RunPython(
            move_old_to_new_locations,
            move_new_to_old_locations
        )
    ]