195 lines
6.1 KiB
Python
195 lines
6.1 KiB
Python
# Generated by Django 3.1.6 on 2021-02-07 22:26
|
|
import hashlib
|
|
import logging
|
|
import os
|
|
import shutil
|
|
|
|
from django.conf import settings
|
|
from django.db import migrations
|
|
|
|
|
|
# Logger used by the migration functions in this file to report file moves
# and archive regeneration.
logger = logging.getLogger("paperless.migrations")
|
|
|
|
|
|
def archive_name_from_filename_old(filename):
    """Return the archive file name the old naming scheme derives from *filename*.

    The old scheme swaps the original extension for ``.pdf``, so two
    originals that differ only in their extension end up with the same
    archive name.
    """
    stem, _ = os.path.splitext(filename)
    return stem + ".pdf"
|
|
|
|
|
|
def archive_path_old(doc):
    """Return the archive path of *doc* under the old naming scheme.

    Documents with a file name use the old-style archive name derived from
    it; documents without one use their zero-padded primary key. The result
    is always located below ``settings.ARCHIVE_DIR``.
    """
    archive_name = (
        archive_name_from_filename_old(doc.filename)
        if doc.filename
        else "{:07}.pdf".format(doc.pk)
    )
    return os.path.join(settings.ARCHIVE_DIR, archive_name)
|
|
|
|
|
|
def archive_name_from_filename_new(filename):
    """Return the archive file name the new naming scheme derives from *filename*.

    A name whose extension is exactly ``.pdf`` (case-sensitive) is kept
    as is; any other name gets ``.pdf`` appended after its existing
    extension, so the original extension stays part of the archive name.
    """
    extension = os.path.splitext(filename)[1]
    return filename if extension == ".pdf" else filename + ".pdf"
|
|
|
|
|
|
def archive_path_new(doc):
    """Return the archive path of *doc* under the new naming scheme.

    Documents with a file name use the new-style archive name derived from
    it; documents without one use their zero-padded primary key. The result
    is always located below ``settings.ARCHIVE_DIR``.
    """
    archive_name = (
        archive_name_from_filename_new(doc.filename)
        if doc.filename
        else "{:07}.pdf".format(doc.pk)
    )
    return os.path.join(settings.ARCHIVE_DIR, archive_name)
|
|
|
|
|
|
# ``storage_type`` value that source_path() compares against when deciding
# whether to append a ".gpg" suffix to the file name.
STORAGE_TYPE_GPG = "gpg"
|
|
|
|
|
|
def source_path(doc):
    """Return the path of the original file of *doc* below ``settings.ORIGINALS_DIR``.

    A stored file name is used verbatim. Otherwise the name is built from
    the zero-padded primary key followed by ``doc.file_type``.
    """
    if doc.filename:
        return os.path.join(settings.ORIGINALS_DIR, str(doc.filename))

    # NOTE(review): indentation was lost in the source this was rebuilt from;
    # the ".gpg" suffix is reconstructed as applying only to generated names
    # (i.e. nested under the "no filename" branch) -- confirm.
    generated_name = "{:07}{}".format(doc.pk, doc.file_type)
    if doc.storage_type == STORAGE_TYPE_GPG:
        generated_name += ".gpg"  # pragma: no cover
    return os.path.join(settings.ORIGINALS_DIR, generated_name)
|
|
|
|
|
|
def _check_parsers_available(Document, doc_ids):
    """Raise if any document in *doc_ids* has no parser for its mime type."""
    # Imported lazily so the parser machinery is only loaded when at least
    # one archive version actually has to be regenerated.
    from documents.parsers import get_parser_class_for_mime_type

    for doc_id in doc_ids:
        doc = Document.objects.get(id=doc_id)
        parser_class = get_parser_class_for_mime_type(doc.mime_type)
        if not parser_class:
            raise Exception(
                f"document {doc.filename} has an invalid archived document, "
                f"but no parsers are available. Cannot migrate.")


def _regenerate_archive_version(doc):
    """Re-parse the original of *doc* and store a fresh archive file at its new path."""
    from documents.parsers import (
        DocumentParser,
        ParseError,
        get_parser_class_for_mime_type,
    )

    logger.info(
        f"Regenerating archive document for {doc.filename}"
    )
    parser_class = get_parser_class_for_mime_type(doc.mime_type)
    parser: DocumentParser = parser_class(None, None)
    try:
        original_path = source_path(doc)
        # source_path() supports documents without a stored file name, so
        # fall back to the name it generated instead of crashing in
        # os.path.basename(None).
        file_name = os.path.basename(doc.filename or original_path)
        parser.parse(original_path, doc.mime_type, file_name)
        doc.content = parser.get_text()

        new_path = archive_path_new(doc)
        if parser.archive_path and os.path.isfile(parser.archive_path):
            with open(parser.archive_path, "rb") as f:
                doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
            shutil.copy2(parser.archive_path, new_path)
        else:
            # The parser produced no archive version: drop the checksum and
            # make sure no stale file is left at the new location.
            doc.archive_checksum = None
            if os.path.isfile(new_path):
                os.unlink(new_path)
        doc.save()
    except ParseError:
        logger.exception(
            f"Unable to regenerate archive document for {doc.filename}"
        )
        # The old archive file of this document has already been removed.
        # Clear the checksum so the database does not keep claiming that an
        # archive version exists; otherwise later code (including the
        # reverse migration) would look for a file that is not there.
        doc.archive_checksum = None
        doc.save()
    finally:
        parser.cleanup()


def move_old_to_new_locations(apps, schema_editor):
    """Forward migration: move archive files from the old to the new naming scheme.

    Under the old scheme several originals could map to the same archive
    path. Documents sharing an old archive path are treated as "affected":
    their archive file is deleted and regenerated from the original. All
    other archive files are simply moved to their new path.

    The function works in four phases:

    1. Validate every document that has an archive checksum and collect the
       affected document ids.
    2. Verify that a parser exists for every affected document.
    3. Delete affected archive files and move the unaffected ones.
    4. Regenerate the archive version of every affected document. A
       document whose regeneration fails with ``ParseError`` is logged and
       left without an archive version (checksum cleared).

    Args:
        apps: Migration app registry; used to obtain the ``Document`` model.
        schema_editor: Unused.

    Raises:
        ValueError: If an archive file is missing at its old path, or a
            file already exists at the path it would have to be moved to.
        Exception: If an affected document has no parser for its mime type.
    """
    Document = apps.get_model("documents", "Document")

    affected_document_ids = set()
    old_archive_path_to_id = {}

    # Phase 1: check for documents that have incorrect archive versions.
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if not os.path.isfile(old_path):
            raise ValueError(
                f"Archived document of {doc.filename} does not exist at: "
                f"{old_path}")

        if old_path != new_path and os.path.isfile(new_path):
            raise ValueError(
                f"Need to move {old_path} to {new_path}, but target file "
                f"already exists")

        if old_path in old_archive_path_to_id:
            # Two documents share one old archive path: neither can be
            # trusted to own the file, so both get regenerated.
            affected_document_ids.add(doc.id)
            affected_document_ids.add(old_archive_path_to_id[old_path])
        else:
            old_archive_path_to_id[old_path] = doc.id

    # Phase 2: check that we can regenerate these archive versions before
    # any file is touched.
    if affected_document_ids:
        _check_parsers_available(Document, affected_document_ids)

    # Phase 3: move files.
    for doc in Document.objects.filter(archive_checksum__isnull=False):
        old_path = archive_path_old(doc)
        new_path = archive_path_new(doc)

        if doc.id in affected_document_ids:
            # Remove affected archive versions. The isfile() check is needed
            # because affected documents share the path, so the file may
            # already have been removed for an earlier document.
            if os.path.isfile(old_path):
                os.unlink(old_path)
        elif (
            old_path != new_path
            and os.path.isfile(old_path)
            and not os.path.isfile(new_path)
        ):
            # Move unaffected archive versions.
            logger.debug(
                f"Moving {old_path} to {new_path}"
            )
            shutil.move(old_path, new_path)

    # Phase 4: regenerate archive documents.
    for doc_id in affected_document_ids:
        _regenerate_archive_version(Document.objects.get(id=doc_id))
|
|
|
|
|
|
def move_new_to_old_locations(apps, schema_editor):
    """Reverse migration: move archive files back to the old naming scheme.

    All documents with an archive checksum are validated first; files are
    only moved once every move is known to be possible, so a validation
    failure leaves the archive directory untouched.

    Args:
        apps: Migration app registry; used to obtain the ``Document`` model.
        schema_editor: Unused.

    Raises:
        ValueError: If two documents would end up with the same old archive
            path, if a file already exists at a move target, or if the
            archive file to move does not exist.
    """
    Document = apps.get_model("documents", "Document")

    old_archive_paths = set()
    # (source, target) pairs, collected during validation so the move phase
    # does not have to query and recompute the paths a second time.
    pending_moves = []

    for doc in Document.objects.filter(archive_checksum__isnull=False):
        new_archive_path = archive_path_new(doc)
        old_archive_path = archive_path_old(doc)
        if old_archive_path in old_archive_paths:
            raise ValueError(
                f"Cannot migrate: Archive file name {old_archive_path} of "
                f"document {doc.filename} would clash with another archive "
                f"filename.")
        old_archive_paths.add(old_archive_path)

        if new_archive_path == old_archive_path:
            # Same name under both schemes; nothing to move.
            continue

        if os.path.isfile(old_archive_path):
            raise ValueError(
                f"Cannot migrate: Cannot move {new_archive_path} to "
                f"{old_archive_path}: file already exists."
            )
        # Fail here, before anything has been moved, rather than with a
        # FileNotFoundError from shutil.move() halfway through the moves.
        if not os.path.isfile(new_archive_path):
            raise ValueError(
                f"Cannot migrate: Archived document of {doc.filename} does "
                f"not exist at: {new_archive_path}"
            )
        pending_moves.append((new_archive_path, old_archive_path))

    for new_archive_path, old_archive_path in pending_moves:
        logger.debug(f"Moving {new_archive_path} to {old_archive_path}")
        shutil.move(new_archive_path, old_archive_path)
|
|
|
|
|
|
class Migration(migrations.Migration):
    """Data migration that relocates archive files between naming schemes.

    Applying it runs ``move_old_to_new_locations``; unapplying it runs
    ``move_new_to_old_locations``.
    """

    dependencies = [("documents", "1011_auto_20210101_2340")]

    operations = [
        migrations.RunPython(
            code=move_old_to_new_locations,
            reverse_code=move_new_to_old_locations,
        ),
    ]
|