Saves work on an export/import process which is data only

This commit is contained in:
Trenton H 2024-05-17 10:04:12 -07:00
parent 04f52f553a
commit c3d7793e3c
5 changed files with 108 additions and 20 deletions

View File

@ -185,6 +185,13 @@ For PostgreSQL, refer to [Upgrading a PostgreSQL Cluster](https://www.postgresql
For MariaDB, refer to [Upgrading MariaDB](https://mariadb.com/kb/en/upgrading/) For MariaDB, refer to [Upgrading MariaDB](https://mariadb.com/kb/en/upgrading/)
You may also use the exporter and importer with the `--data-only` flag.
!!! warning
You should not change any settings, especially paths, when doing this, or there is a
risk of data loss.
## Downgrading Paperless {#downgrade-paperless} ## Downgrading Paperless {#downgrade-paperless}
Downgrades are possible. However, some updates also contain database Downgrades are possible. However, some updates also contain database
@ -269,6 +276,7 @@ optional arguments:
-sm, --split-manifest -sm, --split-manifest
-z, --zip -z, --zip
-zn, --zip-name -zn, --zip-name
--data-only
``` ```
`target` is a folder to which the data gets written. This includes `target` is a folder to which the data gets written. This includes
@ -327,6 +335,9 @@ If `-z` or `--zip` is provided, the export will be a zip file
in the target directory, named according to the current local date or the in the target directory, named according to the current local date or the
value set in `-zn` or `--zip-name`. value set in `-zn` or `--zip-name`.
If `--data-only` is provided, only the database will be exported. This option is intended
to facilitate database upgrades without needing to clean documents and thumbnails.
!!! warning !!! warning
If exporting with the file name format, there may be errors due to If exporting with the file name format, there may be errors due to
@ -345,6 +356,11 @@ and the script does the rest of the work:
document_importer source document_importer source
``` ```
| Option | Required | Default | Description |
| ----------- | -------- | ------- | ------------------------------------------------------------------------- |
| source | Yes | N/A | The directory containing an export |
| --data-only | No | False | If provided, only import data, do not import document files or thumbnails |
When you use the provided docker compose script, put the export inside When you use the provided docker compose script, put the export inside
the `export` folder in your paperless source directory. Specify the `export` folder in your paperless source directory. Specify
`../export` as the `source`. `../export` as the `source`.

View File

@ -5,7 +5,6 @@ import shutil
import tempfile import tempfile
import time import time
from pathlib import Path from pathlib import Path
from typing import Optional
import tqdm import tqdm
from django.conf import settings from django.conf import settings
@ -147,6 +146,13 @@ class Command(BaseCommand):
help="Sets the export zip file name", help="Sets the export zip file name",
) )
parser.add_argument(
"--data-only",
default=False,
action="store_true",
help="If set, only the database will be exported, not files",
)
parser.add_argument( parser.add_argument(
"--no-progress-bar", "--no-progress-bar",
default=False, default=False,
@ -166,6 +172,7 @@ class Command(BaseCommand):
self.delete = False self.delete = False
self.no_archive = False self.no_archive = False
self.no_thumbnail = False self.no_thumbnail = False
self.data_only = False
def handle(self, *args, **options): def handle(self, *args, **options):
self.target = Path(options["target"]).resolve() self.target = Path(options["target"]).resolve()
@ -177,14 +184,14 @@ class Command(BaseCommand):
self.no_archive: bool = options["no_archive"] self.no_archive: bool = options["no_archive"]
self.no_thumbnail: bool = options["no_thumbnail"] self.no_thumbnail: bool = options["no_thumbnail"]
self.zip_export: bool = options["zip"] self.zip_export: bool = options["zip"]
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
# If zipping, save the original target for later and # If zipping, save the original target for later and
# get a temporary directory for the target instead # get a temporary directory for the target instead
temp_dir = None temp_dir = None
self.original_target: Optional[Path] = None
if self.zip_export:
self.original_target = self.target self.original_target = self.target
if self.zip_export:
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
temp_dir = tempfile.TemporaryDirectory( temp_dir = tempfile.TemporaryDirectory(
dir=settings.SCRATCH_DIR, dir=settings.SCRATCH_DIR,
@ -203,7 +210,7 @@ class Command(BaseCommand):
try: try:
with FileLock(settings.MEDIA_LOCK): with FileLock(settings.MEDIA_LOCK):
self.dump(options["no_progress_bar"]) self.dump()
# We've written everything to the temporary directory in this case, # We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored # now make an archive in the original target, with all files stored
@ -222,7 +229,7 @@ class Command(BaseCommand):
if self.zip_export and temp_dir is not None: if self.zip_export and temp_dir is not None:
temp_dir.cleanup() temp_dir.cleanup()
def dump(self, progress_bar_disable=False): def dump(self):
# 1. Take a snapshot of what files exist in the current export folder # 1. Take a snapshot of what files exist in the current export folder
for x in self.target.glob("**/*"): for x in self.target.glob("**/*"):
if x.is_file(): if x.is_file():
@ -334,11 +341,15 @@ class Command(BaseCommand):
manifest += notes manifest += notes
manifest += custom_field_instances manifest += custom_field_instances
if self.data_only:
self.stdout.write(self.style.NOTICE("Data only export completed"))
return
# 3. Export files from each document # 3. Export files from each document
for index, document_dict in tqdm.tqdm( for index, document_dict in tqdm.tqdm(
enumerate(document_manifest), enumerate(document_manifest),
total=len(document_manifest), total=len(document_manifest),
disable=progress_bar_disable, disable=self.no_progress_bar,
): ):
# 3.1. store files unencrypted # 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED

View File

@ -57,6 +57,7 @@ class Command(BaseCommand):
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument("source") parser.add_argument("source")
parser.add_argument( parser.add_argument(
"--no-progress-bar", "--no-progress-bar",
default=False, default=False,
@ -64,6 +65,13 @@ class Command(BaseCommand):
help="If set, the progress bar will not be shown", help="If set, the progress bar will not be shown",
) )
# This is the importer command: the flag restricts what is imported, so the
# help text must say "imported" rather than "exported".
parser.add_argument(
    "--data-only",
    default=False,
    action="store_true",
    help="If set, only the database will be imported, not files",
)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs) BaseCommand.__init__(self, *args, **kwargs)
self.source = None self.source = None
@ -82,6 +90,10 @@ class Command(BaseCommand):
if not os.access(self.source, os.R_OK): if not os.access(self.source, os.R_OK):
raise CommandError("That path doesn't appear to be readable") raise CommandError("That path doesn't appear to be readable")
# Skip this check if operating only on the database
# We can expect data to exist
if not self.data_only:
for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]: for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
if document_dir.exists() and document_dir.is_dir(): if document_dir.exists() and document_dir.is_dir():
for entry in document_dir.glob("**/*"): for entry in document_dir.glob("**/*"):
@ -113,6 +125,8 @@ class Command(BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR logging.getLogger().handlers[0].level = logging.ERROR
self.source = Path(options["source"]).resolve() self.source = Path(options["source"]).resolve()
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.pre_check() self.pre_check()
@ -200,8 +214,13 @@ class Command(BaseCommand):
) )
raise e raise e
if not self.data_only:
self._import_files_from_manifest(options["no_progress_bar"]) self._import_files_from_manifest(options["no_progress_bar"])
else:
self.stdout.write(self.style.NOTICE("Data only import completed"))
self.stdout.write("Updating search index...") self.stdout.write("Updating search index...")
call_command( call_command(
"document_index", "document_index",

View File

@ -37,10 +37,16 @@ from documents.sanity_checker import check_sanity
from documents.settings import EXPORTER_FILE_NAME from documents.settings import EXPORTER_FILE_NAME
from documents.tests.utils import DirectoriesMixin from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
from documents.tests.utils import paperless_environment from documents.tests.utils import paperless_environment
class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestExportImport(
DirectoriesMixin,
FileSystemAssertsMixin,
SampleDirMixin,
TestCase,
):
def setUp(self) -> None: def setUp(self) -> None:
self.target = Path(tempfile.mkdtemp()) self.target = Path(tempfile.mkdtemp())
self.addCleanup(shutil.rmtree, self.target) self.addCleanup(shutil.rmtree, self.target)
@ -139,6 +145,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(PASSPHRASE="test") @override_settings(PASSPHRASE="test")
def _do_export( def _do_export(
self, self,
*,
use_filename_format=False, use_filename_format=False,
compare_checksums=False, compare_checksums=False,
delete=False, delete=False,
@ -146,6 +153,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
no_thumbnail=False, no_thumbnail=False,
split_manifest=False, split_manifest=False,
use_folder_prefix=False, use_folder_prefix=False,
data_only=False,
): ):
args = ["document_exporter", self.target] args = ["document_exporter", self.target]
if use_filename_format: if use_filename_format:
@ -162,6 +170,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
args += ["--split-manifest"] args += ["--split-manifest"]
if use_folder_prefix: if use_folder_prefix:
args += ["--use-folder-prefix"] args += ["--use-folder-prefix"]
if data_only:
args += ["--data-only"]
call_command(*args) call_command(*args)
@ -794,3 +804,25 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=True) manifest = self._do_export(use_filename_format=True)
for obj in manifest: for obj in manifest:
self.assertNotEqual(obj["model"], "auditlog.logentry") self.assertNotEqual(obj["model"], "auditlog.logentry")
def test_export_data_only(self):
    """
    GIVEN:
        - Sample documents copied into the media directory
        - A data only export is requested
    WHEN:
        - The export command is called
    THEN:
        - No document files are exported
        - Only the manifest and version files are exported
    """
    documents_dir = self.dirs.media_dir / "documents"

    # Replace the media documents folder with the sample documents
    shutil.rmtree(documents_dir)
    shutil.copytree(self.SAMPLE_DIR / "documents", documents_dir)

    self._do_export(data_only=True)

    # Manifest and version files only should be present in the exported directory
    self.assertFileCountInDir(self.target, 2)

View File

@ -200,6 +200,16 @@ class FileSystemAssertsMixin:
self.assertEqual(hash1, hash2, "File SHA256 mismatch") self.assertEqual(hash1, hash2, "File SHA256 mismatch")
def assertFileCountInDir(self, path: Union[PathLike, str], count: int) -> None:
    """
    Assert that the directory at ``path`` directly contains exactly ``count``
    regular files.

    Only the immediate children of the directory are inspected; entries which
    are not files (such as subdirectories) are not counted and are not
    descended into.

    Fails the test if ``path`` is not a directory or the number of files
    differs from ``count``.
    """
    path = Path(path).resolve()
    self.assertTrue(path.is_dir(), f"Path {path} is not a directory")
    # Count lazily rather than building a throwaway list
    file_count = sum(1 for entry in path.iterdir() if entry.is_file())
    self.assertEqual(
        file_count,
        count,
        f"Path {path} contains {file_count} file(s) instead of {count}",
    )
class ConsumerProgressMixin: class ConsumerProgressMixin:
""" """