diff --git a/docs/administration.md b/docs/administration.md index f34156898..d53b68f56 100644 --- a/docs/administration.md +++ b/docs/administration.md @@ -185,6 +185,13 @@ For PostgreSQL, refer to [Upgrading a PostgreSQL Cluster](https://www.postgresql For MariaDB, refer to [Upgrading MariaDB](https://mariadb.com/kb/en/upgrading/) +You may also use the exporter and importer with the `--data-only` flag. + +!!! warning + + You should not change any settings, especially paths, when doing this or there is a + risk of data loss. + ## Downgrading Paperless {#downgrade-paperless} Downgrades are possible. However, some updates also contain database @@ -269,6 +276,7 @@ optional arguments: -sm, --split-manifest -z, --zip -zn, --zip-name +--data-only ``` `target` is a folder to which the data gets written. This includes @@ -327,6 +335,9 @@ If `-z` or `--zip` is provided, the export will be a zip file in the target directory, named according to the current local date or the value set in `-zn` or `--zip-name`. +If `--data-only` is provided, only the database will be exported. This option is intended +to facilitate database upgrades without needing to clean documents and thumbnails. + !!! warning If exporting with the file name format, there may be errors due to @@ -345,6 +356,11 @@ and the script does the rest of the work: document_importer source ``` +| Option | Required | Default | Description | +| ----------- | -------- | ------- | ------------------------------------------------------------------------- | +| source | Yes | N/A | The directory containing an export | +| --data-only | No | False | If provided, only import data, do not import document files or thumbnails | + When you use the provided docker compose script, put the export inside the `export` folder in your paperless source directory. Specify `../export` as the `source`. 
diff --git a/src/documents/management/commands/document_exporter.py b/src/documents/management/commands/document_exporter.py index 081dfb360..b99d4b450 100644 --- a/src/documents/management/commands/document_exporter.py +++ b/src/documents/management/commands/document_exporter.py @@ -5,7 +5,6 @@ import shutil import tempfile import time from pathlib import Path -from typing import Optional import tqdm from django.conf import settings @@ -147,6 +146,13 @@ class Command(BaseCommand): help="Sets the export zip file name", ) + parser.add_argument( + "--data-only", + default=False, + action="store_true", + help="If set, only the database will be exported, not files", + ) + parser.add_argument( "--no-progress-bar", default=False, @@ -166,6 +172,7 @@ class Command(BaseCommand): self.delete = False self.no_archive = False self.no_thumbnail = False + self.data_only = False def handle(self, *args, **options): self.target = Path(options["target"]).resolve() @@ -177,14 +184,14 @@ class Command(BaseCommand): self.no_archive: bool = options["no_archive"] self.no_thumbnail: bool = options["no_thumbnail"] self.zip_export: bool = options["zip"] + self.data_only: bool = options["data_only"] + self.no_progress_bar: bool = options["no_progress_bar"] # If zipping, save the original target for later and # get a temporary directory for the target instead temp_dir = None - self.original_target: Optional[Path] = None + self.original_target = self.target if self.zip_export: - self.original_target = self.target - settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True) temp_dir = tempfile.TemporaryDirectory( dir=settings.SCRATCH_DIR, @@ -203,7 +210,7 @@ class Command(BaseCommand): try: with FileLock(settings.MEDIA_LOCK): - self.dump(options["no_progress_bar"]) + self.dump() # We've written everything to the temporary directory in this case, # now make an archive in the original target, with all files stored @@ -222,7 +229,7 @@ class Command(BaseCommand): if self.zip_export and temp_dir is not 
None: temp_dir.cleanup() - def dump(self, progress_bar_disable=False): + def dump(self): # 1. Take a snapshot of what files exist in the current export folder for x in self.target.glob("**/*"): if x.is_file(): @@ -334,11 +341,15 @@ class Command(BaseCommand): manifest += notes manifest += custom_field_instances + if self.data_only: + self.stdout.write(self.style.NOTICE("Data only export completed")) + return + # 3. Export files from each document for index, document_dict in tqdm.tqdm( enumerate(document_manifest), total=len(document_manifest), - disable=progress_bar_disable, + disable=self.no_progress_bar, ): # 3.1. store files unencrypted document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED diff --git a/src/documents/management/commands/document_importer.py b/src/documents/management/commands/document_importer.py index 5cf036b0f..e0c1abc70 100644 --- a/src/documents/management/commands/document_importer.py +++ b/src/documents/management/commands/document_importer.py @@ -57,6 +57,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument("source") + parser.add_argument( "--no-progress-bar", default=False, @@ -64,6 +65,13 @@ class Command(BaseCommand): help="If set, the progress bar will not be shown", ) + parser.add_argument( + "--data-only", + default=False, + action="store_true", + help="If set, only the database will be exported, not files", + ) + def __init__(self, *args, **kwargs): BaseCommand.__init__(self, *args, **kwargs) self.source = None @@ -82,17 +90,21 @@ class Command(BaseCommand): if not os.access(self.source, os.R_OK): raise CommandError("That path doesn't appear to be readable") - for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]: - if document_dir.exists() and document_dir.is_dir(): - for entry in document_dir.glob("**/*"): - if entry.is_dir(): - continue - self.stdout.write( - self.style.WARNING( - f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty 
installation", ), - ) - break + # Skip this check if operating only on the database + # We can expect data to exist + if not self.data_only: + + for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]: + if document_dir.exists() and document_dir.is_dir(): + for entry in document_dir.glob("**/*"): + if entry.is_dir(): + continue + self.stdout.write( + self.style.WARNING( + f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation", + ), + ) + break if ( User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count() != 0 @@ -113,6 +125,8 @@ class Command(BaseCommand): logging.getLogger().handlers[0].level = logging.ERROR self.source = Path(options["source"]).resolve() + self.data_only: bool = options["data_only"] + self.no_progress_bar: bool = options["no_progress_bar"] self.pre_check() @@ -200,7 +214,12 @@ class Command(BaseCommand): ) raise e - self._import_files_from_manifest(options["no_progress_bar"]) + if not self.data_only: + self._import_files_from_manifest(options["no_progress_bar"]) + + else: + + self.stdout.write(self.style.NOTICE("Data only import completed")) self.stdout.write("Updating search index...") call_command( diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index b95d07dec..0a191c0f7 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -37,10 +37,16 @@ from documents.sanity_checker import check_sanity from documents.settings import EXPORTER_FILE_NAME from documents.tests.utils import DirectoriesMixin from documents.tests.utils import FileSystemAssertsMixin +from documents.tests.utils import SampleDirMixin from documents.tests.utils import paperless_environment -class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): +class TestExportImport( + DirectoriesMixin, + FileSystemAssertsMixin, + SampleDirMixin, + TestCase, +): def setUp(self) -> None: 
self.target = Path(tempfile.mkdtemp()) self.addCleanup(shutil.rmtree, self.target) @@ -139,6 +145,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @override_settings(PASSPHRASE="test") def _do_export( self, + *, use_filename_format=False, compare_checksums=False, delete=False, @@ -146,6 +153,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): no_thumbnail=False, split_manifest=False, use_folder_prefix=False, + data_only=False, ): args = ["document_exporter", self.target] if use_filename_format: @@ -162,6 +170,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): args += ["--split-manifest"] if use_folder_prefix: args += ["--use-folder-prefix"] + if data_only: + args += ["--data-only"] call_command(*args) @@ -794,3 +804,25 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase): manifest = self._do_export(use_filename_format=True) for obj in manifest: self.assertNotEqual(obj["model"], "auditlog.logentry") + + def test_export_data_only(self): + """ + GIVEN: + - Request to export documents with data only + WHEN: + - Export command is called + THEN: + - No document files are exported + - Manifest and version are exported + """ + + shutil.rmtree(self.dirs.media_dir / "documents") + shutil.copytree( + self.SAMPLE_DIR / "documents", + self.dirs.media_dir / "documents", + ) + + _ = self._do_export(data_only=True) + + # Manifest and version files only should be present in the exported directory + self.assertFileCountInDir(self.target, 2) diff --git a/src/documents/tests/utils.py b/src/documents/tests/utils.py index fb4fa9f07..2243fa557 100644 --- a/src/documents/tests/utils.py +++ b/src/documents/tests/utils.py @@ -200,6 +200,16 @@ class FileSystemAssertsMixin: self.assertEqual(hash1, hash2, "File SHA256 mismatch") + def assertFileCountInDir(self, path: Union[PathLike, str], count: int): + path = Path(path).resolve() + self.assertTrue(path.is_dir(), f"Path {path} is 
not a directory") + file_count = len([x for x in path.iterdir() if x.is_file()]) + self.assertEqual( + file_count, + count, + f"Path {path} contains {file_count} files instead of {count} file", + ) + class ConsumerProgressMixin: """