Saves work on an export/import process which is data only

This commit is contained in:
Trenton H 2024-05-17 10:04:12 -07:00
parent 04f52f553a
commit c3d7793e3c
5 changed files with 108 additions and 20 deletions

View File

@ -185,6 +185,13 @@ For PostgreSQL, refer to [Upgrading a PostgreSQL Cluster](https://www.postgresql
For MariaDB, refer to [Upgrading MariaDB](https://mariadb.com/kb/en/upgrading/)
You may also use the exporter and importer with the `--data-only` flag.
!!! warning
Do not change any settings, especially paths, when doing this, or you
risk data loss.
## Downgrading Paperless {#downgrade-paperless}
Downgrades are possible. However, some updates also contain database
@ -269,6 +276,7 @@ optional arguments:
-sm, --split-manifest
-z, --zip
-zn, --zip-name
--data-only
```
`target` is a folder to which the data gets written. This includes
@ -327,6 +335,9 @@ If `-z` or `--zip` is provided, the export will be a zip file
in the target directory, named according to the current local date or the
value set in `-zn` or `--zip-name`.
If `--data-only` is provided, only the database will be exported. This option is intended
to facilitate database upgrades without needing to clean documents and thumbnails.
!!! warning
If exporting with the file name format, there may be errors due to
@ -345,6 +356,11 @@ and the script does the rest of the work:
document_importer source
```
| Option | Required | Default | Description |
| ----------- | -------- | ------- | ------------------------------------------------------------------------- |
| source | Yes | N/A | The directory containing an export |
| --data-only | No | False | If provided, only import data, do not import document files or thumbnails |
When you use the provided docker compose script, put the export inside
the `export` folder in your paperless source directory. Specify
`../export` as the `source`.

View File

@ -5,7 +5,6 @@ import shutil
import tempfile
import time
from pathlib import Path
from typing import Optional
import tqdm
from django.conf import settings
@ -147,6 +146,13 @@ class Command(BaseCommand):
help="Sets the export zip file name",
)
parser.add_argument(
"--data-only",
default=False,
action="store_true",
help="If set, only the database will be exported, not files",
)
parser.add_argument(
"--no-progress-bar",
default=False,
@ -166,6 +172,7 @@ class Command(BaseCommand):
self.delete = False
self.no_archive = False
self.no_thumbnail = False
self.data_only = False
def handle(self, *args, **options):
self.target = Path(options["target"]).resolve()
@ -177,14 +184,14 @@ class Command(BaseCommand):
self.no_archive: bool = options["no_archive"]
self.no_thumbnail: bool = options["no_thumbnail"]
self.zip_export: bool = options["zip"]
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
# If zipping, save the original target for later and
# get a temporary directory for the target instead
temp_dir = None
self.original_target: Optional[Path] = None
self.original_target = self.target
if self.zip_export:
self.original_target = self.target
settings.SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
temp_dir = tempfile.TemporaryDirectory(
dir=settings.SCRATCH_DIR,
@ -203,7 +210,7 @@ class Command(BaseCommand):
try:
with FileLock(settings.MEDIA_LOCK):
self.dump(options["no_progress_bar"])
self.dump()
# We've written everything to the temporary directory in this case,
# now make an archive in the original target, with all files stored
@ -222,7 +229,7 @@ class Command(BaseCommand):
if self.zip_export and temp_dir is not None:
temp_dir.cleanup()
def dump(self, progress_bar_disable=False):
def dump(self):
# 1. Take a snapshot of what files exist in the current export folder
for x in self.target.glob("**/*"):
if x.is_file():
@ -334,11 +341,15 @@ class Command(BaseCommand):
manifest += notes
manifest += custom_field_instances
if self.data_only:
self.stdout.write(self.style.NOTICE("Data only export completed"))
return
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
total=len(document_manifest),
disable=progress_bar_disable,
disable=self.no_progress_bar,
):
# 3.1. store files unencrypted
document_dict["fields"]["storage_type"] = Document.STORAGE_TYPE_UNENCRYPTED

View File

@ -57,6 +57,7 @@ class Command(BaseCommand):
def add_arguments(self, parser):
parser.add_argument("source")
parser.add_argument(
"--no-progress-bar",
default=False,
@ -64,6 +65,13 @@ class Command(BaseCommand):
help="If set, the progress bar will not be shown",
)
# Importer flag: restore only the database contents from the export and
# leave document files and thumbnails untouched.
parser.add_argument(
    "--data-only",
    default=False,
    action="store_true",
    help="If set, only the database will be imported, not files",
)
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
self.source = None
@ -82,17 +90,21 @@ class Command(BaseCommand):
if not os.access(self.source, os.R_OK):
raise CommandError("That path doesn't appear to be readable")
for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
if document_dir.exists() and document_dir.is_dir():
for entry in document_dir.glob("**/*"):
if entry.is_dir():
continue
self.stdout.write(
self.style.WARNING(
f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
),
)
break
# Skip this check if operating only on the database
# We can expect data to exist in that case
if not self.data_only:
for document_dir in [settings.ORIGINALS_DIR, settings.ARCHIVE_DIR]:
if document_dir.exists() and document_dir.is_dir():
for entry in document_dir.glob("**/*"):
if entry.is_dir():
continue
self.stdout.write(
self.style.WARNING(
f"Found file {entry.relative_to(document_dir)}, this might indicate a non-empty installation",
),
)
break
if (
User.objects.exclude(username__in=["consumer", "AnonymousUser"]).count()
!= 0
@ -113,6 +125,8 @@ class Command(BaseCommand):
logging.getLogger().handlers[0].level = logging.ERROR
self.source = Path(options["source"]).resolve()
self.data_only: bool = options["data_only"]
self.no_progress_bar: bool = options["no_progress_bar"]
self.pre_check()
@ -200,7 +214,12 @@ class Command(BaseCommand):
)
raise e
self._import_files_from_manifest(options["no_progress_bar"])
if not self.data_only:
self._import_files_from_manifest(options["no_progress_bar"])
else:
self.stdout.write(self.style.NOTICE("Data only import completed"))
self.stdout.write("Updating search index...")
call_command(

View File

@ -37,10 +37,16 @@ from documents.sanity_checker import check_sanity
from documents.settings import EXPORTER_FILE_NAME
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from documents.tests.utils import SampleDirMixin
from documents.tests.utils import paperless_environment
class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
class TestExportImport(
DirectoriesMixin,
FileSystemAssertsMixin,
SampleDirMixin,
TestCase,
):
def setUp(self) -> None:
self.target = Path(tempfile.mkdtemp())
self.addCleanup(shutil.rmtree, self.target)
@ -139,6 +145,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(PASSPHRASE="test")
def _do_export(
self,
*,
use_filename_format=False,
compare_checksums=False,
delete=False,
@ -146,6 +153,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
no_thumbnail=False,
split_manifest=False,
use_folder_prefix=False,
data_only=False,
):
args = ["document_exporter", self.target]
if use_filename_format:
@ -162,6 +170,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
args += ["--split-manifest"]
if use_folder_prefix:
args += ["--use-folder-prefix"]
if data_only:
args += ["--data-only"]
call_command(*args)
@ -794,3 +804,25 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=True)
for obj in manifest:
self.assertNotEqual(obj["model"], "auditlog.logentry")
def test_export_data_only(self):
    """
    GIVEN:
        - Sample documents copied into the media directory
        - Request to export with the data only flag
    WHEN:
        - Export command is called
    THEN:
        - No document files are exported
        - Only the manifest and version files are written
    """
    media_documents = self.dirs.media_dir / "documents"

    # Replace the media documents folder with the sample documents
    shutil.rmtree(media_documents)
    shutil.copytree(self.SAMPLE_DIR / "documents", media_documents)

    self._do_export(data_only=True)

    # Exactly two files are expected in the export target: the manifest
    # and the version file
    self.assertFileCountInDir(self.target, 2)

View File

@ -200,6 +200,16 @@ class FileSystemAssertsMixin:
self.assertEqual(hash1, hash2, "File SHA256 mismatch")
def assertFileCountInDir(self, path: Union[PathLike, str], count: int) -> None:
    """
    Asserts that the given directory directly contains exactly `count` files.

    Only regular files which are immediate children of the directory are
    counted; subdirectories and anything inside them are ignored.

    Fails if the path is not a directory or if the file count differs.
    """
    path = Path(path).resolve()
    self.assertTrue(path.is_dir(), f"Path {path} is not a directory")
    # Count lazily instead of building a throwaway list just for its length
    file_count = sum(1 for entry in path.iterdir() if entry.is_file())
    self.assertEqual(
        file_count,
        count,
        f"Path {path} contains {file_count} files instead of {count} files",
    )
class ConsumerProgressMixin:
"""