Adds little classes to reduce duplication of code

This commit is contained in:
Trenton H 2023-11-01 15:22:52 -07:00
parent f65f2613e9
commit d231dc199d
9 changed files with 93 additions and 85 deletions

View File

@ -7,13 +7,15 @@ from django import db
from django.conf import settings from django.conf import settings
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
from documents.tasks import update_document_archive_file from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver") logger = logging.getLogger("paperless.management.archiver")
class Command(BaseCommand): class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = ( help = (
"Using the current classification model, assigns correspondents, tags " "Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to " "and document types to all documents, effectively allowing you to "
@ -43,20 +45,13 @@ class Command(BaseCommand):
"run on this specific document." "run on this specific document."
), ),
) )
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar", self.add_argument_processes_mixin(parser)
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"--processes",
default=max(1, os.cpu_count() // 4),
type=int,
help="Number of processes to distribute work amongst",
)
def handle(self, *args, **options): def handle(self, *args, **options):
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
overwrite = options["overwrite"] overwrite = options["overwrite"]
@ -74,18 +69,18 @@ class Command(BaseCommand):
) )
# Note to future self: this prevents django from reusing database # Note to future self: this prevents django from reusing database
# conncetions between processes, which is bad and does not work # connections between processes, which is bad and does not work
# with postgres. # with postgres.
db.connections.close_all() db.connections.close_all()
try: try:
logging.getLogger().handlers[0].level = logging.ERROR logging.getLogger().handlers[0].level = logging.ERROR
with multiprocessing.Pool(processes=options["processes"]) as pool: with multiprocessing.Pool(self.process_count) as pool:
list( list(
tqdm.tqdm( tqdm.tqdm(
pool.imap_unordered(update_document_archive_file, document_ids), pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids), total=len(document_ids),
disable=options["no_progress_bar"], disable=self.no_progress_bar,
), ),
) )
except KeyboardInterrupt: except KeyboardInterrupt:

View File

@ -9,8 +9,5 @@ class Command(BaseCommand):
"file. The document consumer will then automatically use this new model." "file. The document consumer will then automatically use this new model."
) )
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
def handle(self, *args, **options): def handle(self, *args, **options):
train_classifier() train_classifier()

View File

@ -7,6 +7,8 @@ import tqdm
from django.core.management import BaseCommand from django.core.management import BaseCommand
from django.core.management import CommandError from django.core.management import CommandError
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
@ -41,7 +43,7 @@ def _process_and_match(work: _WorkPackage) -> _WorkResult:
return _WorkResult(work.first_doc.pk, work.second_doc.pk, match) return _WorkResult(work.first_doc.pk, work.second_doc.pk, match)
class Command(BaseCommand): class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "Searches for documents where the content almost matches" help = "Searches for documents where the content almost matches"
def add_arguments(self, parser): def add_arguments(self, parser):
@ -51,23 +53,16 @@ class Command(BaseCommand):
type=float, type=float,
help="Ratio to consider documents a match", help="Ratio to consider documents a match",
) )
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--processes", self.add_argument_processes_mixin(parser)
default=4,
type=int,
help="Number of processes to distribute work amongst",
)
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options): def handle(self, *args, **options):
RATIO_MIN: Final[float] = 0.0 RATIO_MIN: Final[float] = 0.0
RATIO_MAX: Final[float] = 100.0 RATIO_MAX: Final[float] = 100.0
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
opt_ratio = options["ratio"] opt_ratio = options["ratio"]
checked_pairs: set[tuple[int, int]] = set() checked_pairs: set[tuple[int, int]] = set()
work_pkgs: list[_WorkPackage] = [] work_pkgs: list[_WorkPackage] = []
@ -76,9 +71,6 @@ class Command(BaseCommand):
if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX: if opt_ratio < RATIO_MIN or opt_ratio > RATIO_MAX:
raise CommandError("The ratio must be between 0 and 100") raise CommandError("The ratio must be between 0 and 100")
if options["processes"] < 1:
raise CommandError("There must be at least 1 process")
all_docs = Document.objects.all().order_by("id") all_docs = Document.objects.all().order_by("id")
# Build work packages for processing # Build work packages for processing
@ -103,7 +95,7 @@ class Command(BaseCommand):
# Don't spin up a pool of 1 process # Don't spin up a pool of 1 process
if options["processes"] == 1: if options["processes"] == 1:
results = [] results = []
for work in tqdm.tqdm(work_pkgs, disable=options["no_progress_bar"]): for work in tqdm.tqdm(work_pkgs, disable=self.no_progress_bar):
results.append(_process_and_match(work)) results.append(_process_and_match(work))
else: else:
with multiprocessing.Pool(processes=options["processes"]) as pool: with multiprocessing.Pool(processes=options["processes"]) as pool:
@ -111,7 +103,7 @@ class Command(BaseCommand):
tqdm.tqdm( tqdm.tqdm(
pool.imap_unordered(_process_and_match, work_pkgs), pool.imap_unordered(_process_and_match, work_pkgs),
total=len(work_pkgs), total=len(work_pkgs),
disable=options["no_progress_bar"], disable=self.no_progress_bar,
), ),
) )

View File

@ -1,25 +1,22 @@
from django.core.management import BaseCommand from django.core.management import BaseCommand
from django.db import transaction from django.db import transaction
from documents.management.commands.mixins import ProgressBarMixin
from documents.tasks import index_optimize from documents.tasks import index_optimize
from documents.tasks import index_reindex from documents.tasks import index_reindex
class Command(BaseCommand): class Command(ProgressBarMixin, BaseCommand):
help = "Manages the document index." help = "Manages the document index."
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument("command", choices=["reindex", "optimize"]) parser.add_argument("command", choices=["reindex", "optimize"])
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options): def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
with transaction.atomic(): with transaction.atomic():
if options["command"] == "reindex": if options["command"] == "reindex":
index_reindex(progress_bar_disable=options["no_progress_bar"]) index_reindex(progress_bar_disable=self.no_progress_bar)
elif options["command"] == "optimize": elif options["command"] == "optimize":
index_optimize() index_optimize()

View File

@ -4,25 +4,22 @@ import tqdm
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db.models.signals import post_save from django.db.models.signals import post_save
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
class Command(BaseCommand): class Command(ProgressBarMixin, BaseCommand):
help = "This will rename all documents to match the latest filename format." help = "This will rename all documents to match the latest filename format."
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options): def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
logging.getLogger().handlers[0].level = logging.ERROR logging.getLogger().handlers[0].level = logging.ERROR
for document in tqdm.tqdm( for document in tqdm.tqdm(
Document.objects.all(), Document.objects.all(),
disable=options["no_progress_bar"], disable=self.no_progress_bar,
): ):
post_save.send(Document, instance=document) post_save.send(Document, instance=document)

View File

@ -4,6 +4,7 @@ import tqdm
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.classifier import load_classifier from documents.classifier import load_classifier
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
from documents.signals.handlers import set_correspondent from documents.signals.handlers import set_correspondent
from documents.signals.handlers import set_document_type from documents.signals.handlers import set_document_type
@ -13,7 +14,7 @@ from documents.signals.handlers import set_tags
logger = logging.getLogger("paperless.management.retagger") logger = logging.getLogger("paperless.management.retagger")
class Command(BaseCommand): class Command(ProgressBarMixin, BaseCommand):
help = ( help = (
"Using the current classification model, assigns correspondents, tags " "Using the current classification model, assigns correspondents, tags "
"and document types to all documents, effectively allowing you to " "and document types to all documents, effectively allowing you to "
@ -48,12 +49,7 @@ class Command(BaseCommand):
"and tags that do not match anymore due to changed rules." "and tags that do not match anymore due to changed rules."
), ),
) )
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument( parser.add_argument(
"--suggest", "--suggest",
default=False, default=False,
@ -72,6 +68,7 @@ class Command(BaseCommand):
) )
def handle(self, *args, **options): def handle(self, *args, **options):
self.handle_progress_bar_mixin(**options)
# Detect if we support color # Detect if we support color
color = self.style.ERROR("test") != "test" color = self.style.ERROR("test") != "test"
@ -89,7 +86,7 @@ class Command(BaseCommand):
classifier = load_classifier() classifier = load_classifier()
for document in tqdm.tqdm(documents, disable=options["no_progress_bar"]): for document in tqdm.tqdm(documents, disable=self.no_progress_bar):
if options["correspondent"]: if options["correspondent"]:
set_correspondent( set_correspondent(
sender=None, sender=None,

View File

@ -1,20 +1,17 @@
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.management.commands.mixins import ProgressBarMixin
from documents.sanity_checker import check_sanity from documents.sanity_checker import check_sanity
class Command(BaseCommand): class Command(ProgressBarMixin, BaseCommand):
help = "This command checks your document archive for issues." help = "This command checks your document archive for issues."
def add_arguments(self, parser): def add_arguments(self, parser):
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
def handle(self, *args, **options): def handle(self, *args, **options):
messages = check_sanity(progress=not options["no_progress_bar"]) self.handle_progress_bar_mixin(**options)
messages = check_sanity(progress=self.use_progress_bar)
messages.log_messages() messages.log_messages()

View File

@ -1,12 +1,13 @@
import logging import logging
import multiprocessing import multiprocessing
import os
import shutil import shutil
import tqdm import tqdm
from django import db from django import db
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from documents.management.commands.mixins import MultiProcessMixin
from documents.management.commands.mixins import ProgressBarMixin
from documents.models import Document from documents.models import Document
from documents.parsers import get_parser_class_for_mime_type from documents.parsers import get_parser_class_for_mime_type
@ -33,7 +34,7 @@ def _process_document(doc_id):
parser.cleanup() parser.cleanup()
class Command(BaseCommand): class Command(MultiProcessMixin, ProgressBarMixin, BaseCommand):
help = "This will regenerate the thumbnails for all documents." help = "This will regenerate the thumbnails for all documents."
def add_arguments(self, parser): def add_arguments(self, parser):
@ -48,22 +49,15 @@ class Command(BaseCommand):
"run on this specific document." "run on this specific document."
), ),
) )
parser.add_argument( self.add_argument_progress_bar_mixin(parser)
"--no-progress-bar", self.add_argument_processes_mixin(parser)
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"--processes",
default=max(1, os.cpu_count() // 4),
type=int,
help="Number of processes to distribute work amongst",
)
def handle(self, *args, **options): def handle(self, *args, **options):
logging.getLogger().handlers[0].level = logging.ERROR logging.getLogger().handlers[0].level = logging.ERROR
self.handle_processes_mixin(**options)
self.handle_progress_bar_mixin(**options)
if options["document"]: if options["document"]:
documents = Document.objects.filter(pk=options["document"]) documents = Document.objects.filter(pk=options["document"])
else: else:
@ -76,11 +70,11 @@ class Command(BaseCommand):
# with postgres. # with postgres.
db.connections.close_all() db.connections.close_all()
with multiprocessing.Pool(processes=options["processes"]) as pool: with multiprocessing.Pool(processes=self.process_count) as pool:
list( list(
tqdm.tqdm( tqdm.tqdm(
pool.imap_unordered(_process_document, ids), pool.imap_unordered(_process_document, ids),
total=len(ids), total=len(ids),
disable=options["no_progress_bar"], disable=self.no_progress_bar,
), ),
) )

View File

@ -0,0 +1,42 @@
import os
from django.core.management import CommandError
class MultiProcessMixin:
    """
    Small class to handle adding an argument and validating it
    for the use of multiple processes.

    Intended to be mixed into a management command: call
    ``add_argument_processes_mixin`` while building the argument parser and
    ``handle_processes_mixin`` at the start of ``handle``; the validated
    value is then available as ``self.process_count``.
    """

    def add_argument_processes_mixin(self, parser):
        """
        Register the ``--processes`` integer option on ``parser``.

        The default is a quarter of the detected CPU count, but never
        less than 1.
        """
        # os.cpu_count() returns None when the count cannot be determined;
        # fall back to a single CPU instead of failing on ``None // 4``.
        detected_cpus = os.cpu_count() or 1
        parser.add_argument(
            "--processes",
            default=max(1, detected_cpus // 4),
            type=int,
            help="Number of processes to distribute work amongst",
        )

    def handle_processes_mixin(self, *args, **options):
        """
        Store ``options["processes"]`` as ``self.process_count``.

        Raises:
            CommandError: if the requested process count is below 1.
        """
        self.process_count = options["processes"]
        if self.process_count < 1:
            raise CommandError("There must be at least 1 process")
class ProgressBarMixin:
    """
    Shared handling of the ``--no-progress-bar`` switch for commands
    that display a progress bar.

    After ``handle_progress_bar_mixin`` has run, ``self.no_progress_bar``
    holds the parsed flag and ``self.use_progress_bar`` holds its negation.
    """

    def add_argument_progress_bar_mixin(self, parser):
        """Register the ``--no-progress-bar`` boolean flag on ``parser``."""
        flag_settings = {
            "default": False,
            "action": "store_true",
            "help": "If set, the progress bar will not be shown",
        }
        parser.add_argument("--no-progress-bar", **flag_settings)

    def handle_progress_bar_mixin(self, *args, **options):
        """Copy the parsed flag (and its negation) onto the instance."""
        disabled = options["no_progress_bar"]
        self.no_progress_bar = disabled
        self.use_progress_bar = not disabled