Add to handler, matching, retagger
This commit is contained in:
		
							parent
							
								
									a632b6b711
								
							
						
					
					
						commit
						6dc6c6c7bb
					
				| @ -372,17 +372,19 @@ currently-imported docs. This problem is common enough that there are | ||||
| tools for it. | ||||
| 
 | ||||
| ``` | ||||
| document_retagger [-h] [-c] [-T] [-t] [-i] [--id-range] [--use-first] [-f] | ||||
| document_retagger [-h] [-c] [-T] [-t] [-s] [-cf] [-i] [--id-range] [--use-first] [-f] [--suggest] | ||||
| 
 | ||||
| optional arguments: | ||||
| -c, --correspondent | ||||
| -T, --tags | ||||
| -t, --document_type | ||||
| -s, --storage_path | ||||
| -cf, --custom_fields | ||||
| -i, --inbox-only | ||||
| --id-range | ||||
| --use-first | ||||
| -f, --overwrite | ||||
| --suggest | ||||
| ``` | ||||
| 
 | ||||
| Run this after changing or adding matching rules. It'll loop over all | ||||
| @ -408,6 +410,8 @@ to override this behavior and just use the first correspondent or type | ||||
| it finds. This option does not apply to tags, since any amount of tags | ||||
| can be applied to a document. | ||||
| 
 | ||||
| If you want to suggest changes but not apply them, specify `--suggest`. | ||||
| 
 | ||||
| Finally, `-f` specifies that you wish to overwrite already assigned | ||||
| correspondents, types and/or tags. The default behavior is to not assign | ||||
| correspondents and types to documents that have this data already | ||||
|  | ||||
| @ -15,6 +15,7 @@ class DocumentsConfig(AppConfig): | ||||
|         from documents.signals.handlers import run_workflows_added | ||||
|         from documents.signals.handlers import run_workflows_updated | ||||
|         from documents.signals.handlers import set_correspondent | ||||
|         from documents.signals.handlers import set_custom_fields | ||||
|         from documents.signals.handlers import set_document_type | ||||
|         from documents.signals.handlers import set_storage_path | ||||
|         from documents.signals.handlers import set_tags | ||||
| @ -24,6 +25,7 @@ class DocumentsConfig(AppConfig): | ||||
|         document_consumption_finished.connect(set_document_type) | ||||
|         document_consumption_finished.connect(set_tags) | ||||
|         document_consumption_finished.connect(set_storage_path) | ||||
|         document_consumption_finished.connect(set_custom_fields) | ||||
|         document_consumption_finished.connect(add_to_index) | ||||
|         document_consumption_finished.connect(run_workflows_added) | ||||
|         document_updated.connect(run_workflows_updated) | ||||
|  | ||||
| @ -7,6 +7,7 @@ from documents.classifier import load_classifier | ||||
| from documents.management.commands.mixins import ProgressBarMixin | ||||
| from documents.models import Document | ||||
| from documents.signals.handlers import set_correspondent | ||||
| from documents.signals.handlers import set_custom_fields | ||||
| from documents.signals.handlers import set_document_type | ||||
| from documents.signals.handlers import set_storage_path | ||||
| from documents.signals.handlers import set_tags | ||||
| @ -17,9 +18,9 @@ logger = logging.getLogger("paperless.management.retagger") | ||||
| class Command(ProgressBarMixin, BaseCommand): | ||||
|     help = ( | ||||
|         "Using the current classification model, assigns correspondents, tags " | ||||
|         "and document types to all documents, effectively allowing you to " | ||||
|         "back-tag all previously indexed documents with metadata created (or " | ||||
|         "modified) after their initial import." | ||||
|         "document types, storage paths and custom fields to all documents, effectively" | ||||
|         "allowing you to back-tag all previously indexed documents with metadata created " | ||||
|         "(or modified) after their initial import." | ||||
|     ) | ||||
| 
 | ||||
|     def add_arguments(self, parser): | ||||
| @ -27,6 +28,12 @@ class Command(ProgressBarMixin, BaseCommand): | ||||
|         parser.add_argument("-T", "--tags", default=False, action="store_true") | ||||
|         parser.add_argument("-t", "--document_type", default=False, action="store_true") | ||||
|         parser.add_argument("-s", "--storage_path", default=False, action="store_true") | ||||
|         parser.add_argument( | ||||
|             "-cf", | ||||
|             "--custom_fields", | ||||
|             default=False, | ||||
|             action="store_true", | ||||
|         ) | ||||
|         parser.add_argument("-i", "--inbox-only", default=False, action="store_true") | ||||
|         parser.add_argument( | ||||
|             "--use-first", | ||||
| @ -134,3 +141,16 @@ class Command(ProgressBarMixin, BaseCommand): | ||||
|                     stdout=self.stdout, | ||||
|                     style_func=self.style, | ||||
|                 ) | ||||
| 
 | ||||
|             if options["custom_fields"]: | ||||
|                 set_custom_fields( | ||||
|                     sender=None, | ||||
|                     document=document, | ||||
|                     classifier=classifier, | ||||
|                     replace=options["overwrite"], | ||||
|                     use_first=options["use_first"], | ||||
|                     suggest=options["suggest"], | ||||
|                     base_url=options["base_url"], | ||||
|                     stdout=self.stdout, | ||||
|                     style_func=self.style, | ||||
|                 ) | ||||
|  | ||||
| @ -132,6 +132,25 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def match_custom_fields(document: Document, classifier: DocumentClassifier, user=None): | ||||
|     predicted_custom_field_ids = ( | ||||
|         classifier.predict_custom_fields(document.content) if classifier else [] | ||||
|     ) | ||||
| 
 | ||||
|     fields = [instance.field for instance in document.custom_fields.all()] | ||||
| 
 | ||||
|     return list( | ||||
|         filter( | ||||
|             lambda o: matches(o, document) | ||||
|             or ( | ||||
|                 o.matching_algorithm == MatchingModel.MATCH_AUTO | ||||
|                 and o.pk in predicted_custom_field_ids | ||||
|             ), | ||||
|             fields, | ||||
|         ), | ||||
|     ) | ||||
| 
 | ||||
| 
 | ||||
| def matches(matching_model: MatchingModel, document: Document): | ||||
|     search_kwargs = {} | ||||
| 
 | ||||
|  | ||||
| @ -318,6 +318,67 @@ def set_storage_path( | ||||
|             document.save(update_fields=("storage_path",)) | ||||
| 
 | ||||
| 
 | ||||
| def set_custom_fields( | ||||
|     document: Document, | ||||
|     logging_group=None, | ||||
|     classifier: DocumentClassifier | None = None, | ||||
|     replace=False, | ||||
|     suggest=False, | ||||
|     base_url=None, | ||||
|     stdout=None, | ||||
|     style_func=None, | ||||
|     **kwargs, | ||||
| ): | ||||
|     if replace: | ||||
|         CustomFieldInstance.objects.filter(document=document).exclude( | ||||
|             Q(field__match="") & ~Q(field__matching_algorithm=CustomField.MATCH_AUTO), | ||||
|         ).delete() | ||||
| 
 | ||||
|     current_fields = set([instance.field for instance in document.custom_fields.all()]) | ||||
| 
 | ||||
|     matched_fields = matching.match_custom_fields(document, classifier) | ||||
| 
 | ||||
|     relevant_fields = set(matched_fields) - current_fields | ||||
| 
 | ||||
|     if suggest: | ||||
|         extra_fields = current_fields - set(matched_fields) | ||||
|         extra_fields = [ | ||||
|             f for f in extra_fields if f.matching_algorithm == MatchingModel.MATCH_AUTO | ||||
|         ] | ||||
|         if not relevant_fields and not extra_fields: | ||||
|             return | ||||
|         doc_str = style_func.SUCCESS(str(document)) | ||||
|         if base_url: | ||||
|             stdout.write(doc_str) | ||||
|             stdout.write(f"{base_url}/documents/{document.pk}") | ||||
|         else: | ||||
|             stdout.write(doc_str + style_func.SUCCESS(f" [{document.pk}]")) | ||||
|         if relevant_fields: | ||||
|             stdout.write( | ||||
|                 "Suggest custom fields: " | ||||
|                 + ", ".join([f.name for f in relevant_fields]), | ||||
|             ) | ||||
|         if extra_fields: | ||||
|             stdout.write( | ||||
|                 "Extra custom fields: " + ", ".join([f.name for f in extra_fields]), | ||||
|             ) | ||||
|     else: | ||||
|         if not relevant_fields: | ||||
|             return | ||||
| 
 | ||||
|         message = 'Assigning custom fields "{}" to "{}"' | ||||
|         logger.info( | ||||
|             message.format(", ".join([f.name for f in relevant_fields]), document), | ||||
|             extra={"group": logging_group}, | ||||
|         ) | ||||
| 
 | ||||
|         for field in relevant_fields: | ||||
|             CustomFieldInstance.objects.create( | ||||
|                 field=field, | ||||
|                 document=document, | ||||
|             ) | ||||
| 
 | ||||
| 
 | ||||
| # see empty_trash in documents/tasks.py for signal handling | ||||
| def cleanup_document_deletion(sender, instance, **kwargs): | ||||
|     with FileLock(settings.MEDIA_LOCK): | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user