Tagging by putting barcode stickers on documents (discussion #3762)

This commit is contained in:
Pascal Krahmer 2024-01-28 18:02:02 +01:00
parent b0c305e852
commit 1dbd224b55
4 changed files with 107 additions and 1 deletions

View File

@ -1159,6 +1159,36 @@ combination with PAPERLESS_CONSUMER_BARCODE_UPSCALE bigger than 1.0.
Defaults to "300" Defaults to "300"
#### [`PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=<bool>`](#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE) {#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE}
: Enables the detection of barcodes in the scanned document and
assigns or creates tags if a properly formatted barcode is detected.
The barcode must match one of the (configurable) regular expressions.
If the barcode text contains ',' (comma), it is split into multiple
barcodes which are individually processed for tagging.
Matching is case insensitive.
Defaults to false.
#### [`PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING=<json dict>`](#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING) {#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING}
: Defines a dictionary of filter regex and substitute expressions.
A barcode is only considered for tagging if at least one regex matches
the barcode text; if several match, the first one in the mapping is used.
Before looking up or creating a tag, the substitute is applied.
This allows very versatile matching as well as reformatting and mapping of
barcode pattern to tag values.
Syntax: {"<regex>": "<substitute>" [,...]}
Defaults to {"TAG:(.*)": "\\g<1>"} which includes any barcode beginning with
TAG: followed by any number of characters. It is substituted by the text
following TAG:, which becomes the tag name.
## Audit Trail ## Audit Trail
#### [`PAPERLESS_AUDIT_LOG_ENABLED=<bool>`](#PAPERLESS_AUDIT_LOG_ENABLED) {#PAPERLESS_AUDIT_LOG_ENABLED} #### [`PAPERLESS_AUDIT_LOG_ENABLED=<bool>`](#PAPERLESS_AUDIT_LOG_ENABLED) {#PAPERLESS_AUDIT_LOG_ENABLED}

View File

@ -46,6 +46,8 @@
#PAPERLESS_OCR_OUTPUT_TYPE=pdfa #PAPERLESS_OCR_OUTPUT_TYPE=pdfa
#PAPERLESS_OCR_PAGES=1 #PAPERLESS_OCR_PAGES=1
#PAPERLESS_OCR_IMAGE_DPI=300 #PAPERLESS_OCR_IMAGE_DPI=300
#PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE=false
#PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING={"TAG:(.*)": "\\g<1>"}
#PAPERLESS_OCR_CLEAN=clean #PAPERLESS_OCR_CLEAN=clean
#PAPERLESS_OCR_DESKEW=true #PAPERLESS_OCR_DESKEW=true
#PAPERLESS_OCR_ROTATE_PAGES=true #PAPERLESS_OCR_ROTATE_PAGES=true

View File

@ -14,6 +14,7 @@ from PIL import Image
from documents.converters import convert_from_tiff_to_pdf from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions from documents.plugins.helpers import ProgressStatusOptions
@ -65,7 +66,9 @@ class BarcodePlugin(ConsumeTaskPlugin):
supported_mimes = {"application/pdf"} supported_mimes = {"application/pdf"}
return ( return (
settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES settings.CONSUMER_ENABLE_ASN_BARCODE
or settings.CONSUMER_ENABLE_BARCODES
or settings.CONSUMER_ENABLE_TAG_BARCODE
) and self.input_doc.mime_type in supported_mimes ) and self.input_doc.mime_type in supported_mimes
def setup(self): def setup(self):
@ -90,6 +93,16 @@ class BarcodePlugin(ConsumeTaskPlugin):
logger.info(f"Found ASN in barcode: {located_asn}") logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn self.metadata.asn = located_asn
# try reading tags from barcodes
if settings.CONSUMER_ENABLE_TAG_BARCODE:
tags = self.tags
if tags is not None and len(tags) > 0:
if self.metadata.tag_ids:
self.metadata.tag_ids += tags
else:
self.metadata.tag_ids = tags
logger.info(f"Found tags in barcode: {tags}")
separator_pages = self.get_separation_pages() separator_pages = self.get_separation_pages()
if not separator_pages: if not separator_pages:
return "No pages to split on!" return "No pages to split on!"
@ -279,6 +292,54 @@ class BarcodePlugin(ConsumeTaskPlugin):
return asn return asn
@property
def tags(self) -> Optional[list[int]]:
    """
    Search the parsed barcodes for any tags.

    Every barcode value is split on "," and each part is checked against
    the regexes of settings.CONSUMER_TAG_BARCODE_MAPPING (case insensitive,
    via re.match). The first matching regex wins; its substitute, when
    non-empty, is applied with re.sub to build the tag name. The tag is then
    looked up case-insensitively by name and created if it does not exist,
    so reading this property may write to the database.

    Returns the detected tag ids without duplicates, in order of first
    appearance (or an empty list).
    """
    tags = []

    # Ensure the barcodes have been read
    self.detect()

    for x in self.barcodes:
        for raw in x.value.split(","):
            try:
                tag_name = None
                mappings = settings.CONSUMER_TAG_BARCODE_MAPPING.items()
                for regex, sub in mappings:
                    if re.match(regex, raw, flags=re.IGNORECASE):
                        # An empty substitute means "use the barcode text
                        # unchanged" as the tag name.
                        tag_name = (
                            re.sub(regex, sub, raw, flags=re.IGNORECASE)
                            if sub
                            else raw
                        )
                        break

                if not tag_name:
                    continue

                tag = Tag.objects.get_or_create(
                    name__iexact=tag_name,
                    defaults={"name": tag_name},
                )[0]

                logger.debug(
                    f"Found Tag Barcode '{raw}', substituted "
                    f"to '{tag}' and mapped to "
                    f"tag #{tag.pk}.",
                )
                # The same tag may be present on several barcodes/pages;
                # report each id only once.
                if tag.pk not in tags:
                    tags.append(tag.pk)

            except (
                ValueError,
                # Unknown named group in a user-configured substitute
                IndexError,
                # Malformed user-configured regex or substitute template
                re.error,
                # More than one existing tag matches case-insensitively
                Tag.MultipleObjectsReturned,
            ) as e:
                # Best effort: a single bad barcode or mapping entry must not
                # abort processing of the remaining barcodes.
                logger.warning(
                    f"Failed to find or create TAG '{raw}' because: {e}",
                )

    return tags
def get_separation_pages(self) -> dict[int, bool]: def get_separation_pages(self) -> dict[int, bool]:
""" """
Search the parsed barcodes for separators and returns a dict of page Search the parsed barcodes for separators and returns a dict of page

View File

@ -833,6 +833,19 @@ CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300) CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
# Feature switch for assigning/creating tags from barcodes, read from the
# PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE environment variable.
# NOTE(review): the default comes from __get_boolean (not visible here); the
# documentation states it is false - confirm.
CONSUMER_ENABLE_TAG_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_TAG_BARCODE",
)
# Mapping of regex -> substitute template used to turn barcode text into tag
# names, parsed from a JSON object in PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING.
# Keys and values are expected to be strings (not enforced here).
# The default literal below is the JSON text {"TAG:(.*)": "\\g<1>"}, which
# decodes to the substitute \g<1> (i.e. the text following "TAG:").
# Invalid JSON in the environment variable raises while settings are loaded.
CONSUMER_TAG_BARCODE_MAPPING: Final[dict[str, str]] = dict(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_TAG_BARCODE_MAPPING",
'{"TAG:(.*)": "\\\\g<1>"}',
),
),
)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean( CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED", "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
) )