At least partially working for the tesseract parser

This commit is contained in:
Trenton H
2023-12-07 15:45:50 -08:00
parent 9867db9616
commit 30281bd593
9 changed files with 368 additions and 70 deletions

View File

@@ -50,6 +50,12 @@ def __get_boolean(key: str, default: str = "NO") -> bool:
return bool(os.getenv(key, default).lower() in ("yes", "y", "1", "t", "true"))
def __get_optional_boolean(key: str) -> Optional[bool]:
if key in os.environ:
return __get_boolean(key)
return None
def __get_int(key: str, default: int) -> int:
"""
Return an integer value based on the environment variable or a default
@@ -57,6 +63,12 @@ def __get_int(key: str, default: int) -> int:
return int(os.getenv(key, default))
def __get_optional_int(key: str) -> Optional[int]:
if key in os.environ:
return __get_int(key, -1)
return None
def __get_float(key: str, default: float) -> float:
"""
Return an integer value based on the environment variable or a default
@@ -64,6 +76,12 @@ def __get_float(key: str, default: float) -> float:
return float(os.getenv(key, default))
def __get_optional_float(key: str) -> Optional[float]:
if key in os.environ:
return __get_float(key, -1)
return None
def __get_path(
key: str,
default: Optional[Union[PathLike, str]] = None,
@@ -796,11 +814,10 @@ CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PATCHT",
)
consumer_barcode_scanner_tmp: Final[str] = os.getenv(
CONSUMER_BARCODE_SCANNER: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_SCANNER",
"PYZBAR",
)
CONSUMER_BARCODE_SCANNER = consumer_barcode_scanner_tmp.upper()
).upper()
CONSUMER_ENABLE_ASN_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE",
@@ -811,15 +828,12 @@ CONSUMER_ASN_BARCODE_PREFIX: Final[str] = os.getenv(
"ASN",
)
CONSUMER_BARCODE_UPSCALE: Final[float] = float(
os.getenv("PAPERLESS_CONSUMER_BARCODE_UPSCALE", 0.0),
CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
"PAPERLESS_CONSUMER_BARCODE_UPSCALE",
0.0,
)
CONSUMER_BARCODE_DPI: Final[str] = int(
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
@@ -834,7 +848,7 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
@@ -848,28 +862,29 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_IMAGE_DPI = __get_optional_int("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_DESKEW: Final[bool] = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES: Final[bool] = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float(
"PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD",
12.0,
)
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
)
OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
"PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
"RGB",
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")