At least partially working for the tesseract parser

This commit is contained in:
Trenton H
2023-12-07 15:45:50 -08:00
parent 9867db9616
commit 30281bd593
9 changed files with 368 additions and 70 deletions

View File

@@ -420,7 +420,7 @@ class Consumer(LoggingMixin):
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
progress_callback=progress_callback,
)
self.log.debug(f"Parser: {type(document_parser).__name__}")

View File

@@ -125,8 +125,10 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
if not options:
return None
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
# Return the parser with the highest weight.
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
return best_parser["parser"]
def run_convert(
@@ -318,6 +320,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
self.parser_settings = self.get_settings()
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -330,6 +333,10 @@ class DocumentParser(LoggingMixin):
if self.progress_callback:
self.progress_callback(current_progress, max_progress)
def get_settings(self):
    """
    Hook for supplying parser-specific settings.

    This implementation provides no settings of its own and always
    raises, so a concrete implementation must be supplied elsewhere.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
"""
Helper utility for reading from a file, and handling a problem with its