In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better

2022-11-29 13:19:16 -08:00
parent 15cba8e14d
commit a2b7687c3b
3 changed files with 57 additions and 1 deletions
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
    pass
 class RtlLanguageException(Exception):
     """
     Signals that text pulled out of a PDF was detected as a right-to-left
     language (e.g. Arabic, Hebrew or Persian), so the extracted text
     should be discarded instead of being used as-is.
     """
 class RasterisedDocumentParser(DocumentParser):
    """
    This parser uses Tesseract to try and get some text out of a rasterised
@@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
            stripped = post_process_text(pdfminer_extract_text(pdf_file))
            self.log("debug", f"Extracted text from PDF file {pdf_file}")
            # pdfminer.six does not handle RTL text correctly.
            # As a workaround, return no text for these languages so that the
            # document is OCR'd instead; OCRmyPDF/Tesseract handle RTL correctly.
            from langdetect import detect
            lang = detect(stripped)
            self.log("debug", f"Detected language {lang}")
            if lang in {
                "ar",  # Arabic
                "he",  # Hebrew,
                "fa",  # Persian
            }:
                raise RtlLanguageException()
            return stripped
        except RtlLanguageException:
            self.log("warning", f"Detected RTL language {lang}")
            return None
        except Exception:
            # TODO catch all for various issues with PDFminer.six.
            #  If PDFminer fails, fall back to OCR.
@@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
            )
            if original_has_text:
                self.text = text_original
-        except (NoTextFoundException, InputFileError) as e:
+        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
            self.log(
                "warning",
                f"Encountered an error while running OCR: {str(e)}. "
--- a/src/paperless_tesseract/tests/samples/rtl-test.pdf
+++ b/src/paperless_tesseract/tests/samples/rtl-test.pdf
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
            self.assertNotIn("deskew", params)
    def test_rtl_language_detection(self):
        """
        GIVEN:
            - A PDF sample (rtl-test.pdf) containing text in an RTL language
        WHEN:
            - The document is parsed
        THEN:
            - The parser ends up with some text
            - The OCRmyPDF parameters are constructed twice, the last time
              with safe_fallback enabled
        """
        parser = RasterisedDocumentParser(None)
        sample_path = os.path.join(self.SAMPLE_FILES, "rtl-test.pdf")

        with mock.patch.object(
            parser,
            "construct_ocrmypdf_parameters",
            wraps=parser.construct_ocrmypdf_parameters,
        ) as wrapped:
            parser.parse(sample_path, "application/pdf")

            # Verifying that the RTL text itself comes back correct is not
            # practical: it would need tesseract-ocr-ara installed for everyone
            # running the test suite.  Instead, this provides coverage and
            # checks, as far as possible, that the forced OCR pass happened.
            self.assertIsNotNone(parser.get_text())

            # While the patch is active, `wrapped` is the very mock installed
            # as parser.construct_ocrmypdf_parameters.
            self.assertEqual(wrapped.call_count, 2)

            # Only the keyword arguments of the final call are inspected
            last_call_kwargs = wrapped.call_args.kwargs
            self.assertTrue(
                last_call_kwargs["safe_fallback"],
            )
 class TestParserFileTypes(DirectoriesMixin, TestCase):