In the case of an RTL language being extracted via pdfminer.six, fall back to forced OCR, which handles RTL text better
This commit is contained in:
		
							parent
							
								
									15cba8e14d
								
							
						
					
					
						commit
						a2b7687c3b
					
				@ -13,6 +13,10 @@ class NoTextFoundException(Exception):
 | 
				
			|||||||
    pass
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					class RtlLanguageException(Exception):
 | 
				
			||||||
 | 
					    pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class RasterisedDocumentParser(DocumentParser):
 | 
					class RasterisedDocumentParser(DocumentParser):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    This parser uses Tesseract to try and get some text out of a rasterised
 | 
					    This parser uses Tesseract to try and get some text out of a rasterised
 | 
				
			||||||
@ -125,7 +129,26 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            stripped = post_process_text(pdfminer_extract_text(pdf_file))
 | 
					            stripped = post_process_text(pdfminer_extract_text(pdf_file))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
            self.log("debug", f"Extracted text from PDF file {pdf_file}")
 | 
					            self.log("debug", f"Extracted text from PDF file {pdf_file}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # pdfminer.six does not handle RTL text
 | 
				
			||||||
 | 
					            # as a hack, for some languages, return no text, to force
 | 
				
			||||||
 | 
					            # OCR; OCRmyPDF/Tesseract do handle this correctly
 | 
				
			||||||
 | 
					            from langdetect import detect
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            lang = detect(stripped)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            self.log("debug", f"Detected language {lang}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            if lang in {
 | 
				
			||||||
 | 
					                "ar",  # Arabic
 | 
				
			||||||
 | 
					                "he",  # Hebrew,
 | 
				
			||||||
 | 
					                "fa",  # Persian
 | 
				
			||||||
 | 
					            }:
 | 
				
			||||||
 | 
					                raise RtlLanguageException()
 | 
				
			||||||
            return stripped
 | 
					            return stripped
 | 
				
			||||||
 | 
					        except RtlLanguageException:
 | 
				
			||||||
 | 
					            self.log("warning", f"Detected RTL language {lang}")
 | 
				
			||||||
 | 
					            return None
 | 
				
			||||||
        except Exception:
 | 
					        except Exception:
 | 
				
			||||||
            # TODO catch all for various issues with PDFminer.six.
 | 
					            # TODO catch all for various issues with PDFminer.six.
 | 
				
			||||||
            #  If PDFminer fails, fall back to OCR.
 | 
					            #  If PDFminer fails, fall back to OCR.
 | 
				
			||||||
@ -305,7 +328,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
				
			|||||||
            )
 | 
					            )
 | 
				
			||||||
            if original_has_text:
 | 
					            if original_has_text:
 | 
				
			||||||
                self.text = text_original
 | 
					                self.text = text_original
 | 
				
			||||||
        except (NoTextFoundException, InputFileError) as e:
 | 
					        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
 | 
				
			||||||
            self.log(
 | 
					            self.log(
 | 
				
			||||||
                "warning",
 | 
					                "warning",
 | 
				
			||||||
                f"Encountered an error while running OCR: {str(e)}. "
 | 
					                f"Encountered an error while running OCR: {str(e)}. "
 | 
				
			||||||
 | 
				
			|||||||
							
								
								
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/rtl-test.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_tesseract/tests/samples/rtl-test.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							@ -588,6 +588,39 @@ class TestParser(DirectoriesMixin, TestCase):
 | 
				
			|||||||
            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
					            params = parser.construct_ocrmypdf_parameters("", "", "", "")
 | 
				
			||||||
            self.assertNotIn("deskew", params)
 | 
					            self.assertNotIn("deskew", params)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def test_rtl_language_detection(self):
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        GIVEN:
 | 
				
			||||||
 | 
					            - File with text in an RTL language
 | 
				
			||||||
 | 
					        WHEN:
 | 
				
			||||||
 | 
					            - Document is parsed
 | 
				
			||||||
 | 
					        THEN:
 | 
				
			||||||
 | 
					            - Text from the document is extracted
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        parser = RasterisedDocumentParser(None)
 | 
				
			||||||
 | 
					        with mock.patch.object(
 | 
				
			||||||
 | 
					            parser,
 | 
				
			||||||
 | 
					            "construct_ocrmypdf_parameters",
 | 
				
			||||||
 | 
					            wraps=parser.construct_ocrmypdf_parameters,
 | 
				
			||||||
 | 
					        ) as wrapped:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            parser.parse(
 | 
				
			||||||
 | 
					                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
 | 
				
			||||||
 | 
					                "application/pdf",
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            # There isn't a good way to actually check that RTL text is returned correctly,
 | 
				
			||||||
 | 
					            #  as it would require tesseract-ocr-ara installed for everyone running the
 | 
				
			||||||
 | 
					            #  test suite.  This test does provide the coverage though and attempts to ensure
 | 
				
			||||||
 | 
					            # the forced OCR happens
 | 
				
			||||||
 | 
					            self.assertIsNotNone(parser.get_text())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
 | 
				
			||||||
 | 
					            # Check the last call kwargs
 | 
				
			||||||
 | 
					            self.assertTrue(
 | 
				
			||||||
 | 
					                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
 | 
				
			||||||
 | 
					            )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
					class TestParserFileTypes(DirectoriesMixin, TestCase):
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user