fix: call ocr api with retries

2024-05-28 13:52:03 +07:00
parent 5d933f1f90
commit 8df4f82c4f
1 changed files with 39 additions and 25 deletions
--- a/src/paperless_ocr_custom/parsers.py
+++ b/src/paperless_ocr_custom/parsers.py
@@ -6,6 +6,7 @@ import re
 import shutil
 import tempfile
 from pathlib import Path
+import time
 from typing import TYPE_CHECKING
 from typing import Optional

@@ -148,24 +149,37 @@ class RasterisedDocumentParser(DocumentParser):
        except Exception as e:
            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
            return None
+        
+    # call api 
+    def call_ocr_api_with_retries(self,url, headers, params, max_retries=5, delay=5, timeout=100):
+        retries = 0
+        data_ocr = None
+        
+        while retries < max_retries:
+            try:
+                response_ocr = requests.post(url, headers=headers, params=params, timeout=timeout)
+                if response_ocr.status_code == 200:
+                    data_ocr = response_ocr.json()
+                    return data_ocr
+                else:
+                    logging.error('OCR error response: %s', response_ocr.text)
+                    retries += 1
+                    time.sleep(delay)
+            except requests.exceptions.Timeout:
+                logging.warning('OCR request timed out. Retrying...')
+                retries += 1
+                time.sleep(delay)
+            except requests.exceptions.RequestException as e:
+                logging.error('OCR request failed: %s', e)
+                retries += 1
+                time.sleep(delay)
+    
+        logging.error('Max retries reached. OCR request failed.')
+        return None
+    
    # get ocr file img/pdf
    def ocr_file(self,path_file):
-        # get text from api 
-        # ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"]
-        # ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"]
-        # url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"]
-        # data = {
-        #     'username': ocr_custom_username,
-        #     'password': ocr_custom_password
-        # }
-        # response_login = requests.post(url_login, data=data)
-        # access_token = ''
-        # if response_login.status_code == 200:
-        #     response_data = response_login.json()
-        #     access_token = response_data.get('access_token','')
-        # else:
-        #     logging.error('login: ', response_login.status_code)
-        
+
        k = ApplicationConfiguration.objects.filter().first()
        access_token = k.ocr_key
        # upload file
@@ -179,7 +193,6 @@ class RasterisedDocumentParser(DocumentParser):
            pdf_data = file.read()
        
        response_upload = requests.post(url_upload_file, files={'file': (str(path_file).split("/")[-1], pdf_data)}, headers=headers)
-        # logging.debug('pdf file',response_upload)
        if response_upload.status_code == 200:
            get_file_id = response_upload.json().get('file_id','')
        else:
@@ -188,13 +201,15 @@ class RasterisedDocumentParser(DocumentParser):
        # ocr by file_id
        params = {'file_id': get_file_id}
        url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
-        response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params)
-        data_ocr = None
-        # logging.error('ocr: ', response_ocr.status_code)
-        if response_ocr.status_code == 200:
-            data_ocr = response_ocr.json()
-        else:
-            logging.error('ocr: ', response_ocr.text)
+        data_ocr = self.call_ocr_api_with_retries(url_ocr_pdf_by_fileid, headers, params, 5, 5, 100)
+        # response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params)
+        # data_ocr = None
+        # # logging.error('ocr: ', response_ocr.status_code)
+        # if response_ocr.status_code == 200:
+        #     data_ocr = response_ocr.json()
+        # else:
+        #     logging.error('ocr: ', response_ocr.text)
+
        return data_ocr
    

@@ -530,7 +545,6 @@ class RasterisedDocumentParser(DocumentParser):
        try:
            self.log.debug(f"Calling OCRmyPDF with args: {args}")
            # ocrmypdf.ocr(**args)
-            self.log.info("gia tri document_path: ", document_path)
            self.ocr_img_or_pdf(document_path, mime_type,**args)
            if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                self.archive_path = archive_path