From 8df4f82c4f20c44fd8f14a7241638711952f8918 Mon Sep 17 00:00:00 2001
From: "otxtan@gmail.com" <otxtan@gmail.com>
Date: Tue, 28 May 2024 13:52:03 +0700
Subject: [PATCH] fix: call ocr api with retries

---
 src/paperless_ocr_custom/parsers.py | 64 ++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py
index 3448151a9..ecf0362f0 100644
--- a/src/paperless_ocr_custom/parsers.py
+++ b/src/paperless_ocr_custom/parsers.py
@@ -6,6 +6,7 @@ import re
 import shutil
 import tempfile
 from pathlib import Path
+import time
 from typing import TYPE_CHECKING
 from typing import Optional
 
@@ -148,24 +149,37 @@ class RasterisedDocumentParser(DocumentParser):
         except Exception as e:
             self.log.warning(f"Error while calculating DPI for image {image}: {e}")
             return None
+        
+    # call api 
+    def call_ocr_api_with_retries(self,url, headers, params, max_retries=5, delay=5, timeout=100):
+        retries = 0
+        data_ocr = None
+        
+        while retries < max_retries:
+            try:
+                response_ocr = requests.post(url, headers=headers, params=params, timeout=timeout)
+                if response_ocr.status_code == 200:
+                    data_ocr = response_ocr.json()
+                    return data_ocr
+                else:
+                    logging.error('OCR error response: %s', response_ocr.text)
+                    retries += 1
+                    time.sleep(delay)
+            except requests.exceptions.Timeout:
+                logging.warning('OCR request timed out. Retrying...')
+                retries += 1
+                time.sleep(delay)
+            except requests.exceptions.RequestException as e:
+                logging.error('OCR request failed: %s', e)
+                retries += 1
+                time.sleep(delay)
+    
+        logging.error('Max retries reached. OCR request failed.')
+        return None
+    
     # get ocr file img/pdf
     def ocr_file(self,path_file):
-        # get text from api 
-        # ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"]
-        # ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"]
-        # url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"]
-        # data = {
-        #     'username': ocr_custom_username,
-        #     'password': ocr_custom_password
-        # }
-        # response_login = requests.post(url_login, data=data)
-        # access_token = ''
-        # if response_login.status_code == 200:
-        #     response_data = response_login.json()
-        #     access_token = response_data.get('access_token','')
-        # else:
-        #     logging.error('login: ', response_login.status_code)
-        
+
         k = ApplicationConfiguration.objects.filter().first()
         access_token = k.ocr_key
         # upload file
@@ -179,7 +193,6 @@ class RasterisedDocumentParser(DocumentParser):
             pdf_data = file.read()
         
         response_upload = requests.post(url_upload_file, files={'file': (str(path_file).split("/")[-1], pdf_data)}, headers=headers)
-        # logging.debug('pdf file',response_upload)
         if response_upload.status_code == 200:
             get_file_id = response_upload.json().get('file_id','')
         else:
@@ -188,13 +201,15 @@ class RasterisedDocumentParser(DocumentParser):
         # ocr by file_id
         params = {'file_id': get_file_id}
         url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
-        response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params)
-        data_ocr = None
-        # logging.error('ocr: ', response_ocr.status_code)
-        if response_ocr.status_code == 200:
-            data_ocr = response_ocr.json()
-        else:
-            logging.error('ocr: ', response_ocr.text)
+        data_ocr = self.call_ocr_api_with_retries(url_ocr_pdf_by_fileid, headers, params, 5, 5, 100)
+        # response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params)
+        # data_ocr = None
+        # # logging.error('ocr: ', response_ocr.status_code)
+        # if response_ocr.status_code == 200:
+        #     data_ocr = response_ocr.json()
+        # else:
+        #     logging.error('ocr: ', response_ocr.text)
+
         return data_ocr
     
 
@@ -530,7 +545,6 @@ class RasterisedDocumentParser(DocumentParser):
         try:
             self.log.debug(f"Calling OCRmyPDF with args: {args}")
             # ocrmypdf.ocr(**args)
-            self.log.info("gia tri document_path: ", document_path)
             self.ocr_img_or_pdf(document_path, mime_type,**args)
             if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
                 self.archive_path = archive_path