fix: call ocr api with retries

This commit is contained in:
otxtan@gmail.com 2024-05-28 13:52:03 +07:00
parent 5d933f1f90
commit 8df4f82c4f

View File

@ -6,6 +6,7 @@ import re
import shutil import shutil
import tempfile import tempfile
from pathlib import Path from pathlib import Path
import time
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from typing import Optional from typing import Optional
@ -148,24 +149,37 @@ class RasterisedDocumentParser(DocumentParser):
except Exception as e: except Exception as e:
self.log.warning(f"Error while calculating DPI for image {image}: {e}") self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None return None
# call api
def call_ocr_api_with_retries(self,url, headers, params, max_retries=5, delay=5, timeout=100):
retries = 0
data_ocr = None
while retries < max_retries:
try:
response_ocr = requests.post(url, headers=headers, params=params, timeout=timeout)
if response_ocr.status_code == 200:
data_ocr = response_ocr.json()
return data_ocr
else:
logging.error('OCR error response: %s', response_ocr.text)
retries += 1
time.sleep(delay)
except requests.exceptions.Timeout:
logging.warning('OCR request timed out. Retrying...')
retries += 1
time.sleep(delay)
except requests.exceptions.RequestException as e:
logging.error('OCR request failed: %s', e)
retries += 1
time.sleep(delay)
logging.error('Max retries reached. OCR request failed.')
return None
# get ocr file img/pdf # get ocr file img/pdf
def ocr_file(self,path_file): def ocr_file(self,path_file):
# get text from api
# ocr_custom_username = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_USERNAME"]
# ocr_custom_password = settings.TCGROUP_OCR_CUSTOM["ACCOUNT"]["OCR_CUSTOM_PASSWORD"]
# url_login = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_LOGIN"]
# data = {
# 'username': ocr_custom_username,
# 'password': ocr_custom_password
# }
# response_login = requests.post(url_login, data=data)
# access_token = ''
# if response_login.status_code == 200:
# response_data = response_login.json()
# access_token = response_data.get('access_token','')
# else:
# logging.error('login: ', response_login.status_code)
k = ApplicationConfiguration.objects.filter().first() k = ApplicationConfiguration.objects.filter().first()
access_token = k.ocr_key access_token = k.ocr_key
# upload file # upload file
@ -179,7 +193,6 @@ class RasterisedDocumentParser(DocumentParser):
pdf_data = file.read() pdf_data = file.read()
response_upload = requests.post(url_upload_file, files={'file': (str(path_file).split("/")[-1], pdf_data)}, headers=headers) response_upload = requests.post(url_upload_file, files={'file': (str(path_file).split("/")[-1], pdf_data)}, headers=headers)
# logging.debug('pdf file',response_upload)
if response_upload.status_code == 200: if response_upload.status_code == 200:
get_file_id = response_upload.json().get('file_id','') get_file_id = response_upload.json().get('file_id','')
else: else:
@ -188,13 +201,15 @@ class RasterisedDocumentParser(DocumentParser):
# ocr by file_id # ocr by file_id
params = {'file_id': get_file_id} params = {'file_id': get_file_id}
url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"] url_ocr_pdf_by_fileid = settings.TCGROUP_OCR_CUSTOM["URL"]["URL_OCR_BY_FILEID"]
response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params) data_ocr = self.call_ocr_api_with_retries(url_ocr_pdf_by_fileid, headers, params, 5, 5, 100)
data_ocr = None # response_ocr = requests.post(url_ocr_pdf_by_fileid, headers=headers, params=params)
# logging.error('ocr: ', response_ocr.status_code) # data_ocr = None
if response_ocr.status_code == 200: # # logging.error('ocr: ', response_ocr.status_code)
data_ocr = response_ocr.json() # if response_ocr.status_code == 200:
else: # data_ocr = response_ocr.json()
logging.error('ocr: ', response_ocr.text) # else:
# logging.error('ocr: ', response_ocr.text)
return data_ocr return data_ocr
@ -530,7 +545,6 @@ class RasterisedDocumentParser(DocumentParser):
try: try:
self.log.debug(f"Calling OCRmyPDF with args: {args}") self.log.debug(f"Calling OCRmyPDF with args: {args}")
# ocrmypdf.ocr(**args) # ocrmypdf.ocr(**args)
self.log.info("gia tri document_path: ", document_path)
self.ocr_img_or_pdf(document_path, mime_type,**args) self.ocr_img_or_pdf(document_path, mime_type,**args)
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS: if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
self.archive_path = archive_path self.archive_path = archive_path