fix: update ocr

This commit is contained in:
otxtan@gmail.com 2024-05-19 18:01:38 +07:00
parent dfaf59d347
commit 58a0bd55c0

View File

@ -18,6 +18,10 @@ from PIL import Image,ImageDraw,ImageFont
from reportlab.pdfgen.canvas import Canvas from reportlab.pdfgen.canvas import Canvas
from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase import pdfmetrics
from pdf2image import convert_from_path
from reportlab.lib.utils import ImageReader
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import Paragraph
from documents.parsers import DocumentParser from documents.parsers import DocumentParser
from documents.parsers import ParseError from documents.parsers import ParseError
@ -197,9 +201,10 @@ class RasterisedDocumentParser(DocumentParser):
logging.error('ocr: ', response_ocr.text) logging.error('ocr: ', response_ocr.text)
return data_ocr return data_ocr
def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path): def render_pdf_ocr(self, sidecar, mime_type, input_path, output_path):
font_name = 'Arial' font_name = 'Arial'
c = None
data = self.ocr_file(input_path) data = self.ocr_file(input_path)
if not data: if not data:
return return
@ -207,6 +212,7 @@ class RasterisedDocumentParser(DocumentParser):
with open(sidecar, "w") as txt_sidecar: with open(sidecar, "w") as txt_sidecar:
txt_sidecar.write(data.get("content","")) txt_sidecar.write(data.get("content",""))
if self.is_image(mime_type): if self.is_image(mime_type):
c = None
img = Image.open(input_path) img = Image.open(input_path)
width, height = img.size width, height = img.size
c = canvas.Canvas(str(output_path), pagesize=(width, height)) c = canvas.Canvas(str(output_path), pagesize=(width, height))
@ -222,31 +228,28 @@ class RasterisedDocumentParser(DocumentParser):
y2 = word["bbox"][1][1] y2 = word["bbox"][1][1]
value = word["value"] value = word["value"]
font_size = (y2-y1) * 72 / 96 font_size = (y2-y1) * 72 / 96
# font = ImageFont.truetype(font_path, font_size)
# text = "Hello, world!"
# text_width, text_height = font.textsize(text)
# print(f"Text width: {text_width}, Text height: {text_height}")
x_center_coordinates =x2 - (x2-x1)/2 x_center_coordinates =x2 - (x2-x1)/2
y_center_coordinates =y2 - (y2-y1)/2 y_center_coordinates =y2 - (y2-y1)/2
w = c.stringWidth(value, font_name, font_size) w = c.stringWidth(value, font_name, font_size)
self.log.debug('w:', )
c.setFont('Arial', font_size) c.setFont('Arial', font_size)
c.drawString(x_center_coordinates - w/2 , height - y_center_coordinates - (font_size/2) , value) c.drawString(x_center_coordinates - w/2 , height - y_center_coordinates - (font_size/2) , value)
c.drawImage(input_path, 0, 0, width=width, height=height) c.drawImage(input_path, 0, 0, width=width, height=height)
c.save() c.save()
else: else:
shutil.copy(str(input_path), str(output_path)) shutil.copy(str(input_path), str(output_path))
output_pdf = PdfWriter() # output_pdf = PdfWriter()
input_pdf = PdfReader(output_path) input_pdf = PdfReader(input_path)
# self.log.info('gia tri get number pages',input_pdf.getNumPages())
images = convert_from_path(input_path, first_page=1, last_page=input_pdf.getNumPages()+1)
can = canvas.Canvas(str(output_path), pagesize=letter)
for page_num, page in enumerate(input_pdf.pages): for page_num, page in enumerate(input_pdf.pages):
page_height = input_pdf.pages[page_num].mediabox[3] page_height = input_pdf.pages[page_num].mediabox[3]
page_width = input_pdf.pages[page_num].mediabox[2] page_width = input_pdf.pages[page_num].mediabox[2]
byte_image = io.BytesIO()
packet = io.BytesIO() images[page_num].save(byte_image, format='JPEG')
can = canvas.Canvas(packet, pagesize=letter) jpg_image = byte_image.getvalue()
self.log.info('gia tri page height',page_height)
can.drawImage(ImageReader(io.BytesIO(jpg_image)), 0, 0, width=float(page_width), height=float(page_height))
pdfmetrics.registerFont(TTFont('Arial', font_path)) pdfmetrics.registerFont(TTFont('Arial', font_path))
width_api_img = data["pages"][page_num]["dimensions"][1] width_api_img = data["pages"][page_num]["dimensions"][1]
height_api_img = data["pages"][page_num]["dimensions"][0] height_api_img = data["pages"][page_num]["dimensions"][0]
@ -265,90 +268,15 @@ class RasterisedDocumentParser(DocumentParser):
y_center_coordinates =y2 - (y2-y1)/2 y_center_coordinates =y2 - (y2-y1)/2
w = can.stringWidth(value, font_name, font_size) w = can.stringWidth(value, font_name, font_size)
can.setFont('Arial', font_size) can.setFont('Arial', font_size)
can.drawString(x_center_coordinates - w/2 , int(page_height) - y_center_coordinates - (font_size/2) , value) can.drawString(x_center_coordinates - w/2 , int(page_height) - y_center_coordinates - (font_size/3) , value)
can.showPage() can.showPage()
can.save() can.save()
packet.seek(0)
new_pdf = PdfReader(packet)
page.merge_page(new_pdf.pages[0])
output_pdf.add_page(page)
output_pdf.write(output_path)
shutil.copyfile(str(output_path), "/home/otxtan/python/opt/paperless/pdfa.pdf") shutil.copyfile(str(output_path), "/home/otxtan/python/opt/paperless/pdfa.pdf")
# create pdf from image
def pdf_create_layer_text(self, sidecar, input_path, output_path):
    """Build an in-memory PDF that contains only the OCR'd words.

    The OCR result returned by ``self.ocr_file(input_path)`` is written as
    plain text to *sidecar*, and every word is drawn onto a reportlab
    canvas at the first corner of its bounding box.  One canvas page is
    emitted per entry of ``data["pages"]``.

    Args:
        sidecar: Path of the text file that receives ``data["content"]``.
        input_path: Document handed to ``self.ocr_file``.
        output_path: Not used by this method; kept so existing callers
            keep working.

    Returns:
        A ``PdfFileReader`` wrapping the in-memory buffer.
    """
    packet = io.BytesIO()
    c = Canvas(packet, pagesize=letter)

    data = self.ocr_file(input_path)
    if not data:
        # NOTE(review): nothing has been saved into `packet` yet, so the
        # reader wraps an empty stream here - confirm callers cope with it.
        return PdfFileReader(packet)

    # Write the recognised text to the sidecar file.  The encoding is
    # explicit so non-ASCII OCR text does not depend on the host locale.
    with open(sidecar, "w", encoding="utf-8") as txt_sidecar:
        txt_sidecar.write(data.get("content", ""))

    for page in data.get("pages", []):
        for block in page.get("blocks", []):
            for word in block.get("words", []):
                # Only the first bbox corner is used as the text anchor.
                x1 = word["bbox"][0][0]
                y1 = word["bbox"][0][1]
                c.drawString(x1, y1, word["value"])
        # NOTE(review): the flattened source does not show whether this call
        # sat inside the "blocks" check; it is kept at page level so pages
        # without blocks still produce a page - confirm.
        c.showPage()

    c.save()
    packet.seek(0)
    return PdfFileReader(packet)
# Create layer 2 (content)
def merge_pdfs(self, mime_type, input_path, output_path, overlay):
    """Merge *overlay* page by page onto the input and write *output_path*.

    When ``self.is_image(mime_type)`` is true the image is first rendered
    onto a single in-memory PDF page sized to the image; otherwise
    *input_path* is opened directly as a PDF.

    The earlier revision also dumped copies of the result to a hard-coded
    directory under ``/home/otxtan``; those writes are removed because they
    raise on any machine where that directory does not exist.

    Args:
        mime_type: MIME type passed to ``self.is_image``.
        input_path: Source image or PDF.
        output_path: Destination of the merged PDF.
        overlay: Reader object providing ``getNumPages()`` and
            ``getPage(i)``; its pages are merged onto the base pages with
            the same index.
    """
    if self.is_image(mime_type):
        packet = io.BytesIO()
        # Only the size is needed, so release the image handle right away.
        with Image.open(input_path) as img:
            width, height = img.size
        c = canvas.Canvas(packet, pagesize=(width, height))
        c.drawImage(input_path, 0, 0, width=width, height=height)
        c.save()
        packet.seek(0)
        pdf = PdfFileReader(packet)
    else:
        pdf = PdfFileReader(input_path)

    overlay_page_count = overlay.getNumPages()
    pdf_writer = PdfFileWriter()
    for page_number in range(pdf.getNumPages()):
        page = pdf.getPage(page_number)
        # Base pages beyond the end of the overlay are copied unchanged
        # instead of raising on the missing overlay page.
        if page_number < overlay_page_count:
            page.mergePage(overlay.getPage(page_number))
        pdf_writer.addPage(page)

    with open(output_path, 'wb') as out:
        pdf_writer.write(out)
def ocr_img_or_pdf(self, document_path, mime_type, sidecar, output_file, **kwargs):
    """Log the MIME type and delegate to ``self.render_pdf_ocr``.

    Args:
        document_path: Input document; forwarded as the render input path.
        mime_type: MIME type of the document.
        sidecar: Path of the text sidecar file.
        output_file: Path of the PDF to produce.
        **kwargs: Accepted for call compatibility; not used here.
    """
    # The value needs a %s placeholder: logging applies `msg % args`, and a
    # message without one cannot be formatted, so the value was never logged.
    self.log.info('mime_type: %s', mime_type)
    self.render_pdf_ocr(sidecar, mime_type, document_path, output_file)