From dfaf59d34724526dde7b7fe8ddfe7a9636de8b05 Mon Sep 17 00:00:00 2001 From: "otxtan@gmail.com" Date: Sat, 18 May 2024 05:12:13 +0700 Subject: [PATCH] update: ocr pdf --- src/paperless_ocr_custom/parsers.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py index ec65b8451..5e4f39ebc 100644 --- a/src/paperless_ocr_custom/parsers.py +++ b/src/paperless_ocr_custom/parsers.py @@ -247,30 +247,28 @@ class RasterisedDocumentParser(DocumentParser): packet = io.BytesIO() can = canvas.Canvas(packet, pagesize=letter) - page_image = page.to_image() - page_image.save(packet, "JPG") pdfmetrics.registerFont(TTFont('Arial', font_path)) - + width_api_img = data["pages"][page_num]["dimensions"][1] + height_api_img = data["pages"][page_num]["dimensions"][0] + rolate_height = height_api_img /page_height + rolate_width = width_api_img /page_width for block in data["pages"][page_num]["blocks"]: for line in block.get("lines", []): for word in line.get("words", []): - x1 = word["bbox"][0][0] - y1 = word["bbox"][0][1] - x2 = word["bbox"][1][0] - y2 = word["bbox"][1][1] + x1 = word["bbox"][0][0] / float(rolate_width) + y1 = word["bbox"][0][1] / float(rolate_height) + x2 = word["bbox"][1][0] / float(rolate_width) + y2 = word["bbox"][1][1] / float(rolate_height) value = word["value"] - font_size = (y2-y1) * 72 / 96 - + font_size = float(y2-y1) * 72 / 96 x_center_coordinates =x2 - (x2-x1)/2 y_center_coordinates =y2 - (y2-y1)/2 w = can.stringWidth(value, font_name, font_size) - self.log.debug('w:', ) can.setFont('Arial', font_size) can.drawString(x_center_coordinates - w/2 , int(page_height) - y_center_coordinates - (font_size/2) , value) - + can.showPage() can.save() - packet.seek(0) new_pdf = PdfReader(packet) page.merge_page(new_pdf.pages[0])