From 74af6d8853f5af47a6f3dca97e36e091827fc361 Mon Sep 17 00:00:00 2001
From: "otxtan@gmail.com" <otxtan@gmail.com>
Date: Tue, 28 May 2024 16:39:03 +0700
Subject: [PATCH] fix: use rendered image size for PDF page dimensions

---
 src/paperless_ocr_custom/parsers.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/paperless_ocr_custom/parsers.py b/src/paperless_ocr_custom/parsers.py
index ecf0362f0..938c411cb 100644
--- a/src/paperless_ocr_custom/parsers.py
+++ b/src/paperless_ocr_custom/parsers.py
@@ -12,7 +12,7 @@ from typing import Optional
 
 from django.conf import settings
 import requests
-from PyPDF2 import PdfReader
+from PyPDF2 import PdfReader, PdfWriter
 from reportlab.pdfgen import canvas
 from reportlab.lib.pagesizes import letter
 from PIL import Image
@@ -217,7 +217,7 @@ class RasterisedDocumentParser(DocumentParser):
         font_name = 'Arial'
         data = self.ocr_file(input_path)
         if not data:
-                return
+            return
         font_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fonts', 'arial-font/arial.ttf')            
         with open(sidecar, "w") as txt_sidecar:
             txt_sidecar.write(data.get("content",""))
@@ -258,9 +258,10 @@ class RasterisedDocumentParser(DocumentParser):
                                        first_page=1,
                                        last_page=input_pdf.getNumPages()+1)
             can = canvas.Canvas(str(output_path), pagesize=letter)
-            for page_num, page in enumerate(input_pdf.pages):
-                page_height = input_pdf.pages[page_num].mediabox[3]
-                page_width = input_pdf.pages[page_num].mediabox[2]
+            for page_num, image in enumerate(images):
+                page_width, page_height = image.size
+                # page_height = input_pdf.pages[page_num].mediabox[3]
+                # page_width = input_pdf.pages[page_num].mediabox[2]
                 # set size new page
                 can.setPageSize((page_width, page_height))
                 byte_image = io.BytesIO()
@@ -274,13 +275,14 @@ class RasterisedDocumentParser(DocumentParser):
                 pdfmetrics.registerFont(TTFont('Arial', font_path))
                 width_api_img = data["pages"][page_num]["dimensions"][1]
                 height_api_img = data["pages"][page_num]["dimensions"][0]
+                # print(f'kich thuoc goc: height{page_height}, width{page_width}, kich thuoc api: height{height_api_img} width{width_api_img}')
                 rolate_height =  height_api_img /page_height
                 rolate_width = width_api_img /page_width
                 for block in data["pages"][page_num]["blocks"]:
                     for line in block.get("lines", []):
                         y1 = (line.get("bbox")[0][1] / float(rolate_height))
                         y2 = (line.get("bbox")[1][1] / float(rolate_height))
-                        font_size = (y2 - y1)  * 72 / 96
+                        font_size = math.floor((y2 - y1)  * 72 / 96)-2
                         y_center_coordinates = y2 - (y2 - y1)/2
                         for word in line.get("words", []):   
                             x1 = word["bbox"][0][0] / float(rolate_width)
@@ -294,7 +296,7 @@ class RasterisedDocumentParser(DocumentParser):
                             w = can.stringWidth(value, font_name, font_size)
                             can.setFont('Arial', font_size)
                             can.drawString(x_center_coordinates - w/2,
-                                           int(page_height) - y_center_coordinates - (font_size/3),
+                                           int(page_height) - y_center_coordinates - (font_size/2),
                                            value)            
                 can.drawImage(ImageReader(io.BytesIO(jpg_image)),
                               0, 0,