From 405acc7f7b57b789cda7296daf16c28569e16338 Mon Sep 17 00:00:00 2001
From: Fabien Dubuy <59292746+fdubuy@users.noreply.github.com>
Date: Tue, 25 Jun 2024 21:41:08 +0200
Subject: [PATCH] Support more characters known to be in the month names of other languages

---
Review notes (free text after the three-dash line; not part of the commit
message and skipped by patch tools):

  Purpose: rewrites DATE_REGEX in src/documents/parsers.py. Every [0-9] becomes
  \d, the month-name class [a-zA-ZÀ-ÖØ-öø-ÿ] becomes an explicit lowercase
  letter list, and the pattern is now compiled with re.IGNORECASE.

  * \d in a str pattern compiled without re.ASCII matches any Unicode decimal
    digit, so it accepts more than the [0-9] it replaces.
  * The explicit letter list adds letters outside the old ranges
    (č ž ř ě ń ź Ş ı ğ) but drops Latin-1 letters the old ranges covered
    (for example ñ, ô, õ). Confirm no supported month name needs them.
  * NOTE(review): (?!=([_-])) is unchanged by this patch. As written it is a
    negative lookahead for a literal "=" followed by "_" or "-". Confirm
    whether a lookbehind was intended; capture-group numbering depends on it.
  * Line breaks were restored from a single-line copy. The 4-space indent
    inside re.compile( is assumed, and the blank context lines at both ends
    of the hunk are inferred from the "-37,13 +37,14" counts. Verify against
    the target file before applying.

 src/documents/parsers.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/documents/parsers.py b/src/documents/parsers.py
index ae19d5b53..72aea0387 100644
--- a/src/documents/parsers.py
+++ b/src/documents/parsers.py
@@ -37,13 +37,14 @@ from documents.utils import run_subprocess
 
 # TODO: isn't there a date parsing library for this?
 DATE_REGEX = re.compile(
-    r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-ZÀ-ÖØ-öø-ÿ]{3,9} [0-9]{4}|[a-zA-ZÀ-ÖØ-öø-ÿ]{3,9} [0-9]{1,2}, [0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|"
-    r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-ZÀ-ÖØ-öø-ÿ]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))",
+    r"(\b|(?!=([_-])))(\d{1,2})[\.\/-](\d{1,2})[\.\/-](\d{4}|\d{2})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{4}|\d{2})[\.\/-](\d{1,2})[\.\/-](\d{1,2})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{1,2}[\. ]+[a-zéûäëčžúřěáíóńźçŞığü]{3,9} \d{4}|[a-zéûäëčžúřěáíóńźçŞığü]{3,9} \d{1,2}, \d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{1,2}, (\d{4}))(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))([^\W\d_]{3,9} \d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\d{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-]\d{4})(\b|(?=([_-])))|"
+    r"(\b|(?!=([_-])))(\b\d{1,2}[ \.\/-][a-zéûäëčžúřěáíóńźçŞığü]{3}[ \.\/-]\d{4})(\b|(?=([_-])))",
+    re.IGNORECASE,
 )
 
 