Initial conversion of the smaller tests into pytest format

This commit is contained in:
Trenton H 2024-06-26 08:32:08 -07:00
parent 1b9cf5121b
commit fb4d1f2b53
7 changed files with 223 additions and 165 deletions

View File

@ -1,4 +1,4 @@
import os from pathlib import Path
from django.conf import settings from django.conf import settings
from PIL import Image from PIL import Image
@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser):
logging_name = "paperless.parsing.text" logging_name = "paperless.parsing.text"
def get_thumbnail(self, document_path, mime_type, file_name=None): def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
text = self.read_file_handle_unicode_errors(document_path) text = self.read_file_handle_unicode_errors(document_path)
img = Image.new("RGB", (500, 700), color="white") img = Image.new("RGB", (500, 700), color="white")
@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser):
) )
draw.text((5, 5), text, font=font, fill="black") draw.text((5, 5), text, font=font, fill="black")
out_path = os.path.join(self.tempdir, "thumb.webp") out_path = self.tempdir / "thumb.webp"
img.save(out_path, format="WEBP") img.save(out_path, format="WEBP")
return out_path return out_path

View File

@ -0,0 +1,30 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_text.parsers import TextDocumentParser
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Return the resolved path of the ``samples`` directory beside this file."""
    here = Path(__file__).parent
    samples = here / "samples"
    return samples.resolve()
@pytest.fixture()
def text_parser() -> Generator[TextDocumentParser, None, None]:
    """
    Yield a fresh TextDocumentParser for a single test.

    The parser's ``cleanup()`` is always called once the test finishes,
    whether it passed or failed.
    """
    # Construct the parser *before* entering ``try``: if the constructor
    # raised inside it, ``finally`` would reference an unbound ``parser`` and
    # replace the real error with an UnboundLocalError.
    parser = TextDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_txt_file(sample_dir: Path) -> Path:
    """Return the path to ``test.txt`` inside the samples directory."""
    return sample_dir.joinpath("test.txt")
@pytest.fixture(scope="session")
def malformed_txt_file(sample_dir: Path) -> Path:
    """Return the path to ``decode_error.txt`` inside the samples directory."""
    return sample_dir.joinpath("decode_error.txt")

View File

@ -1,37 +1,26 @@
from pathlib import Path from pathlib import Path
from django.test import TestCase
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_text.parsers import TextDocumentParser from paperless_text.parsers import TextDocumentParser
class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestTextParser:
SAMPLE_DIR = Path(__file__).resolve().parent / "samples" def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
def test_thumbnail(self):
parser = TextDocumentParser(None)
# just make sure that it does not crash # just make sure that it does not crash
f = parser.get_thumbnail( f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
self.SAMPLE_DIR / "test.txt", assert f.exists()
"text/plain", assert f.is_file()
)
self.assertIsFile(f)
def test_parse(self): def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
parser = TextDocumentParser(None) text_parser.parse(sample_txt_file, "text/plain")
parser.parse( assert text_parser.get_text() == "This is a test file.\n"
self.SAMPLE_DIR / "test.txt", assert text_parser.get_archive_path() is None
"text/plain",
)
self.assertEqual(parser.get_text(), "This is a test file.\n") def test_parse_invalid_bytes(
self.assertIsNone(parser.get_archive_path()) self,
text_parser: TextDocumentParser,
def test_parse_invalid_bytes(self): malformed_txt_file: Path,
):
""" """
GIVEN: GIVEN:
- Text file which contains invalid UTF bytes - Text file which contains invalid UTF bytes
@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Parsing continues - Parsing continues
- Invalid bytes are removed - Invalid bytes are removed
""" """
parser = TextDocumentParser(None)
parser.parse( text_parser.parse(malformed_txt_file, "text/plain")
self.SAMPLE_DIR / "decode_error.txt",
"text/plain",
)
self.assertEqual(parser.get_text(), "Pantothens<EFBFBD>ure\n") assert text_parser.get_text() == "Pantothens<EFBFBD>ure\n"
self.assertIsNone(parser.get_archive_path()) assert text_parser.get_archive_path() is None

View File

@ -0,0 +1,40 @@
from collections.abc import Generator
from pathlib import Path
import pytest
from paperless_tika.parsers import TikaDocumentParser
@pytest.fixture()
def tika_parser() -> Generator[TikaDocumentParser, None, None]:
    """
    Yield a fresh TikaDocumentParser for a single test.

    The parser's ``cleanup()`` is always called once the test finishes,
    whether it passed or failed.
    """
    # Construct the parser *before* entering ``try``: if the constructor
    # raised inside it, ``finally`` would reference an unbound ``parser`` and
    # replace the real error with an UnboundLocalError.
    parser = TikaDocumentParser(logging_group=None)
    try:
        yield parser
    finally:
        parser.cleanup()
@pytest.fixture(scope="session")
def sample_dir() -> Path:
    """Return the resolved path of the ``samples`` directory beside this file."""
    here = Path(__file__).parent
    samples = here / "samples"
    return samples.resolve()
@pytest.fixture(scope="session")
def sample_odt_file(sample_dir: Path) -> Path:
    """Return the path to ``sample.odt`` inside the samples directory."""
    return sample_dir.joinpath("sample.odt")
@pytest.fixture(scope="session")
def sample_docx_file(sample_dir: Path) -> Path:
    """Return the path to ``sample.docx`` inside the samples directory."""
    return sample_dir.joinpath("sample.docx")
@pytest.fixture(scope="session")
def sample_doc_file(sample_dir: Path) -> Path:
    """Return the path to ``sample.doc`` inside the samples directory."""
    return sample_dir.joinpath("sample.doc")
@pytest.fixture(scope="session")
def sample_broken_odt(sample_dir: Path) -> Path:
    """Return the path to ``multi-part-broken.odt`` inside the samples directory."""
    return sample_dir.joinpath("multi-part-broken.odt")

View File

@ -1,9 +1,7 @@
import os import os
from pathlib import Path from pathlib import Path
from typing import Final
import pytest import pytest
from django.test import TestCase
from documents.tests.utils import util_call_with_backoff from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser from paperless_tika.parsers import TikaDocumentParser
@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser
"PAPERLESS_CI_TEST" not in os.environ, "PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with", reason="No Gotenberg/Tika servers to test with",
) )
class TestTikaParserAgainstServer(TestCase): @pytest.mark.django_db()
class TestTikaParserAgainstServer:
""" """
This test case tests the Tika parsing against a live tika server, This test case tests the Tika parsing against a live tika server,
if the environment contains the correct value indicating such a server if the environment contains the correct value indicating such a server
is available. is available.
""" """
SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve() def test_basic_parse_odt(
self,
def setUp(self) -> None: tika_parser: TikaDocumentParser,
self.parser = TikaDocumentParser(logging_group=None) sample_odt_file: Path,
):
def tearDown(self) -> None:
self.parser.cleanup()
def test_basic_parse_odt(self):
""" """
GIVEN: GIVEN:
- An input ODT format document - An input ODT format document
@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / Path("sample.odt")
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"], [sample_odt_file, "application/vnd.oasis.opendocument.text"],
) )
self.assertEqual( assert (
self.parser.text, tika_parser.text
"This is an ODT test document, created September 14, 2022", == "This is an ODT test document, created September 14, 2022"
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# PDFs begin with the bytes PDF-x.y
self.assertTrue(b"PDF-" in f.read()[:10])
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(self): def test_basic_parse_docx(
self,
tika_parser: TikaDocumentParser,
sample_docx_file: Path,
):
""" """
GIVEN: GIVEN:
- An input DOCX format document - An input DOCX format document
@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / Path("sample.docx")
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[ [
test_file, sample_docx_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
], ],
) )
self.assertEqual( assert (
self.parser.text, tika_parser.text
"This is an DOCX test document, also made September 14, 2022", == "This is an DOCX test document, also made September 14, 2022"
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) # self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(self): def test_basic_parse_doc(
self,
tika_parser: TikaDocumentParser,
sample_doc_file: Path,
):
""" """
GIVEN: GIVEN:
- An input DOC format document - An input DOC format document
@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase):
- Document content is correct - Document content is correct
- Document date is correct - Document date is correct
""" """
test_file = self.SAMPLE_DIR / "sample.doc"
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/msword"], [sample_doc_file, "application/msword"],
) )
self.assertIn( assert (
"his is a test document, saved in the older .doc format", "This is a test document, saved in the older .doc format"
self.parser.text, in tika_parser.text
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(self): def test_tika_fails_multi_part(
self,
tika_parser: TikaDocumentParser,
sample_broken_odt: Path,
):
""" """
GIVEN: GIVEN:
- An input ODT format document - An input ODT format document
@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase):
See also: See also:
- https://issues.apache.org/jira/browse/TIKA-4110 - https://issues.apache.org/jira/browse/TIKA-4110
""" """
test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
util_call_with_backoff( util_call_with_backoff(
self.parser.parse, tika_parser.parse,
[test_file, "application/vnd.oasis.opendocument.text"], [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
) )
self.assertIsNotNone(self.parser.archive_path) assert tika_parser.archive_path is not None
with open(self.parser.archive_path, "rb") as f: with open(tika_parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10]) assert b"PDF-" in f.read()[:10]

View File

@ -1,30 +1,30 @@
import datetime import datetime
import os
import zoneinfo import zoneinfo
from http import HTTPStatus
from pathlib import Path from pathlib import Path
from django.test import TestCase import pytest
from django.test import override_settings
from httpx import codes from httpx import codes
from httpx._multipart import DataField from httpx._multipart import DataField
from rest_framework import status from pytest_django.fixtures import SettingsWrapper
from pytest_httpx import HTTPXMock
from documents.parsers import ParseError from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser from paperless_tika.parsers import TikaDocumentParser
from paperless_tika.tests.utils import HttpxMockMixin
class TestTikaParser(HttpxMockMixin, TestCase): @pytest.mark.django_db()
def setUp(self) -> None: class TestTikaParser:
self.parser = TikaDocumentParser(logging_group=None) def test_parse(
self,
def tearDown(self) -> None: httpx_mock: HTTPXMock,
self.parser.cleanup() settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
@override_settings(TIME_ZONE="America/Chicago") sample_odt_file: Path,
def test_parse(self): ):
settings.TIME_ZONE = "America/Chicago"
# Pretend parse response # Pretend parse response
self.httpx_mock.add_response( httpx_mock.add_response(
json={ json={
"Content-Type": "application/vnd.oasis.opendocument.text", "Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
}, },
) )
# Pretend convert to PDF response # Pretend convert to PDF response
self.httpx_mock.add_response(content=b"PDF document") httpx_mock.add_response(content=b"PDF document")
file = Path(os.path.join(self.parser.tempdir, "input.odt")) tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
file.touch()
self.parser.parse(file, "application/vnd.oasis.opendocument.text") assert tika_parser.text == "the content"
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert f.read() == b"PDF document"
self.assertEqual(self.parser.text, "the content") assert tika_parser.date == datetime.datetime(
self.assertIsNotNone(self.parser.archive_path) 2020,
with open(self.parser.archive_path, "rb") as f: 11,
self.assertEqual(f.read(), b"PDF document") 21,
tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
self.assertEqual(
self.parser.date,
datetime.datetime(
2020,
11,
21,
tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
),
) )
def test_metadata(self): def test_metadata(
self.httpx_mock.add_response( self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
httpx_mock.add_response(
json={ json={
"Content-Type": "application/vnd.oasis.opendocument.text", "Content-Type": "application/vnd.oasis.opendocument.text",
"X-TIKA:Parsed-By": [], "X-TIKA:Parsed-By": [],
@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
}, },
) )
file = Path(os.path.join(self.parser.tempdir, "input.odt")) metadata = tika_parser.extract_metadata(
file.touch() sample_odt_file,
metadata = self.parser.extract_metadata(
file,
"application/vnd.oasis.opendocument.text", "application/vnd.oasis.opendocument.text",
) )
self.assertTrue("dcterms:created" in [m["key"] for m in metadata]) assert "dcterms:created" in [m["key"] for m in metadata]
self.assertTrue("Some-key" in [m["key"] for m in metadata]) assert "Some-key" in [m["key"] for m in metadata]
def test_convert_failure(self): def test_convert_failure(
self,
httpx_mock: HTTPXMock,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
""" """
GIVEN: GIVEN:
- Document needs to be converted to PDF - Document needs to be converted to PDF
@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
- Parse error is raised - Parse error is raised
""" """
# Pretend convert to PDF response # Pretend convert to PDF response
self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
file = Path(os.path.join(self.parser.tempdir, "input.odt")) with pytest.raises(ParseError):
file.touch() tika_parser.convert_to_pdf(sample_odt_file, None)
with self.assertRaises(ParseError): @pytest.mark.parametrize(
self.parser.convert_to_pdf(file, None) ("setting_value", "expected_form_value"),
[
def test_request_pdf_a_format(self): ("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-1", "PDF/A-1a"),
("pdfa-3", "PDF/A-3b"),
],
)
def test_request_pdf_a_format(
self,
setting_value: str,
expected_form_value: str,
httpx_mock: HTTPXMock,
settings: SettingsWrapper,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
""" """
GIVEN: GIVEN:
- Document needs to be converted to PDF - Document needs to be converted to PDF
@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase):
THEN: THEN:
- Request to Gotenberg contains the expected PDF/A format string - Request to Gotenberg contains the expected PDF/A format string
""" """
file = Path(os.path.join(self.parser.tempdir, "input.odt")) settings.OCR_OUTPUT_TYPE = setting_value
file.touch() httpx_mock.add_response(
status_code=codes.OK,
content=b"PDF document",
method="POST",
)
for setting, expected_key in [ tika_parser.convert_to_pdf(sample_odt_file, None)
("pdfa", "PDF/A-2b"),
("pdfa-2", "PDF/A-2b"),
("pdfa-1", "PDF/A-2b"),
("pdfa-3", "PDF/A-3b"),
]:
with override_settings(OCR_OUTPUT_TYPE=setting):
self.httpx_mock.add_response(
status_code=codes.OK,
content=b"PDF document",
method="POST",
)
self.parser.convert_to_pdf(file, None) request = httpx_mock.get_request()
found = False
for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfa":
assert field.value == expected_form_value
found = True
assert found, "pdfFormat was not found"
request = self.httpx_mock.get_request() httpx_mock.reset(assert_all_responses_were_requested=False)
found = False
for field in request.stream.fields:
if isinstance(field, DataField) and field.name == "pdfa":
self.assertEqual(field.value, expected_key)
found = True
break
self.assertTrue(found)
self.httpx_mock.reset(assert_all_responses_were_requested=False)

View File

@ -2,9 +2,10 @@ import pytest
from pytest_httpx import HTTPXMock from pytest_httpx import HTTPXMock
# TODO: Remove this class once paperless_mail is updated as well
class HttpxMockMixin: class HttpxMockMixin:
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def httpx_mock_auto(self, httpx_mock: HTTPXMock): def _httpx_mock_auto(self, httpx_mock: HTTPXMock):
""" """
Workaround for allowing use of a fixture with unittest style testing Workaround for allowing use of a fixture with unittest style testing
""" """