From fb4d1f2b5317259e38258986419b68587e923a1b Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 26 Jun 2024 08:32:08 -0700 Subject: [PATCH] Initial conversion of the smaller tests into pytest format --- src/paperless_text/parsers.py | 6 +- src/paperless_text/tests/conftest.py | 30 ++++ src/paperless_text/tests/test_parser.py | 49 +++--- src/paperless_tika/tests/conftest.py | 40 +++++ src/paperless_tika/tests/test_live_tika.py | 107 +++++++------ src/paperless_tika/tests/test_tika_parser.py | 153 ++++++++++--------- src/paperless_tika/tests/utils.py | 3 +- 7 files changed, 223 insertions(+), 165 deletions(-) create mode 100644 src/paperless_text/tests/conftest.py create mode 100644 src/paperless_tika/tests/conftest.py diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py index b6481adc9..58df11d7a 100644 --- a/src/paperless_text/parsers.py +++ b/src/paperless_text/parsers.py @@ -1,4 +1,4 @@ -import os +from pathlib import Path from django.conf import settings from PIL import Image @@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser): logging_name = "paperless.parsing.text" - def get_thumbnail(self, document_path, mime_type, file_name=None): + def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path: text = self.read_file_handle_unicode_errors(document_path) img = Image.new("RGB", (500, 700), color="white") @@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser): ) draw.text((5, 5), text, font=font, fill="black") - out_path = os.path.join(self.tempdir, "thumb.webp") + out_path = self.tempdir / "thumb.webp" img.save(out_path, format="WEBP") return out_path diff --git a/src/paperless_text/tests/conftest.py b/src/paperless_text/tests/conftest.py new file mode 100644 index 000000000..1d9e4fc2f --- /dev/null +++ b/src/paperless_text/tests/conftest.py @@ -0,0 +1,30 @@ +from collections.abc import Generator +from pathlib import Path + +import pytest + +from 
paperless_text.parsers import TextDocumentParser + + +@pytest.fixture(scope="session") +def sample_dir() -> Path: + return (Path(__file__).parent / Path("samples")).resolve() + + +@pytest.fixture() +def text_parser() -> Generator[TextDocumentParser, None, None]: + try: + parser = TextDocumentParser(logging_group=None) + yield parser + finally: + parser.cleanup() + + +@pytest.fixture(scope="session") +def sample_txt_file(sample_dir: Path) -> Path: + return sample_dir / "test.txt" + + +@pytest.fixture(scope="session") +def malformed_txt_file(sample_dir: Path) -> Path: + return sample_dir / "decode_error.txt" diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py index cc5ce76fe..0f8cc19ba 100644 --- a/src/paperless_text/tests/test_parser.py +++ b/src/paperless_text/tests/test_parser.py @@ -1,37 +1,26 @@ from pathlib import Path -from django.test import TestCase - -from documents.tests.utils import DirectoriesMixin -from documents.tests.utils import FileSystemAssertsMixin from paperless_text.parsers import TextDocumentParser -class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - SAMPLE_DIR = Path(__file__).resolve().parent / "samples" - - def test_thumbnail(self): - parser = TextDocumentParser(None) - +class TestTextParser: + def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path): # just make sure that it does not crash - f = parser.get_thumbnail( - self.SAMPLE_DIR / "test.txt", - "text/plain", - ) - self.assertIsFile(f) + f = text_parser.get_thumbnail(sample_txt_file, "text/plain") + assert f.exists() + assert f.is_file() - def test_parse(self): - parser = TextDocumentParser(None) + def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path): + text_parser.parse(sample_txt_file, "text/plain") - parser.parse( - self.SAMPLE_DIR / "test.txt", - "text/plain", - ) + assert text_parser.get_text() == "This is a test file.\n" + assert text_parser.get_archive_path() is 
None - self.assertEqual(parser.get_text(), "This is a test file.\n") - self.assertIsNone(parser.get_archive_path()) - - def test_parse_invalid_bytes(self): + def test_parse_invalid_bytes( + self, + text_parser: TextDocumentParser, + malformed_txt_file: Path, + ): """ GIVEN: - Text file which contains invalid UTF bytes @@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - Parsing continues - Invalid bytes are removed """ - parser = TextDocumentParser(None) - parser.parse( - self.SAMPLE_DIR / "decode_error.txt", - "text/plain", - ) + text_parser.parse(malformed_txt_file, "text/plain") - self.assertEqual(parser.get_text(), "Pantothens�ure\n") - self.assertIsNone(parser.get_archive_path()) + assert text_parser.get_text() == "Pantothens�ure\n" + assert text_parser.get_archive_path() is None diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py new file mode 100644 index 000000000..657192e4e --- /dev/null +++ b/src/paperless_tika/tests/conftest.py @@ -0,0 +1,40 @@ +from collections.abc import Generator +from pathlib import Path + +import pytest + +from paperless_tika.parsers import TikaDocumentParser + + +@pytest.fixture() +def tika_parser() -> Generator[TikaDocumentParser, None, None]: + try: + parser = TikaDocumentParser(logging_group=None) + yield parser + finally: + parser.cleanup() + + +@pytest.fixture(scope="session") +def sample_dir() -> Path: + return (Path(__file__).parent / Path("samples")).resolve() + + +@pytest.fixture(scope="session") +def sample_odt_file(sample_dir: Path) -> Path: + return sample_dir / "sample.odt" + + +@pytest.fixture(scope="session") +def sample_docx_file(sample_dir: Path) -> Path: + return sample_dir / "sample.docx" + + +@pytest.fixture(scope="session") +def sample_doc_file(sample_dir: Path) -> Path: + return sample_dir / "sample.doc" + + +@pytest.fixture(scope="session") +def sample_broken_odt(sample_dir: Path) -> Path: + return sample_dir / 
"multi-part-broken.odt" diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py index 1c6225bdc..7d8cffffd 100644 --- a/src/paperless_tika/tests/test_live_tika.py +++ b/src/paperless_tika/tests/test_live_tika.py @@ -1,9 +1,7 @@ import os from pathlib import Path -from typing import Final import pytest -from django.test import TestCase from documents.tests.utils import util_call_with_backoff from paperless_tika.parsers import TikaDocumentParser @@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser "PAPERLESS_CI_TEST" not in os.environ, reason="No Gotenberg/Tika servers to test with", ) -class TestTikaParserAgainstServer(TestCase): +@pytest.mark.django_db() +class TestTikaParserAgainstServer: """ This test case tests the Tika parsing against a live tika server, if the environment contains the correct value indicating such a server is available. """ - SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve() - - def setUp(self) -> None: - self.parser = TikaDocumentParser(logging_group=None) - - def tearDown(self) -> None: - self.parser.cleanup() - - def test_basic_parse_odt(self): + def test_basic_parse_odt( + self, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - An input ODT format document @@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / Path("sample.odt") - util_call_with_backoff( - self.parser.parse, - [test_file, "application/vnd.oasis.opendocument.text"], + tika_parser.parse, + [sample_odt_file, "application/vnd.oasis.opendocument.text"], ) - self.assertEqual( - self.parser.text, - "This is an ODT test document, created September 14, 2022", + assert ( + tika_parser.text + == "This is an ODT test document, created September 14, 2022" ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - # PDFs 
begin with the bytes PDF-x.y - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10] # TODO: Unsure what can set the Creation-Date field in a document, enable when possible - # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + # assert tika_parser.date == datetime.datetime(2022, 9, 14) - def test_basic_parse_docx(self): + def test_basic_parse_docx( + self, + tika_parser: TikaDocumentParser, + sample_docx_file: Path, + ): """ GIVEN: - An input DOCX format document @@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / Path("sample.docx") - util_call_with_backoff( - self.parser.parse, + tika_parser.parse, [ - test_file, + sample_docx_file, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", ], ) - self.assertEqual( - self.parser.text, - "This is an DOCX test document, also made September 14, 2022", + assert ( + tika_parser.text + == "This is an DOCX test document, also made September 14, 2022" ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] - # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14)) + # assert tika_parser.date == datetime.datetime(2022, 9, 14) - def test_basic_parse_doc(self): + def test_basic_parse_doc( + self, + tika_parser: TikaDocumentParser, + sample_doc_file: Path, + ): """ GIVEN: - An input DOC format document @@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase): - Document content is correct - Document date is correct """ - test_file = self.SAMPLE_DIR / "sample.doc" - util_call_with_backoff( - self.parser.parse, - [test_file, 
"application/msword"], + tika_parser.parse, + [sample_doc_file, "application/msword"], ) - self.assertIn( - "his is a test document, saved in the older .doc format", - self.parser.text, + assert ( + "This is a test document, saved in the older .doc format" + in tika_parser.text ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] - def test_tika_fails_multi_part(self): + def test_tika_fails_multi_part( + self, + tika_parser: TikaDocumentParser, + sample_broken_odt: Path, + ): """ GIVEN: - An input ODT format document @@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase): See also: - https://issues.apache.org/jira/browse/TIKA-4110 """ - test_file = self.SAMPLE_DIR / "multi-part-broken.odt" - util_call_with_backoff( - self.parser.parse, - [test_file, "application/vnd.oasis.opendocument.text"], + tika_parser.parse, + [sample_broken_odt, "application/vnd.oasis.opendocument.text"], ) - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertTrue(b"PDF-" in f.read()[:10]) + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert b"PDF-" in f.read()[:10] diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py index ee010eb49..f48ef3624 100644 --- a/src/paperless_tika/tests/test_tika_parser.py +++ b/src/paperless_tika/tests/test_tika_parser.py @@ -1,30 +1,30 @@ import datetime -import os import zoneinfo +from http import HTTPStatus from pathlib import Path -from django.test import TestCase -from django.test import override_settings +import pytest from httpx import codes from httpx._multipart import DataField -from rest_framework import status +from pytest_django.fixtures import SettingsWrapper 
+from pytest_httpx import HTTPXMock from documents.parsers import ParseError from paperless_tika.parsers import TikaDocumentParser -from paperless_tika.tests.utils import HttpxMockMixin -class TestTikaParser(HttpxMockMixin, TestCase): - def setUp(self) -> None: - self.parser = TikaDocumentParser(logging_group=None) - - def tearDown(self) -> None: - self.parser.cleanup() - - @override_settings(TIME_ZONE="America/Chicago") - def test_parse(self): +@pytest.mark.django_db() +class TestTikaParser: + def test_parse( + self, + httpx_mock: HTTPXMock, + settings: SettingsWrapper, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): + settings.TIME_ZONE = "America/Chicago" # Pretend parse response - self.httpx_mock.add_response( + httpx_mock.add_response( json={ "Content-Type": "application/vnd.oasis.opendocument.text", "X-TIKA:Parsed-By": [], @@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase): }, ) # Pretend convert to PDF response - self.httpx_mock.add_response(content=b"PDF document") + httpx_mock.add_response(content=b"PDF document") - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text") - self.parser.parse(file, "application/vnd.oasis.opendocument.text") + assert tika_parser.text == "the content" + assert tika_parser.archive_path is not None + with open(tika_parser.archive_path, "rb") as f: + assert f.read() == b"PDF document" - self.assertEqual(self.parser.text, "the content") - self.assertIsNotNone(self.parser.archive_path) - with open(self.parser.archive_path, "rb") as f: - self.assertEqual(f.read(), b"PDF document") - - self.assertEqual( - self.parser.date, - datetime.datetime( - 2020, - 11, - 21, - tzinfo=zoneinfo.ZoneInfo("America/Chicago"), - ), + assert tika_parser.date == datetime.datetime( + 2020, + 11, + 21, + tzinfo=zoneinfo.ZoneInfo("America/Chicago"), ) - def test_metadata(self): - self.httpx_mock.add_response( + def 
test_metadata( + self, + httpx_mock: HTTPXMock, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): + httpx_mock.add_response( json={ "Content-Type": "application/vnd.oasis.opendocument.text", "X-TIKA:Parsed-By": [], @@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase): }, ) - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() - - metadata = self.parser.extract_metadata( - file, + metadata = tika_parser.extract_metadata( + sample_odt_file, "application/vnd.oasis.opendocument.text", ) - self.assertTrue("dcterms:created" in [m["key"] for m in metadata]) - self.assertTrue("Some-key" in [m["key"] for m in metadata]) + assert "dcterms:created" in [m["key"] for m in metadata] + assert "Some-key" in [m["key"] for m in metadata] - def test_convert_failure(self): + def test_convert_failure( + self, + httpx_mock: HTTPXMock, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - Document needs to be converted to PDF @@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase): - Parse error is raised """ # Pretend convert to PDF response - self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR) + httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR) - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + with pytest.raises(ParseError): + tika_parser.convert_to_pdf(sample_odt_file, None) - with self.assertRaises(ParseError): - self.parser.convert_to_pdf(file, None) - - def test_request_pdf_a_format(self): + @pytest.mark.parametrize( + ("setting_value", "expected_form_value"), + [ + ("pdfa", "PDF/A-2b"), + ("pdfa-2", "PDF/A-2b"), + ("pdfa-1", "PDF/A-1a"), + ("pdfa-3", "PDF/A-3b"), + ], + ) + def test_request_pdf_a_format( + self, + setting_value: str, + expected_form_value: str, + httpx_mock: HTTPXMock, + settings: SettingsWrapper, + tika_parser: TikaDocumentParser, + sample_odt_file: Path, + ): """ GIVEN: - Document needs to be 
converted to PDF @@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase): THEN: - Request to Gotenberg contains the expected PDF/A format string """ - file = Path(os.path.join(self.parser.tempdir, "input.odt")) - file.touch() + settings.OCR_OUTPUT_TYPE = setting_value + httpx_mock.add_response( + status_code=codes.OK, + content=b"PDF document", + method="POST", + ) - for setting, expected_key in [ - ("pdfa", "PDF/A-2b"), - ("pdfa-2", "PDF/A-2b"), - ("pdfa-1", "PDF/A-2b"), - ("pdfa-3", "PDF/A-3b"), - ]: - with override_settings(OCR_OUTPUT_TYPE=setting): - self.httpx_mock.add_response( - status_code=codes.OK, - content=b"PDF document", - method="POST", - ) + tika_parser.convert_to_pdf(sample_odt_file, None) - self.parser.convert_to_pdf(file, None) + request = httpx_mock.get_request() + found = False + for field in request.stream.fields: + if isinstance(field, DataField) and field.name == "pdfa": + assert field.value == expected_form_value + found = True + assert found, "pdfa form field was not found" - request = self.httpx_mock.get_request() - found = False - for field in request.stream.fields: - if isinstance(field, DataField) and field.name == "pdfa": - self.assertEqual(field.value, expected_key) - found = True - break - self.assertTrue(found) - - self.httpx_mock.reset(assert_all_responses_were_requested=False) + httpx_mock.reset(assert_all_responses_were_requested=False) diff --git a/src/paperless_tika/tests/utils.py b/src/paperless_tika/tests/utils.py index b26f79ec6..8eb59eef4 100644 --- a/src/paperless_tika/tests/utils.py +++ b/src/paperless_tika/tests/utils.py @@ -2,9 +2,10 @@ import pytest from pytest_httpx import HTTPXMock +# TODO: Remove this class once paperless_mail is updated as well class HttpxMockMixin: @pytest.fixture(autouse=True) - def httpx_mock_auto(self, httpx_mock: HTTPXMock): + def _httpx_mock_auto(self, httpx_mock: HTTPXMock): """ Workaround for allowing use of a fixture with unittest style testing """