From fb4d1f2b5317259e38258986419b68587e923a1b Mon Sep 17 00:00:00 2001
From: Trenton H <797416+stumpylog@users.noreply.github.com>
Date: Wed, 26 Jun 2024 08:32:08 -0700
Subject: [PATCH] Initial conversion of the smaller tests into pytest format

---
 src/paperless_text/parsers.py                |   6 +-
 src/paperless_text/tests/conftest.py         |  30 ++++
 src/paperless_text/tests/test_parser.py      |  49 +++---
 src/paperless_tika/tests/conftest.py         |  40 +++++
 src/paperless_tika/tests/test_live_tika.py   | 107 +++++++------
 src/paperless_tika/tests/test_tika_parser.py | 153 ++++++++++---------
 src/paperless_tika/tests/utils.py            |   3 +-
 7 files changed, 223 insertions(+), 165 deletions(-)
 create mode 100644 src/paperless_text/tests/conftest.py
 create mode 100644 src/paperless_tika/tests/conftest.py

diff --git a/src/paperless_text/parsers.py b/src/paperless_text/parsers.py
index b6481adc9..58df11d7a 100644
--- a/src/paperless_text/parsers.py
+++ b/src/paperless_text/parsers.py
@@ -1,4 +1,4 @@
-import os
+from pathlib import Path
 
 from django.conf import settings
 from PIL import Image
@@ -15,7 +15,7 @@ class TextDocumentParser(DocumentParser):
 
     logging_name = "paperless.parsing.text"
 
-    def get_thumbnail(self, document_path, mime_type, file_name=None):
+    def get_thumbnail(self, document_path: Path, mime_type, file_name=None) -> Path:
         text = self.read_file_handle_unicode_errors(document_path)
 
         img = Image.new("RGB", (500, 700), color="white")
@@ -27,7 +27,7 @@ class TextDocumentParser(DocumentParser):
         )
         draw.text((5, 5), text, font=font, fill="black")
 
-        out_path = os.path.join(self.tempdir, "thumb.webp")
+        out_path = Path(self.tempdir) / "thumb.webp"
         img.save(out_path, format="WEBP")
 
         return out_path
diff --git a/src/paperless_text/tests/conftest.py b/src/paperless_text/tests/conftest.py
new file mode 100644
index 000000000..1d9e4fc2f
--- /dev/null
+++ b/src/paperless_text/tests/conftest.py
@@ -0,0 +1,30 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_text.parsers import TextDocumentParser
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture()
+def text_parser() -> Generator[TextDocumentParser, None, None]:
+    parser = TextDocumentParser(logging_group=None)
+    try:  # parser is built above so a failed constructor never reaches cleanup()
+        yield parser
+    finally:
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "test.txt"
+
+
+@pytest.fixture(scope="session")
+def malformed_txt_file(sample_dir: Path) -> Path:
+    return sample_dir / "decode_error.txt"
diff --git a/src/paperless_text/tests/test_parser.py b/src/paperless_text/tests/test_parser.py
index cc5ce76fe..0f8cc19ba 100644
--- a/src/paperless_text/tests/test_parser.py
+++ b/src/paperless_text/tests/test_parser.py
@@ -1,37 +1,26 @@
 from pathlib import Path
 
-from django.test import TestCase
-
-from documents.tests.utils import DirectoriesMixin
-from documents.tests.utils import FileSystemAssertsMixin
 from paperless_text.parsers import TextDocumentParser
 
 
-class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
-    SAMPLE_DIR = Path(__file__).resolve().parent / "samples"
-
-    def test_thumbnail(self):
-        parser = TextDocumentParser(None)
-
+class TestTextParser:
+    def test_thumbnail(self, text_parser: TextDocumentParser, sample_txt_file: Path):
         # just make sure that it does not crash
-        f = parser.get_thumbnail(
-            self.SAMPLE_DIR / "test.txt",
-            "text/plain",
-        )
-        self.assertIsFile(f)
+        f = text_parser.get_thumbnail(sample_txt_file, "text/plain")
+        assert f.exists()
+        assert f.is_file()
 
-    def test_parse(self):
-        parser = TextDocumentParser(None)
+    def test_parse(self, text_parser: TextDocumentParser, sample_txt_file: Path):
+        text_parser.parse(sample_txt_file, "text/plain")
 
-        parser.parse(
-            self.SAMPLE_DIR / "test.txt",
-            "text/plain",
-        )
+        assert text_parser.get_text() == "This is a test file.\n"
+        assert text_parser.get_archive_path() is None
 
-        self.assertEqual(parser.get_text(), "This is a test file.\n")
-        self.assertIsNone(parser.get_archive_path())
-
-    def test_parse_invalid_bytes(self):
+    def test_parse_invalid_bytes(
+        self,
+        text_parser: TextDocumentParser,
+        malformed_txt_file: Path,
+    ):
         """
         GIVEN:
             - Text file which contains invalid UTF bytes
@@ -41,12 +30,8 @@ class TestTextParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
             - Parsing continues
             - Invalid bytes are removed
         """
-        parser = TextDocumentParser(None)
 
-        parser.parse(
-            self.SAMPLE_DIR / "decode_error.txt",
-            "text/plain",
-        )
+        text_parser.parse(malformed_txt_file, "text/plain")
 
-        self.assertEqual(parser.get_text(), "Pantothens�ure\n")
-        self.assertIsNone(parser.get_archive_path())
+        assert text_parser.get_text() == "Pantothens�ure\n"
+        assert text_parser.get_archive_path() is None
diff --git a/src/paperless_tika/tests/conftest.py b/src/paperless_tika/tests/conftest.py
new file mode 100644
index 000000000..657192e4e
--- /dev/null
+++ b/src/paperless_tika/tests/conftest.py
@@ -0,0 +1,40 @@
+from collections.abc import Generator
+from pathlib import Path
+
+import pytest
+
+from paperless_tika.parsers import TikaDocumentParser
+
+
+@pytest.fixture()
+def tika_parser() -> Generator[TikaDocumentParser, None, None]:
+    parser = TikaDocumentParser(logging_group=None)
+    try:  # parser is built above so a failed constructor never reaches cleanup()
+        yield parser
+    finally:
+        parser.cleanup()
+
+
+@pytest.fixture(scope="session")
+def sample_dir() -> Path:
+    return (Path(__file__).parent / Path("samples")).resolve()
+
+
+@pytest.fixture(scope="session")
+def sample_odt_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.odt"
+
+
+@pytest.fixture(scope="session")
+def sample_docx_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.docx"
+
+
+@pytest.fixture(scope="session")
+def sample_doc_file(sample_dir: Path) -> Path:
+    return sample_dir / "sample.doc"
+
+
+@pytest.fixture(scope="session")
+def sample_broken_odt(sample_dir: Path) -> Path:
+    return sample_dir / "multi-part-broken.odt"
diff --git a/src/paperless_tika/tests/test_live_tika.py b/src/paperless_tika/tests/test_live_tika.py
index 1c6225bdc..7d8cffffd 100644
--- a/src/paperless_tika/tests/test_live_tika.py
+++ b/src/paperless_tika/tests/test_live_tika.py
@@ -1,9 +1,7 @@
 import os
 from pathlib import Path
-from typing import Final
 
 import pytest
-from django.test import TestCase
 
 from documents.tests.utils import util_call_with_backoff
 from paperless_tika.parsers import TikaDocumentParser
@@ -13,22 +11,19 @@ from paperless_tika.parsers import TikaDocumentParser
     "PAPERLESS_CI_TEST" not in os.environ,
     reason="No Gotenberg/Tika servers to test with",
 )
-class TestTikaParserAgainstServer(TestCase):
+@pytest.mark.django_db()
+class TestTikaParserAgainstServer:
     """
     This test case tests the Tika parsing against a live tika server,
     if the environment contains the correct value indicating such a server
     is available.
     """
 
-    SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
-
-    def setUp(self) -> None:
-        self.parser = TikaDocumentParser(logging_group=None)
-
-    def tearDown(self) -> None:
-        self.parser.cleanup()
-
-    def test_basic_parse_odt(self):
+    def test_basic_parse_odt(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - An input ODT format document
@@ -38,26 +33,26 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / Path("sample.odt")
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/vnd.oasis.opendocument.text"],
+            tika_parser.parse,
+            [sample_odt_file, "application/vnd.oasis.opendocument.text"],
         )
 
-        self.assertEqual(
-            self.parser.text,
-            "This is an ODT test document, created September 14, 2022",
+        assert (
+            tika_parser.text
+            == "This is an ODT test document, created September 14, 2022"
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            # PDFs begin with the bytes PDF-x.y
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
 
         # TODO: Unsure what can set the Creation-Date field in a document, enable when possible
-        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.date == datetime.datetime(2022, 9, 14)
 
-    def test_basic_parse_docx(self):
+    def test_basic_parse_docx(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_docx_file: Path,
+    ):
         """
         GIVEN:
             - An input DOCX format document
@@ -67,27 +62,29 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / Path("sample.docx")
-
         util_call_with_backoff(
-            self.parser.parse,
+            tika_parser.parse,
             [
-                test_file,
+                sample_docx_file,
                 "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
             ],
         )
 
-        self.assertEqual(
-            self.parser.text,
-            "This is an DOCX test document, also made September 14, 2022",
+        assert (
+            tika_parser.text
+            == "This is an DOCX test document, also made September 14, 2022"
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
 
-        # self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
+        # assert tika_parser.date == datetime.datetime(2022, 9, 14)
 
-    def test_basic_parse_doc(self):
+    def test_basic_parse_doc(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_doc_file: Path,
+    ):
         """
         GIVEN:
             - An input DOC format document
@@ -97,22 +94,24 @@ class TestTikaParserAgainstServer(TestCase):
             - Document content is correct
             - Document date is correct
         """
-        test_file = self.SAMPLE_DIR / "sample.doc"
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/msword"],
+            tika_parser.parse,
+            [sample_doc_file, "application/msword"],
         )
 
-        self.assertIn(
-            "his is a test document, saved in the older .doc format",
-            self.parser.text,
+        assert (
+            "This is a test document, saved in the older .doc format"
+            in tika_parser.text
         )
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
 
-    def test_tika_fails_multi_part(self):
+    def test_tika_fails_multi_part(
+        self,
+        tika_parser: TikaDocumentParser,
+        sample_broken_odt: Path,
+    ):
         """
         GIVEN:
             - An input ODT format document
@@ -125,13 +124,11 @@ class TestTikaParserAgainstServer(TestCase):
         See also:
             - https://issues.apache.org/jira/browse/TIKA-4110
         """
-        test_file = self.SAMPLE_DIR / "multi-part-broken.odt"
-
         util_call_with_backoff(
-            self.parser.parse,
-            [test_file, "application/vnd.oasis.opendocument.text"],
+            tika_parser.parse,
+            [sample_broken_odt, "application/vnd.oasis.opendocument.text"],
         )
 
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertTrue(b"PDF-" in f.read()[:10])
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert b"PDF-" in f.read()[:10]
diff --git a/src/paperless_tika/tests/test_tika_parser.py b/src/paperless_tika/tests/test_tika_parser.py
index ee010eb49..f48ef3624 100644
--- a/src/paperless_tika/tests/test_tika_parser.py
+++ b/src/paperless_tika/tests/test_tika_parser.py
@@ -1,30 +1,30 @@
 import datetime
-import os
 import zoneinfo
+from http import HTTPStatus
 from pathlib import Path
 
-from django.test import TestCase
-from django.test import override_settings
+import pytest
 from httpx import codes
 from httpx._multipart import DataField
-from rest_framework import status
+from pytest_django.fixtures import SettingsWrapper
+from pytest_httpx import HTTPXMock
 
 from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser
-from paperless_tika.tests.utils import HttpxMockMixin
 
 
-class TestTikaParser(HttpxMockMixin, TestCase):
-    def setUp(self) -> None:
-        self.parser = TikaDocumentParser(logging_group=None)
-
-    def tearDown(self) -> None:
-        self.parser.cleanup()
-
-    @override_settings(TIME_ZONE="America/Chicago")
-    def test_parse(self):
+@pytest.mark.django_db()
+class TestTikaParser:
+    def test_parse(
+        self,
+        httpx_mock: HTTPXMock,
+        settings: SettingsWrapper,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
+        settings.TIME_ZONE = "America/Chicago"
         # Pretend parse response
-        self.httpx_mock.add_response(
+        httpx_mock.add_response(
             json={
                 "Content-Type": "application/vnd.oasis.opendocument.text",
                 "X-TIKA:Parsed-By": [],
@@ -33,30 +33,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             },
         )
         # Pretend convert to PDF response
-        self.httpx_mock.add_response(content=b"PDF document")
+        httpx_mock.add_response(content=b"PDF document")
 
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
+        tika_parser.parse(sample_odt_file, "application/vnd.oasis.opendocument.text")
 
-        self.parser.parse(file, "application/vnd.oasis.opendocument.text")
+        assert tika_parser.text == "the content"
+        assert tika_parser.archive_path is not None
+        with open(tika_parser.archive_path, "rb") as f:
+            assert f.read() == b"PDF document"
 
-        self.assertEqual(self.parser.text, "the content")
-        self.assertIsNotNone(self.parser.archive_path)
-        with open(self.parser.archive_path, "rb") as f:
-            self.assertEqual(f.read(), b"PDF document")
-
-        self.assertEqual(
-            self.parser.date,
-            datetime.datetime(
-                2020,
-                11,
-                21,
-                tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
-            ),
+        assert tika_parser.date == datetime.datetime(
+            2020,
+            11,
+            21,
+            tzinfo=zoneinfo.ZoneInfo("America/Chicago"),
         )
 
-    def test_metadata(self):
-        self.httpx_mock.add_response(
+    def test_metadata(
+        self,
+        httpx_mock: HTTPXMock,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
+        httpx_mock.add_response(
             json={
                 "Content-Type": "application/vnd.oasis.opendocument.text",
                 "X-TIKA:Parsed-By": [],
@@ -65,18 +64,20 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             },
         )
 
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
-
-        metadata = self.parser.extract_metadata(
-            file,
+        metadata = tika_parser.extract_metadata(
+            sample_odt_file,
             "application/vnd.oasis.opendocument.text",
         )
 
-        self.assertTrue("dcterms:created" in [m["key"] for m in metadata])
-        self.assertTrue("Some-key" in [m["key"] for m in metadata])
+        assert "dcterms:created" in [m["key"] for m in metadata]
+        assert "Some-key" in [m["key"] for m in metadata]
 
-    def test_convert_failure(self):
+    def test_convert_failure(
+        self,
+        httpx_mock: HTTPXMock,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - Document needs to be converted to PDF
@@ -86,15 +87,29 @@ class TestTikaParser(HttpxMockMixin, TestCase):
             - Parse error is raised
         """
         # Pretend convert to PDF response
-        self.httpx_mock.add_response(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR)
+        httpx_mock.add_response(status_code=HTTPStatus.INTERNAL_SERVER_ERROR)
 
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
+        with pytest.raises(ParseError):
+            tika_parser.convert_to_pdf(sample_odt_file, None)
 
-        with self.assertRaises(ParseError):
-            self.parser.convert_to_pdf(file, None)
-
-    def test_request_pdf_a_format(self):
+    @pytest.mark.parametrize(
+        ("setting_value", "expected_form_value"),
+        [
+            ("pdfa", "PDF/A-2b"),
+            ("pdfa-2", "PDF/A-2b"),
+            ("pdfa-1", "PDF/A-2b"),
+            ("pdfa-3", "PDF/A-3b"),
+        ],
+    )
+    def test_request_pdf_a_format(
+        self,
+        setting_value: str,
+        expected_form_value: str,
+        httpx_mock: HTTPXMock,
+        settings: SettingsWrapper,
+        tika_parser: TikaDocumentParser,
+        sample_odt_file: Path,
+    ):
         """
         GIVEN:
             - Document needs to be converted to PDF
@@ -103,31 +118,21 @@ class TestTikaParser(HttpxMockMixin, TestCase):
         THEN:
             - Request to Gotenberg contains the expected PDF/A format string
         """
-        file = Path(os.path.join(self.parser.tempdir, "input.odt"))
-        file.touch()
+        settings.OCR_OUTPUT_TYPE = setting_value
+        httpx_mock.add_response(
+            status_code=codes.OK,
+            content=b"PDF document",
+            method="POST",
+        )
 
-        for setting, expected_key in [
-            ("pdfa", "PDF/A-2b"),
-            ("pdfa-2", "PDF/A-2b"),
-            ("pdfa-1", "PDF/A-2b"),
-            ("pdfa-3", "PDF/A-3b"),
-        ]:
-            with override_settings(OCR_OUTPUT_TYPE=setting):
-                self.httpx_mock.add_response(
-                    status_code=codes.OK,
-                    content=b"PDF document",
-                    method="POST",
-                )
+        tika_parser.convert_to_pdf(sample_odt_file, None)
 
-                self.parser.convert_to_pdf(file, None)
+        request = httpx_mock.get_request()
+        found = False
+        for field in request.stream.fields:
+            if isinstance(field, DataField) and field.name == "pdfa":
+                assert field.value == expected_form_value
+                found = True
+        assert found, "pdfa form field was not found in the request"
 
-                request = self.httpx_mock.get_request()
-                found = False
-                for field in request.stream.fields:
-                    if isinstance(field, DataField) and field.name == "pdfa":
-                        self.assertEqual(field.value, expected_key)
-                        found = True
-                        break
-                self.assertTrue(found)
-
-                self.httpx_mock.reset(assert_all_responses_were_requested=False)
+        httpx_mock.reset(assert_all_responses_were_requested=False)
diff --git a/src/paperless_tika/tests/utils.py b/src/paperless_tika/tests/utils.py
index b26f79ec6..8eb59eef4 100644
--- a/src/paperless_tika/tests/utils.py
+++ b/src/paperless_tika/tests/utils.py
@@ -2,9 +2,10 @@ import pytest
 from pytest_httpx import HTTPXMock
 
 
+# TODO: Remove this class once paperless_mail is updated as well
 class HttpxMockMixin:
     @pytest.fixture(autouse=True)
-    def httpx_mock_auto(self, httpx_mock: HTTPXMock):
+    def _httpx_mock_auto(self, httpx_mock: HTTPXMock):
         """
         Workaround for allowing use of a fixture with unittest style testing
         """