135 lines
3.9 KiB
Python

import os
from pathlib import Path
import pytest
from documents.tests.utils import util_call_with_backoff
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif(
"PAPERLESS_CI_TEST" not in os.environ,
reason="No Gotenberg/Tika servers to test with",
)
@pytest.mark.django_db()
class TestTikaParserAgainstServer:
"""
This test case tests the Tika parsing against a live tika server,
if the environment contains the correct value indicating such a server
is available.
"""
def test_basic_parse_odt(
self,
tika_parser: TikaDocumentParser,
sample_odt_file: Path,
):
"""
GIVEN:
- An input ODT format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
util_call_with_backoff(
tika_parser.parse,
[sample_odt_file, "application/vnd.oasis.opendocument.text"],
)
assert (
tika_parser.text
== "This is an ODT test document, created September 14, 2022"
)
assert tika_parser.archive_path is not None
assert b"PDF-" in tika_parser.archive_path.read_bytes()[:10]
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(
self,
tika_parser: TikaDocumentParser,
sample_docx_file: Path,
):
"""
GIVEN:
- An input DOCX format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
util_call_with_backoff(
tika_parser.parse,
[
sample_docx_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
],
)
assert (
tika_parser.text
== "This is an DOCX test document, also made September 14, 2022"
)
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]
# self.assertEqual(tika_parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_doc(
self,
tika_parser: TikaDocumentParser,
sample_doc_file: Path,
):
"""
GIVEN:
- An input DOC format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
util_call_with_backoff(
tika_parser.parse,
[sample_doc_file, "application/msword"],
)
assert (
"This is a test document, saved in the older .doc format"
in tika_parser.text
)
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]
def test_tika_fails_multi_part(
self,
tika_parser: TikaDocumentParser,
sample_broken_odt: Path,
):
"""
GIVEN:
- An input ODT format document
- The document is known to crash Tika when uploaded via multi-part form data
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
See also:
- https://issues.apache.org/jira/browse/TIKA-4110
"""
util_call_with_backoff(
tika_parser.parse,
[sample_broken_odt, "application/vnd.oasis.opendocument.text"],
)
assert tika_parser.archive_path is not None
with open(tika_parser.archive_path, "rb") as f:
assert b"PDF-" in f.read()[:10]