paperless-ngx/src/documents/tests/test_management_fuzzy.py

160 lines
4.9 KiB
Python

from io import StringIO
from django.core.management import CommandError
from django.core.management import call_command
from django.test import TestCase
from documents.models import Document
class TestFuzzyMatchCommand(TestCase):
MSG_REGEX = r"Document \d fuzzy match to \d \(confidence \d\d\.\d\d\d\)"
def call_command(self, *args, **kwargs):
stdout = StringIO()
stderr = StringIO()
call_command(
"document_fuzzy_match",
"--no-progress-bar",
*args,
stdout=stdout,
stderr=stderr,
**kwargs,
)
return stdout.getvalue(), stderr.getvalue()
def test_invalid_ratio_lower_limit(self):
"""
GIVEN:
- Invalid ratio below lower limit
WHEN:
- Command is called
THEN:
- Error is raised indicating issue
"""
with self.assertRaises(CommandError) as e:
self.call_command("--ratio", "-1")
self.assertIn("The ratio must be between 0 and 100", str(e))
def test_invalid_ratio_upper_limit(self):
"""
GIVEN:s
- Invalid ratio above upper
WHEN:
- Command is called
THEN:
- Error is raised indicating issue
"""
with self.assertRaises(CommandError) as e:
self.call_command("--ratio", "101")
self.assertIn("The ratio must be between 0 and 100", str(e))
def test_invalid_process_count(self):
"""
GIVEN:
- Invalid process count less than 0 above upper
WHEN:
- Command is called
THEN:
- Error is raised indicating issue
"""
with self.assertRaises(CommandError) as e:
self.call_command("--processes", "0")
self.assertIn("There must be at least 1 process", str(e))
def test_no_matches(self):
"""
GIVEN:
- 2 documents exist
- Similarity between content is 82.32
WHEN:
- Command is called
THEN:
- No matches are found
"""
Document.objects.create(
checksum="BEEFCAFE",
title="A",
content="first document",
mime_type="application/pdf",
filename="test.pdf",
)
Document.objects.create(
checksum="DEADBEAF",
title="A",
content="other first document",
mime_type="application/pdf",
filename="other_test.pdf",
)
stdout, _ = self.call_command()
self.assertEqual(stdout, "No matches found\n")
def test_with_matches(self):
"""
GIVEN:
- 2 documents exist
- Similarity between content is 86.667
WHEN:
- Command is called
THEN:
- 1 match is returned from doc 1 to doc 2
- No match from doc 2 to doc 1 reported
"""
# Content similarity is 86.667
Document.objects.create(
checksum="BEEFCAFE",
title="A",
content="first document scanned by bob",
mime_type="application/pdf",
filename="test.pdf",
)
Document.objects.create(
checksum="DEADBEAF",
title="A",
content="first document scanned by alice",
mime_type="application/pdf",
filename="other_test.pdf",
)
stdout, _ = self.call_command("--processes", "1")
self.assertRegex(stdout, self.MSG_REGEX + "\n")
def test_with_3_matches(self):
"""
GIVEN:
- 3 documents exist
- All documents have similarity over 85.0
WHEN:
- Command is called
THEN:
- 3 matches is returned from each document to the others
- No duplication of matches returned
"""
# Content similarity is 86.667
Document.objects.create(
checksum="BEEFCAFE",
title="A",
content="first document scanned by bob",
mime_type="application/pdf",
filename="test.pdf",
)
Document.objects.create(
checksum="DEADBEAF",
title="A",
content="first document scanned by alice",
mime_type="application/pdf",
filename="other_test.pdf",
)
Document.objects.create(
checksum="CATTLE",
title="A",
content="first document scanned by pete",
mime_type="application/pdf",
filename="final_test.pdf",
)
stdout, _ = self.call_command()
lines = [x.strip() for x in stdout.split("\n") if len(x.strip())]
self.assertEqual(len(lines), 3)
self.assertRegex(lines[0], self.MSG_REGEX)
self.assertRegex(lines[1], self.MSG_REGEX)
self.assertRegex(lines[2], self.MSG_REGEX)