Refactor: Use django-filter logic for filtering full text search queries
This commit is contained in:
parent
39998cb34f
commit
629dffbd23
@ -8,8 +8,8 @@ from datetime import timezone
|
|||||||
from shutil import rmtree
|
from shutil import rmtree
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from dateutil.parser import isoparse
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
|
from django.db.models import QuerySet
|
||||||
from django.utils import timezone as django_timezone
|
from django.utils import timezone as django_timezone
|
||||||
from guardian.shortcuts import get_users_with_perms
|
from guardian.shortcuts import get_users_with_perms
|
||||||
from whoosh import classify
|
from whoosh import classify
|
||||||
@ -22,6 +22,8 @@ from whoosh.fields import NUMERIC
|
|||||||
from whoosh.fields import TEXT
|
from whoosh.fields import TEXT
|
||||||
from whoosh.fields import Schema
|
from whoosh.fields import Schema
|
||||||
from whoosh.highlight import HtmlFormatter
|
from whoosh.highlight import HtmlFormatter
|
||||||
|
from whoosh.idsets import BitSet
|
||||||
|
from whoosh.idsets import DocIdSet
|
||||||
from whoosh.index import FileIndex
|
from whoosh.index import FileIndex
|
||||||
from whoosh.index import create_in
|
from whoosh.index import create_in
|
||||||
from whoosh.index import exists_in
|
from whoosh.index import exists_in
|
||||||
@ -31,6 +33,7 @@ from whoosh.qparser import QueryParser
|
|||||||
from whoosh.qparser.dateparse import DateParserPlugin
|
from whoosh.qparser.dateparse import DateParserPlugin
|
||||||
from whoosh.qparser.dateparse import English
|
from whoosh.qparser.dateparse import English
|
||||||
from whoosh.qparser.plugins import FieldsPlugin
|
from whoosh.qparser.plugins import FieldsPlugin
|
||||||
|
from whoosh.reading import IndexReader
|
||||||
from whoosh.scoring import TF_IDF
|
from whoosh.scoring import TF_IDF
|
||||||
from whoosh.searching import ResultsPage
|
from whoosh.searching import ResultsPage
|
||||||
from whoosh.searching import Searcher
|
from whoosh.searching import Searcher
|
||||||
@ -202,114 +205,32 @@ def remove_document_from_index(document: Document):
|
|||||||
remove_document(writer, document)
|
remove_document(writer, document)
|
||||||
|
|
||||||
|
|
||||||
|
class MappedDocIdSet(DocIdSet):
|
||||||
|
"""
|
||||||
|
A DocIdSet backed by a set of `Document` IDs.
|
||||||
|
Supports efficiently looking up if a whoosh docnum is in the provided `filter_queryset`.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, filter_queryset: QuerySet, ixreader: IndexReader) -> None:
|
||||||
|
super().__init__()
|
||||||
|
document_ids = filter_queryset.order_by("id").values_list("id", flat=True)
|
||||||
|
max_id = document_ids.last() or 0
|
||||||
|
self.document_ids = BitSet(document_ids, size=max_id)
|
||||||
|
self.ixreader = ixreader
|
||||||
|
|
||||||
|
def __contains__(self, docnum):
|
||||||
|
document_id = self.ixreader.stored_fields(docnum)["id"]
|
||||||
|
return document_id in self.document_ids
|
||||||
|
|
||||||
|
def __bool__(self):
|
||||||
|
# searcher.search ignores a filter if it's "falsy".
|
||||||
|
# We use this hack so this DocIdSet, when used as a filter, is never ignored.
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
class DelayedQuery:
|
class DelayedQuery:
|
||||||
param_map = {
|
|
||||||
"correspondent": ("correspondent", ["id", "id__in", "id__none", "isnull"]),
|
|
||||||
"document_type": ("type", ["id", "id__in", "id__none", "isnull"]),
|
|
||||||
"storage_path": ("path", ["id", "id__in", "id__none", "isnull"]),
|
|
||||||
"owner": ("owner", ["id", "id__in", "id__none", "isnull"]),
|
|
||||||
"shared_by": ("shared_by", ["id"]),
|
|
||||||
"tags": ("tag", ["id__all", "id__in", "id__none"]),
|
|
||||||
"added": ("added", ["date__lt", "date__gt"]),
|
|
||||||
"created": ("created", ["date__lt", "date__gt"]),
|
|
||||||
"checksum": ("checksum", ["icontains", "istartswith"]),
|
|
||||||
"original_filename": ("original_filename", ["icontains", "istartswith"]),
|
|
||||||
"custom_fields": (
|
|
||||||
"custom_fields",
|
|
||||||
["icontains", "istartswith", "id__all", "id__in", "id__none"],
|
|
||||||
),
|
|
||||||
}
|
|
||||||
|
|
||||||
def _get_query(self):
|
def _get_query(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError # pragma: no cover
|
||||||
|
|
||||||
def _get_query_filter(self):
|
|
||||||
criterias = []
|
|
||||||
for key, value in self.query_params.items():
|
|
||||||
# is_tagged is a special case
|
|
||||||
if key == "is_tagged":
|
|
||||||
criterias.append(query.Term("has_tag", self.evalBoolean(value)))
|
|
||||||
continue
|
|
||||||
|
|
||||||
if key == "has_custom_fields":
|
|
||||||
criterias.append(
|
|
||||||
query.Term("has_custom_fields", self.evalBoolean(value)),
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Don't process query params without a filter
|
|
||||||
if "__" not in key:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# All other query params consist of a parameter and a query filter
|
|
||||||
param, query_filter = key.split("__", 1)
|
|
||||||
try:
|
|
||||||
field, supported_query_filters = self.param_map[param]
|
|
||||||
except KeyError:
|
|
||||||
logger.error(f"Unable to build a query filter for parameter {key}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# We only support certain filters per parameter
|
|
||||||
if query_filter not in supported_query_filters:
|
|
||||||
logger.info(
|
|
||||||
f"Query filter {query_filter} not supported for parameter {param}",
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if query_filter == "id":
|
|
||||||
if param == "shared_by":
|
|
||||||
criterias.append(query.Term("is_shared", True))
|
|
||||||
criterias.append(query.Term("owner_id", value))
|
|
||||||
else:
|
|
||||||
criterias.append(query.Term(f"{field}_id", value))
|
|
||||||
elif query_filter == "id__in":
|
|
||||||
in_filter = []
|
|
||||||
for object_id in value.split(","):
|
|
||||||
in_filter.append(
|
|
||||||
query.Term(f"{field}_id", object_id),
|
|
||||||
)
|
|
||||||
criterias.append(query.Or(in_filter))
|
|
||||||
elif query_filter == "id__none":
|
|
||||||
for object_id in value.split(","):
|
|
||||||
criterias.append(
|
|
||||||
query.Not(query.Term(f"{field}_id", object_id)),
|
|
||||||
)
|
|
||||||
elif query_filter == "isnull":
|
|
||||||
criterias.append(
|
|
||||||
query.Term(f"has_{field}", self.evalBoolean(value) is False),
|
|
||||||
)
|
|
||||||
elif query_filter == "id__all":
|
|
||||||
for object_id in value.split(","):
|
|
||||||
criterias.append(query.Term(f"{field}_id", object_id))
|
|
||||||
elif query_filter == "date__lt":
|
|
||||||
criterias.append(
|
|
||||||
query.DateRange(field, start=None, end=isoparse(value)),
|
|
||||||
)
|
|
||||||
elif query_filter == "date__gt":
|
|
||||||
criterias.append(
|
|
||||||
query.DateRange(field, start=isoparse(value), end=None),
|
|
||||||
)
|
|
||||||
elif query_filter == "icontains":
|
|
||||||
criterias.append(
|
|
||||||
query.Term(field, value),
|
|
||||||
)
|
|
||||||
elif query_filter == "istartswith":
|
|
||||||
criterias.append(
|
|
||||||
query.Prefix(field, value),
|
|
||||||
)
|
|
||||||
|
|
||||||
user_criterias = get_permissions_criterias(
|
|
||||||
user=self.user,
|
|
||||||
)
|
|
||||||
if len(criterias) > 0:
|
|
||||||
if len(user_criterias) > 0:
|
|
||||||
criterias.append(query.Or(user_criterias))
|
|
||||||
return query.And(criterias)
|
|
||||||
else:
|
|
||||||
return query.Or(user_criterias) if len(user_criterias) > 0 else None
|
|
||||||
|
|
||||||
def evalBoolean(self, val):
|
|
||||||
return val.lower() in {"true", "1"}
|
|
||||||
|
|
||||||
def _get_query_sortedby(self):
|
def _get_query_sortedby(self):
|
||||||
if "ordering" not in self.query_params:
|
if "ordering" not in self.query_params:
|
||||||
@ -340,13 +261,19 @@ class DelayedQuery:
|
|||||||
else:
|
else:
|
||||||
return sort_fields_map[field], reverse
|
return sort_fields_map[field], reverse
|
||||||
|
|
||||||
def __init__(self, searcher: Searcher, query_params, page_size, user):
|
def __init__(
|
||||||
|
self,
|
||||||
|
searcher: Searcher,
|
||||||
|
query_params,
|
||||||
|
page_size,
|
||||||
|
filter_queryset: QuerySet,
|
||||||
|
):
|
||||||
self.searcher = searcher
|
self.searcher = searcher
|
||||||
self.query_params = query_params
|
self.query_params = query_params
|
||||||
self.page_size = page_size
|
self.page_size = page_size
|
||||||
self.saved_results = dict()
|
self.saved_results = dict()
|
||||||
self.first_score = None
|
self.first_score = None
|
||||||
self.user = user
|
self.filter_queryset = filter_queryset
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
page = self[0:1]
|
page = self[0:1]
|
||||||
@ -362,7 +289,7 @@ class DelayedQuery:
|
|||||||
page: ResultsPage = self.searcher.search_page(
|
page: ResultsPage = self.searcher.search_page(
|
||||||
q,
|
q,
|
||||||
mask=mask,
|
mask=mask,
|
||||||
filter=self._get_query_filter(),
|
filter=MappedDocIdSet(self.filter_queryset, self.searcher.ixreader),
|
||||||
pagenum=math.floor(item.start / self.page_size) + 1,
|
pagenum=math.floor(item.start / self.page_size) + 1,
|
||||||
pagelen=self.page_size,
|
pagelen=self.page_size,
|
||||||
sortedby=sortedby,
|
sortedby=sortedby,
|
||||||
|
@ -1,8 +1,6 @@
|
|||||||
from dateutil.parser import isoparse
|
|
||||||
from django.test import TestCase
|
from django.test import TestCase
|
||||||
from whoosh import query
|
from whoosh import query
|
||||||
|
|
||||||
from documents.index import DelayedQuery
|
|
||||||
from documents.index import get_permissions_criterias
|
from documents.index import get_permissions_criterias
|
||||||
from documents.models import User
|
from documents.models import User
|
||||||
|
|
||||||
@ -58,162 +56,3 @@ class TestDelayedQuery(TestCase):
|
|||||||
)
|
)
|
||||||
for user, expected in tests:
|
for user, expected in tests:
|
||||||
self.assertEqual(get_permissions_criterias(user), expected)
|
self.assertEqual(get_permissions_criterias(user), expected)
|
||||||
|
|
||||||
def test_no_query_filters(self):
|
|
||||||
dq = DelayedQuery(None, {}, None, None)
|
|
||||||
self.assertEqual(dq._get_query_filter(), self.has_no_owner)
|
|
||||||
|
|
||||||
def test_date_query_filters(self):
|
|
||||||
def _get_testset(param: str):
|
|
||||||
date_str = "1970-01-01T02:44"
|
|
||||||
date_obj = isoparse(date_str)
|
|
||||||
return (
|
|
||||||
(
|
|
||||||
{f"{param}__date__lt": date_str},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.DateRange(param, start=None, end=date_obj),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
{f"{param}__date__gt": date_str},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.DateRange(param, start=date_obj, end=None),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
query_params = ["created", "added"]
|
|
||||||
for param in query_params:
|
|
||||||
for params, expected in _get_testset(param):
|
|
||||||
dq = DelayedQuery(None, params, None, None)
|
|
||||||
got = dq._get_query_filter()
|
|
||||||
self.assertCountEqual(got, expected)
|
|
||||||
|
|
||||||
def test_is_tagged_query_filter(self):
|
|
||||||
tests = (
|
|
||||||
("True", True),
|
|
||||||
("true", True),
|
|
||||||
("1", True),
|
|
||||||
("False", False),
|
|
||||||
("false", False),
|
|
||||||
("0", False),
|
|
||||||
("foo", False),
|
|
||||||
)
|
|
||||||
for param, expected in tests:
|
|
||||||
dq = DelayedQuery(None, {"is_tagged": param}, None, None)
|
|
||||||
self.assertEqual(
|
|
||||||
dq._get_query_filter(),
|
|
||||||
query.And([query.Term("has_tag", expected), self.has_no_owner]),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_tags_query_filters(self):
|
|
||||||
# tests contains tuples of query_parameter dicts and the expected whoosh query
|
|
||||||
param = "tags"
|
|
||||||
field, _ = DelayedQuery.param_map[param]
|
|
||||||
tests = (
|
|
||||||
(
|
|
||||||
{f"{param}__id__all": "42,43"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Term(f"{field}_id", "42"),
|
|
||||||
query.Term(f"{field}_id", "43"),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
# tags does not allow __id
|
|
||||||
(
|
|
||||||
{f"{param}__id": "42"},
|
|
||||||
self.has_no_owner,
|
|
||||||
),
|
|
||||||
# tags does not allow __isnull
|
|
||||||
(
|
|
||||||
{f"{param}__isnull": "true"},
|
|
||||||
self.has_no_owner,
|
|
||||||
),
|
|
||||||
self._get_testset__id__in(param, field),
|
|
||||||
self._get_testset__id__none(param, field),
|
|
||||||
)
|
|
||||||
|
|
||||||
for params, expected in tests:
|
|
||||||
dq = DelayedQuery(None, params, None, None)
|
|
||||||
got = dq._get_query_filter()
|
|
||||||
self.assertCountEqual(got, expected)
|
|
||||||
|
|
||||||
def test_generic_query_filters(self):
|
|
||||||
def _get_testset(param: str):
|
|
||||||
field, _ = DelayedQuery.param_map[param]
|
|
||||||
return (
|
|
||||||
(
|
|
||||||
{f"{param}__id": "42"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Term(f"{field}_id", "42"),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
self._get_testset__id__in(param, field),
|
|
||||||
self._get_testset__id__none(param, field),
|
|
||||||
(
|
|
||||||
{f"{param}__isnull": "true"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Term(f"has_{field}", False),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
{f"{param}__isnull": "false"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Term(f"has_{field}", True),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
query_params = ["correspondent", "document_type", "storage_path", "owner"]
|
|
||||||
for param in query_params:
|
|
||||||
for params, expected in _get_testset(param):
|
|
||||||
dq = DelayedQuery(None, params, None, None)
|
|
||||||
got = dq._get_query_filter()
|
|
||||||
self.assertCountEqual(got, expected)
|
|
||||||
|
|
||||||
def test_char_query_filter(self):
|
|
||||||
def _get_testset(param: str):
|
|
||||||
return (
|
|
||||||
(
|
|
||||||
{f"{param}__icontains": "foo"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Term(f"{param}", "foo"),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
(
|
|
||||||
{f"{param}__istartswith": "foo"},
|
|
||||||
query.And(
|
|
||||||
[
|
|
||||||
query.Prefix(f"{param}", "foo"),
|
|
||||||
self.has_no_owner,
|
|
||||||
],
|
|
||||||
),
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
query_params = ["checksum", "original_filename"]
|
|
||||||
for param in query_params:
|
|
||||||
for params, expected in _get_testset(param):
|
|
||||||
dq = DelayedQuery(None, params, None, None)
|
|
||||||
got = dq._get_query_filter()
|
|
||||||
self.assertCountEqual(got, expected)
|
|
||||||
|
@ -852,6 +852,8 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def filter_queryset(self, queryset):
|
def filter_queryset(self, queryset):
|
||||||
|
filtered_queryset = super().filter_queryset(queryset)
|
||||||
|
|
||||||
if self._is_search_request():
|
if self._is_search_request():
|
||||||
from documents import index
|
from documents import index
|
||||||
|
|
||||||
@ -866,10 +868,10 @@ class UnifiedSearchViewSet(DocumentViewSet):
|
|||||||
self.searcher,
|
self.searcher,
|
||||||
self.request.query_params,
|
self.request.query_params,
|
||||||
self.paginator.get_page_size(self.request),
|
self.paginator.get_page_size(self.request),
|
||||||
self.request.user,
|
filter_queryset=filtered_queryset,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return super().filter_queryset(queryset)
|
return filtered_queryset
|
||||||
|
|
||||||
def list(self, request, *args, **kwargs):
|
def list(self, request, *args, **kwargs):
|
||||||
if self._is_search_request():
|
if self._is_search_request():
|
||||||
@ -1203,7 +1205,7 @@ class GlobalSearchView(PassUserMixin):
|
|||||||
s,
|
s,
|
||||||
request.query_params,
|
request.query_params,
|
||||||
10,
|
10,
|
||||||
request.user,
|
filter_queryset=all_docs,
|
||||||
)._get_query()
|
)._get_query()
|
||||||
results = s.search(q, limit=OBJECT_LIMIT)
|
results = s.search(q, limit=OBJECT_LIMIT)
|
||||||
docs = docs | all_docs.filter(id__in=[r["id"] for r in results])
|
docs = docs | all_docs.filter(id__in=[r["id"] for r in results])
|
||||||
@ -1452,12 +1454,12 @@ class StatisticsView(APIView):
|
|||||||
{
|
{
|
||||||
"documents_total": documents_total,
|
"documents_total": documents_total,
|
||||||
"documents_inbox": documents_inbox,
|
"documents_inbox": documents_inbox,
|
||||||
"inbox_tag": inbox_tags.first().pk
|
"inbox_tag": (
|
||||||
if inbox_tags.exists()
|
inbox_tags.first().pk if inbox_tags.exists() else None
|
||||||
else None, # backwards compatibility
|
), # backwards compatibility
|
||||||
"inbox_tags": [tag.pk for tag in inbox_tags]
|
"inbox_tags": (
|
||||||
if inbox_tags.exists()
|
[tag.pk for tag in inbox_tags] if inbox_tags.exists() else None
|
||||||
else None,
|
),
|
||||||
"document_file_type_counts": document_file_type_counts,
|
"document_file_type_counts": document_file_type_counts,
|
||||||
"character_count": character_count,
|
"character_count": character_count,
|
||||||
"tag_count": len(tags),
|
"tag_count": len(tags),
|
||||||
|
Loading…
x
Reference in New Issue
Block a user