Improve search app. Refactored to resolve search queries by terms first then by field.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-10-04 03:29:03 -04:00
parent 2b2981af25
commit e0b7f31111
3 changed files with 119 additions and 161 deletions

View File

@@ -17,6 +17,8 @@
* Strip HTML entities from the browser's window title.
Closes GitLab issue #517. Thanks to Daniel Carrico @daniel1113
for the report.
* Improve search app. Refactored to resolve search queries
by terms first then by field.
3.1.3 (2018-09-27)
==================

View File

@@ -13,6 +13,13 @@ from .settings import setting_limit
logger = logging.getLogger(__name__)
QUERY_OPERATION_AND = 1
QUERY_OPERATION_OR = 2
TERM_OPERATION_AND = 'AND'
TERM_OPERATION_OR = 'OR'
TERM_OPERATIONS = [TERM_OPERATION_AND, TERM_OPERATION_OR]
class SearchModel(object):
_registry = {}
@@ -61,7 +68,7 @@ class SearchModel(object):
inside_quotes = False
negated = False
term = []
term_letters = []
terms = []
for letter in text:
@@ -70,22 +77,23 @@ class SearchModel(object):
else:
if letter in QUOTES:
if inside_quotes:
if term:
if term_letters:
terms.append(
{
'meta': False,
'negated': negated,
'string': ''.join(term)
'string': ''.join(term_letters)
}
)
negated = False
term = []
term_letters = []
inside_quotes = not inside_quotes
else:
if not inside_quotes and letter == SPACE_CHARACTER:
if term:
if term == ['O', 'R']:
if term_letters:
term_string = ''.join(term_letters)
if term_string in TERM_OPERATIONS:
meta = True
else:
meta = False
@@ -94,20 +102,20 @@ class SearchModel(object):
{
'meta': meta,
'negated': negated,
'string': ''.join(term)
'string': term_string
}
)
negated = False
term = []
term_letters = []
else:
term.append(letter)
term_letters.append(letter)
if term:
if term_letters:
terms.append(
{
'meta': False,
'negated': negated,
'string': ''.join(term)
'string': ''.join(term_letters)
}
)
@@ -146,31 +154,6 @@ class SearchModel(object):
search_field = SearchField(self, *args, **kwargs)
self.search_fields.append(search_field)
def assemble_query(self, terms, search_fields):
"""
Returns a query, that is a combination of Q objects. That combination
aims to search keywords within a model by testing the given search
fields.
"""
queries = []
for term in terms:
query = None
if term['string'] != 'OR':
for field in search_fields:
q = Q(**{'%s__%s' % (field, 'icontains'): term['string']})
if term['negated']:
q = ~q
if query is None:
query = q
else:
query = query | q
queries.append(query)
return queries
def get_all_search_fields(self):
return self.search_fields
@@ -197,106 +180,27 @@ class SearchModel(object):
AccessControlList = apps.get_model(
app_label='acls', model_name='AccessControlList'
)
elapsed_time = 0
start_time = datetime.datetime.now()
result_set = set()
search_dict = {}
if 'q' in query_string:
# Simple search
for search_field in self.get_all_search_fields():
search_dict.setdefault(search_field.get_model(), {
'searches': [],
'label': search_field.label,
'return_value': search_field.return_value
})
search_dict[search_field.get_model()]['searches'].append(
{
'field_name': [search_field.field],
'terms': SearchModel.get_terms(
query_string.get('q', '').strip()
)
}
)
else:
for search_field in self.get_all_search_fields():
if search_field.field in query_string and query_string[search_field.field]:
search_dict.setdefault(search_field.get_model(), {
'searches': [],
'label': search_field.label,
'return_value': search_field.return_value
})
search_dict[search_field.get_model()]['searches'].append(
{
'field_name': [search_field.field],
'terms': SearchModel.get_terms(
query_string[search_field.field]
)
}
)
result = None
for search_field in self.get_all_search_fields():
for model, data in search_dict.items():
logger.debug('model: %s', model)
terms = self.get_terms(
query_string.get(
search_field.field, query_string.get('q', '')
).strip()
)
# Initialize per model result set
model_result_set = set()
for query_entry in data['searches']:
# Fashion a list of queries for a field for each term
field_query_list = self.assemble_query(
query_entry['terms'], query_entry['field_name']
)
logger.debug('field_query_list: %s', field_query_list)
# Initialize per field result set
field_result_set = set()
# Get results per search field
intersection = True
for query in field_query_list:
logger.debug('query: %s', query)
if query:
term_query_result_set = set(
model.objects.filter(query).values_list(
data['return_value'], flat=True
)
)
# Convert the QuerySet to a Python set and perform the
# AND operation on the program and not as a query.
# This operation ANDs all the field term results
# belonging to a single model, making sure to only include
# results in the final field result variable if all the
# terms are found in a single field.
if not field_result_set:
field_result_set = term_query_result_set
else:
if intersection:
field_result_set &= term_query_result_set
else:
field_result_set |= term_query_result_set
logger.debug(
'term_query_result_set: %s', len(term_query_result_set)
)
logger.debug('field_result_set: %s', len(field_result_set))
intersection = True
else:
intersection = False
if global_and_search:
if not model_result_set:
model_result_set = field_result_set
else:
model_result_set &= field_result_set
field_query = search_field.get_query(terms=terms)
if field_query:
if result is None:
result = field_query
else:
model_result_set |= field_result_set
result_set = result_set | model_result_set
if global_and_search:
result = result & field_query
else:
result = result | field_query
elapsed_time = force_text(
datetime.datetime.now() - start_time
@@ -304,9 +208,8 @@ class SearchModel(object):
logger.debug('elapsed_time: %s', elapsed_time)
queryset = self.model.objects.filter(
pk__in=list(result_set)[:setting_limit.value]
)
pk_list = set(self.model.objects.filter(result or Q()).values_list('pk', flat=True)[:setting_limit.value])
queryset = self.model.objects.filter(pk__in=pk_list)
if self.permission:
queryset = AccessControlList.objects.filter_by_access(
@@ -316,7 +219,6 @@ class SearchModel(object):
return queryset, elapsed_time
# SearchField classes
class SearchField(object):
"""
Search for terms in fields that directly belong to the parent SearchModel
@@ -325,10 +227,36 @@ class SearchField(object):
self.search_model = search_model
self.field = field
self.label = label
self.return_value = 'pk'
def get_full_name(self):
return self.field
def get_model(self):
return self.search_model.model
def get_query(self, terms):
query_operation = QUERY_OPERATION_AND
result = None
for term in terms:
if term['meta']:
# It is a meta term, modifies the query operation
# and is not searched
if term['string'] == TERM_OPERATION_OR:
query_operation = QUERY_OPERATION_OR
else:
q_object = Q(
**{'%s__%s' % (self.field, 'icontains'): term['string']}
)
if term['negated']:
q_object = ~q_object
if result is None:
result = q_object
else:
if query_operation == QUERY_OPERATION_AND:
result = result & q_object
else:
result = result | q_object
return result

View File

@@ -3,33 +3,23 @@ from __future__ import unicode_literals
from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.search import document_search
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
from documents.tests import (
DocumentTestMixin, TEST_DOCUMENT_FILENAME, TEST_SMALL_DOCUMENT_FILENAME
)
@override_settings(OCR_AUTO_OCR=False)
class DocumentSearchTestCase(BaseTestCase):
def setUp(self):
super(DocumentSearchTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object, label='mayan_11_1.pdf'
)
def tearDown(self):
self.document_type.delete()
super(DocumentSearchTestCase, self).tearDown()
class DocumentSearchTestCase(DocumentTestMixin, BaseTestCase):
auto_upload_document = False
test_document_filename = TEST_DOCUMENT_FILENAME
def test_simple_search_after_related_name_change(self):
"""
Test that simple search works after related_name changes to
document versions and document version pages
"""
self.document = self.upload_document()
queryset, elapsed_time = document_search.search(
{'q': 'Mayan'}, user=self.admin_user
)
@@ -38,6 +28,7 @@ class DocumentSearchTestCase(BaseTestCase):
def test_advanced_search_after_related_name_change(self):
# Test versions__filename
self.document = self.upload_document()
queryset, elapsed_time = document_search.search(
{'label': self.document.label}, user=self.admin_user
)
@@ -53,10 +44,11 @@ class DocumentSearchTestCase(BaseTestCase):
self.assertTrue(self.document in queryset)
def test_simple_or_search(self):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document_2 = self.document_type.new_document(
file_object=file_object, label='second_doc.pdf'
)
self.document = self.upload_document()
self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME
self.document_2 = self.upload_document()
self.document_2.label = 'second_doc.pdf'
self.document_2.save()
queryset, elapsed_time = document_search.search(
{'q': 'Mayan OR second'}, user=self.admin_user
@@ -66,12 +58,48 @@ class DocumentSearchTestCase(BaseTestCase):
self.assertTrue(self.document_2 in queryset)
def test_simple_and_search(self):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document_2 = self.document_type.new_document(
file_object=file_object, label='second_doc.pdf'
)
self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME
self.document_2 = self.upload_document()
self.document_2.label = 'second_doc.pdf'
self.document_2.save()
queryset, elapsed_time = document_search.search(
{'q': 'Mayan second'}, user=self.admin_user
{'q': 'non_valid second'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 0)
queryset, elapsed_time = document_search.search(
{'q': 'second non_valid'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 0)
def test_simple_negated_search(self):
self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME
self.document_2 = self.upload_document()
self.document_2.label = 'second_doc.pdf'
self.document_2.save()
queryset, elapsed_time = document_search.search(
{'q': '-non_valid second'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 1)
queryset, elapsed_time = document_search.search(
{'label': '-second'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 0)
queryset, elapsed_time = document_search.search(
{'label': '-second -Mayan'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 0)
queryset, elapsed_time = document_search.search(
{'label': '-second OR -Mayan'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 1)
queryset, elapsed_time = document_search.search(
{'label': '-non_valid -second'}, user=self.admin_user
)
self.assertEqual(queryset.count(), 0)