diff --git a/HISTORY.rst b/HISTORY.rst index a50b1a6e81..a142ae9f0f 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -17,6 +17,8 @@ * Strip HTML entities from the browser's window title. Closes GitLab issue #517. Thanks to Daniel Carrico @daniel1113 for the report. +* Improve search app. Refactored to resolve search queries + by terms first then by field. 3.1.3 (2018-09-27) ================== diff --git a/mayan/apps/dynamic_search/classes.py b/mayan/apps/dynamic_search/classes.py index 8571b58049..47ed18ba77 100644 --- a/mayan/apps/dynamic_search/classes.py +++ b/mayan/apps/dynamic_search/classes.py @@ -13,6 +13,13 @@ from .settings import setting_limit logger = logging.getLogger(__name__) +QUERY_OPERATION_AND = 1 +QUERY_OPERATION_OR = 2 + +TERM_OPERATION_AND = 'AND' +TERM_OPERATION_OR = 'OR' +TERM_OPERATIONS = [TERM_OPERATION_AND, TERM_OPERATION_OR] + class SearchModel(object): _registry = {} @@ -61,7 +68,7 @@ class SearchModel(object): inside_quotes = False negated = False - term = [] + term_letters = [] terms = [] for letter in text: @@ -70,22 +77,23 @@ class SearchModel(object): else: if letter in QUOTES: if inside_quotes: - if term: + if term_letters: terms.append( { 'meta': False, 'negated': negated, - 'string': ''.join(term) + 'string': ''.join(term_letters) } ) negated = False - term = [] + term_letters = [] inside_quotes = not inside_quotes else: if not inside_quotes and letter == SPACE_CHARACTER: - if term: - if term == ['O', 'R']: + if term_letters: + term_string = ''.join(term_letters) + if term_string in TERM_OPERATIONS: meta = True else: meta = False @@ -94,20 +102,20 @@ class SearchModel(object): { 'meta': meta, 'negated': negated, - 'string': ''.join(term) + 'string': term_string } ) negated = False - term = [] + term_letters = [] else: - term.append(letter) + term_letters.append(letter) - if term: + if term_letters: terms.append( { 'meta': False, 'negated': negated, - 'string': ''.join(term) + 'string': ''.join(term_letters) } ) @@ -146,31 +154,6 @@ class SearchModel(object): search_field = SearchField(self, *args, **kwargs) self.search_fields.append(search_field) - def assemble_query(self, terms, search_fields): - """ - Returns a query, that is a combination of Q objects. That combination - aims to search keywords within a model by testing the given search - fields. - """ - queries = [] - - for term in terms: - query = None - if term['string'] != 'OR': - for field in search_fields: - q = Q(**{'%s__%s' % (field, 'icontains'): term['string']}) - - if term['negated']: - q = ~q - - if query is None: - query = q - else: - query = query | q - - queries.append(query) - return queries - def get_all_search_fields(self): return self.search_fields @@ -197,106 +180,27 @@ class SearchModel(object): AccessControlList = apps.get_model( app_label='acls', model_name='AccessControlList' ) - elapsed_time = 0 start_time = datetime.datetime.now() - result_set = set() - search_dict = {} - if 'q' in query_string: - # Simple search - for search_field in self.get_all_search_fields(): - search_dict.setdefault(search_field.get_model(), { - 'searches': [], - 'label': search_field.label, - 'return_value': search_field.return_value - }) - search_dict[search_field.get_model()]['searches'].append( - { - 'field_name': [search_field.field], - 'terms': SearchModel.get_terms( - query_string.get('q', '').strip() - ) - } - ) - else: - for search_field in self.get_all_search_fields(): - if search_field.field in query_string and query_string[search_field.field]: - search_dict.setdefault(search_field.get_model(), { - 'searches': [], - 'label': search_field.label, - 'return_value': search_field.return_value - }) - search_dict[search_field.get_model()]['searches'].append( - { - 'field_name': [search_field.field], - 'terms': SearchModel.get_terms( - query_string[search_field.field] - ) - } - ) + result = None + for search_field in self.get_all_search_fields(): - for model, data in search_dict.items(): - logger.debug('model: %s', model) + terms = self.get_terms( + query_string.get( + search_field.field, query_string.get('q', '') + ).strip() + ) - # Initialize per model result set - model_result_set = set() - - for query_entry in data['searches']: - # Fashion a list of queries for a field for each term - field_query_list = self.assemble_query( - query_entry['terms'], query_entry['field_name'] - ) - - logger.debug('field_query_list: %s', field_query_list) - - # Initialize per field result set - field_result_set = set() - - # Get results per search field - intersection = True - for query in field_query_list: - logger.debug('query: %s', query) - - if query: - term_query_result_set = set( - model.objects.filter(query).values_list( - data['return_value'], flat=True - ) - ) - - # Convert the QuerySet to a Python set and perform the - # AND operation on the program and not as a query. - # This operation ANDs all the field term results - # belonging to a single model, making sure to only include - # results in the final field result variable if all the - # terms are found in a single field. - if not field_result_set: - field_result_set = term_query_result_set - else: - if intersection: - field_result_set &= term_query_result_set - else: - field_result_set |= term_query_result_set - - logger.debug( - 'term_query_result_set: %s', len(term_query_result_set) - ) - logger.debug('field_result_set: %s', len(field_result_set)) - - intersection = True - else: - intersection = False - - if global_and_search: - if not model_result_set: - model_result_set = field_result_set - else: - model_result_set &= field_result_set + field_query = search_field.get_query(terms=terms) + if field_query: + if result is None: + result = field_query else: - model_result_set |= field_result_set - - result_set = result_set | model_result_set + if global_and_search: + result = result & field_query + else: + result = result | field_query elapsed_time = force_text( datetime.datetime.now() - start_time @@ -304,9 +208,8 @@ class SearchModel(object): logger.debug('elapsed_time: %s', elapsed_time) - queryset = self.model.objects.filter( - pk__in=list(result_set)[:setting_limit.value] - ) + pk_list = set(self.model.objects.filter(result or Q()).values_list('pk', flat=True)[:setting_limit.value]) + queryset = self.model.objects.filter(pk__in=pk_list) if self.permission: queryset = AccessControlList.objects.filter_by_access( @@ -316,7 +219,6 @@ class SearchModel(object): return queryset, elapsed_time -# SearchField classes class SearchField(object): """ Search for terms in fields that directly belong to the parent SearchModel @@ -325,10 +227,36 @@ class SearchField(object): self.search_model = search_model self.field = field self.label = label - self.return_value = 'pk' def get_full_name(self): return self.field def get_model(self): return self.search_model.model + + def get_query(self, terms): + query_operation = QUERY_OPERATION_AND + result = None + + for term in terms: + if term['meta']: + # It is a meta term, modifies the query operation + # and is not searched + if term['string'] == TERM_OPERATION_OR: + query_operation = QUERY_OPERATION_OR + else: + q_object = Q( + **{'%s__%s' % (self.field, 'icontains'): term['string']} + ) + if term['negated']: + q_object = ~q_object + + if result is None: + result = q_object + else: + if query_operation == QUERY_OPERATION_AND: + result = result & q_object + else: + result = result | q_object + + return result diff --git a/mayan/apps/dynamic_search/tests/test_models.py b/mayan/apps/dynamic_search/tests/test_models.py index c45ae9a3ba..1cdc55e2c1 100644 --- a/mayan/apps/dynamic_search/tests/test_models.py +++ b/mayan/apps/dynamic_search/tests/test_models.py @@ -3,33 +3,23 @@ from __future__ import unicode_literals from django.test import override_settings from common.tests import BaseTestCase -from documents.models import DocumentType from documents.search import document_search -from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH +from documents.tests import ( + DocumentTestMixin, TEST_DOCUMENT_FILENAME, TEST_SMALL_DOCUMENT_FILENAME +) @override_settings(OCR_AUTO_OCR=False) -class DocumentSearchTestCase(BaseTestCase): - def setUp(self): - super(DocumentSearchTestCase, self).setUp() - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_SMALL_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - file_object=file_object, label='mayan_11_1.pdf' - ) - - def tearDown(self): - self.document_type.delete() - super(DocumentSearchTestCase, self).tearDown() +class DocumentSearchTestCase(DocumentTestMixin, BaseTestCase): + auto_upload_document = False + test_document_filename = TEST_DOCUMENT_FILENAME def test_simple_search_after_related_name_change(self): """ Test that simple search works after related_name changes to document versions and document version pages """ + self.document = self.upload_document() queryset, elapsed_time = document_search.search( {'q': 'Mayan'}, user=self.admin_user ) @@ -38,6 +28,7 @@ class DocumentSearchTestCase(BaseTestCase): def test_advanced_search_after_related_name_change(self): # Test versions__filename + self.document = self.upload_document() queryset, elapsed_time = document_search.search( {'label': self.document.label}, user=self.admin_user ) @@ -53,10 +44,11 @@ class DocumentSearchTestCase(BaseTestCase): self.assertTrue(self.document in queryset) def test_simple_or_search(self): - with open(TEST_SMALL_DOCUMENT_PATH) as file_object: - self.document_2 = self.document_type.new_document( - file_object=file_object, label='second_doc.pdf' - ) + self.document = self.upload_document() + self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME + self.document_2 = self.upload_document() + self.document_2.label = 'second_doc.pdf' + self.document_2.save() queryset, elapsed_time = document_search.search( {'q': 'Mayan OR second'}, user=self.admin_user @@ -66,12 +58,48 @@ class DocumentSearchTestCase(BaseTestCase): self.assertTrue(self.document_2 in queryset) def test_simple_and_search(self): - with open(TEST_SMALL_DOCUMENT_PATH) as file_object: - self.document_2 = self.document_type.new_document( - file_object=file_object, label='second_doc.pdf' - ) + self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME + self.document_2 = self.upload_document() + self.document_2.label = 'second_doc.pdf' + self.document_2.save() queryset, elapsed_time = document_search.search( - {'q': 'Mayan second'}, user=self.admin_user + {'q': 'non_valid second'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 0) + + queryset, elapsed_time = document_search.search( + {'q': 'second non_valid'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 0) + + def test_simple_negated_search(self): + self.test_document_filename = TEST_SMALL_DOCUMENT_FILENAME + self.document_2 = self.upload_document() + self.document_2.label = 'second_doc.pdf' + self.document_2.save() + + queryset, elapsed_time = document_search.search( + {'q': '-non_valid second'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 1) + + queryset, elapsed_time = document_search.search( + {'label': '-second'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 0) + + queryset, elapsed_time = document_search.search( + {'label': '-second -Mayan'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 0) + + queryset, elapsed_time = document_search.search( + {'label': '-second OR -Mayan'}, user=self.admin_user + ) + self.assertEqual(queryset.count(), 1) + + queryset, elapsed_time = document_search.search( + {'label': '-non_valid -second'}, user=self.admin_user ) self.assertEqual(queryset.count(), 0)