diff --git a/HISTORY.rst b/HISTORY.rst index 1c1af4756b..b32e84ee12 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -1,3 +1,13 @@ +3.1.5 (2018-10-XX) +================== +* Consolidate some document indexing test code into a new mixin. +* Split the code of the mountindex command to be able to add tests. +* Fix the way the children of IndexInstanceNode are accessed. Fixes + GitLab issue #518. Thanks to TheOneValen @TheOneValen for the report. +* Remove newlines from the index name levels before using them as FUSE + directories. +* Fixed duplicated FUSE directory removal. + 3.1.4 (2018-10-4) ================= * Fix the link to the documenation. Closes GitLab issue #516. diff --git a/mayan/apps/mirroring/apps.py b/mayan/apps/mirroring/apps.py index 5a60c5ee19..428df7d1d7 100644 --- a/mayan/apps/mirroring/apps.py +++ b/mayan/apps/mirroring/apps.py @@ -5,5 +5,6 @@ from django.utils.translation import ugettext_lazy as _ class MirroringApp(apps.AppConfig): + has_tests = True name = 'mirroring' verbose_name = _('Mirroring') diff --git a/mayan/apps/mirroring/classes.py b/mayan/apps/mirroring/classes.py new file mode 100644 index 0000000000..c6f8fe5ce3 --- /dev/null +++ b/mayan/apps/mirroring/classes.py @@ -0,0 +1,220 @@ +from __future__ import print_function, unicode_literals + +import datetime +from errno import ENOENT +import logging +from stat import S_IFDIR, S_IFREG +from time import time + +from fuse import FuseOSError, Operations + +from django.core.cache import caches +from django.core.exceptions import MultipleObjectsReturned +from django.db.models import Count, F, Func, Value + +from document_indexing.models import Index, IndexInstanceNode +from documents.models import Document + +from .literals import ( + MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE +) +from .settings import ( + setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout +) + +logger = logging.getLogger(__name__) + + +class IndexFilesystem(Operations): + 
class IndexFilesystem(Operations):
    """
    Read-only FUSE filesystem exposing a Mayan EDMS document index.

    Index instance nodes appear as directories; when a node's template has
    ``link_documents`` enabled, the node's documents appear as files inside
    it. Path-to-object lookups are memoized in the default Django cache
    (``setting_node_lookup_cache_timeout`` /
    ``setting_document_lookup_cache_timeout``).
    """

    @staticmethod
    def _clean_queryset(queryset):
        # Replace carriage return + newline pairs with a space so that
        # multi-line index values are valid (single-line) directory names.
        return queryset.annotate(
            clean_value=Func(
                F('value'), Value('\r\n'), Value(' '), function='replace'
            )
        )

    def _get_next_file_descriptor(self):
        """
        Return the next free integer file descriptor.

        The counter wraps back to MIN_FILE_DESCRIPTOR once it exceeds
        MAX_FILE_DESCRIPTOR. A slot is considered free when it was never
        handed out (KeyError) or holds a falsy (released) value.
        """
        while True:  # idiom: was the redundant-parentheses form `while(True)`
            self.file_descriptor_count += 1
            if self.file_descriptor_count > MAX_FILE_DESCRIPTOR:
                self.file_descriptor_count = MIN_FILE_DESCRIPTOR

            try:
                if not self.file_descriptors[self.file_descriptor_count]:
                    return self.file_descriptor_count
            except KeyError:
                return self.file_descriptor_count

    def _path_to_node(self, path, access_only=False, directory_only=True):
        """
        Resolve a filesystem ``path`` to an IndexInstanceNode or a Document.

        Returns None when the path does not resolve, or when it resolves to
        a document while ``directory_only`` is True. With ``access_only``
        True, a cache hit short-circuits to ``True`` and skips the database
        round trip entirely.
        """
        logger.debug('path: %s', path)
        logger.debug('directory_only: %s', directory_only)

        parts = path.split('/')

        logger.debug('parts: %s', parts)

        node = self.index.instance_root

        if len(parts) > 1 and parts[1] != '':
            obj = self.cache.get(path)

            if obj:
                node_pk = obj.get('node_pk')
                if node_pk:
                    if access_only:
                        return True
                    else:
                        return IndexInstanceNode.objects.get(pk=node_pk)

                document_pk = obj.get('document_pk')
                if document_pk:
                    if access_only:
                        return True
                    else:
                        return Document.objects.get(pk=document_pk)

            for count, part in enumerate(parts[1:]):
                try:
                    # Match against the newline-cleaned value, mirroring what
                    # readdir() publishes as the directory name.
                    node = IndexFilesystem._clean_queryset(
                        node.get_children()
                    ).get(clean_value=part)
                except IndexInstanceNode.DoesNotExist:
                    logger.debug('%s does not exists', part)

                    if directory_only:
                        return None
                    else:
                        # The final path component may name a document linked
                        # to the current node rather than a child node.
                        try:
                            if node.index_template_node.link_documents:
                                result = node.documents.get(label=part)
                                logger.debug(
                                    'path %s is a valid file path', path
                                )
                                self.cache.set(
                                    path, {'document_pk': result.pk},
                                    setting_document_lookup_cache_timeout.value
                                )

                                return result
                            else:
                                return None
                        except Document.DoesNotExist:
                            logger.debug(
                                'path %s is a file, but is not found', path
                            )
                            return None
                        except MultipleObjectsReturned:
                            # Ambiguous document label: treat as not found.
                            return None
                except MultipleObjectsReturned:
                    # Sibling nodes whose cleaned values collide: treat as
                    # not found; readdir() excludes such duplicates too.
                    return None

            self.cache.set(
                path, {'node_pk': node.pk},
                setting_node_lookup_cache_timeout.value
            )

        logger.debug('node: %s', node)
        logger.debug('node is root: %s', node.is_root_node())

        return node

    def __init__(self, index_slug):
        self.file_descriptor_count = MIN_FILE_DESCRIPTOR
        self.file_descriptors = {}
        self.cache = caches['default']

        try:
            self.index = Index.objects.get(slug=index_slug)
        except Index.DoesNotExist:
            # NOTE(review): bare exit() aborts the whole process; consider
            # raising an exception for the management command to translate
            # into a CommandError — TODO confirm with the mountindex caller.
            print('Unknown index slug: {}.'.format(index_slug))
            exit(1)

    def access(self, path, fh=None):
        """Raise FuseOSError(ENOENT) unless ``path`` resolves."""
        result = self._path_to_node(
            path=path, access_only=True, directory_only=False
        )

        if not result:
            raise FuseOSError(ENOENT)

    def getattr(self, path, fh=None):
        """
        Return a stat-like dict for ``path``: directory attributes for index
        nodes, file attributes (with real ctime/mtime/size) for documents.
        """
        logger.debug('path: %s, fh: %s', path, fh)

        now = time()
        result = self._path_to_node(path=path, directory_only=False)

        if not result:
            raise FuseOSError(ENOENT)

        if isinstance(result, IndexInstanceNode):
            return {
                'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now,
                'st_mtime': now, 'st_atime': now, 'st_nlink': 2
            }
        else:
            # Convert timezone-aware datetimes to UTC epoch seconds by
            # subtracting the UTC offset and the epoch origin manually.
            return {
                'st_mode': (S_IFREG | FILE_MODE),
                'st_ctime': (
                    result.date_added.replace(tzinfo=None) -
                    result.date_added.utcoffset() -
                    datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_mtime': (
                    result.latest_version.timestamp.replace(tzinfo=None) -
                    result.latest_version.timestamp.utcoffset() -
                    datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_atime': now,
                'st_size': result.size
            }

    def open(self, path, flags):
        """Open the document at ``path``; return an integer descriptor."""
        result = self._path_to_node(path=path, directory_only=False)

        if isinstance(result, Document):
            next_file_descriptor = self._get_next_file_descriptor()
            self.file_descriptors[next_file_descriptor] = result.open()
            return next_file_descriptor
        else:
            raise FuseOSError(ENOENT)

    def read(self, path, size, offset, fh):
        """Read ``size`` bytes at ``offset`` from the open descriptor ``fh``."""
        self.file_descriptors[fh].seek(offset)
        return self.file_descriptors[fh].read(size)

    def readdir(self, path, fh):
        """
        Yield '.', '..', the cleaned values of child index nodes, then (for
        document-linking nodes) document labels. Entries containing '/' and
        entries whose names collide after cleaning are excluded, since they
        cannot be represented unambiguously as directory entries.
        """
        logger.debug('path: %s', path)

        node = self._path_to_node(path=path, directory_only=True)

        if not node:
            raise FuseOSError(ENOENT)

        yield '.'
        yield '..'

        # Index instance nodes to directories
        queryset = IndexFilesystem._clean_queryset(node.get_children()).exclude(
            clean_value__contains='/'
        ).values('clean_value')

        # Find nodes with the same resulting value and remove them
        for duplicate in queryset.order_by().annotate(
            count_id=Count('id')
        ).filter(count_id__gt=1):
            queryset = queryset.exclude(clean_value=duplicate['clean_value'])

        for value in queryset.values_list('clean_value', flat=True):
            yield value

        # Documents
        if node.index_template_node.link_documents:
            queryset = node.documents.values('label').exclude(
                label__contains='/'
            )

            # Find duplicated documents and remove them
            for duplicate in queryset.order_by().annotate(
                count_id=Count('id')
            ).filter(count_id__gt=1):
                queryset = queryset.exclude(label=duplicate['label'])

            for document_label in queryset.values_list('label', flat=True):
                yield document_label

    def release(self, path, fh):
        """Close slot ``fh``. Missing keys read as free descriptors."""
        # Fix: the original assigned `self.file_descriptors[fh] = None`
        # immediately before deleting the same key — the assignment was
        # dead; deleting the entry alone is sufficient because
        # _get_next_file_descriptor() treats a missing key as free.
        del self.file_descriptors[fh]
- -from ...literals import ( - MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE -) -from ...settings import ( - setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout -) +from ...classes import IndexFilesystem logger = logging.getLogger(__name__) -class IndexFS(Operations): - def _get_next_file_descriptor(self): - while(True): - self.file_descriptor_count += 1 - if self.file_descriptor_count > MAX_FILE_DESCRIPTOR: - self.file_descriptor_count = MIN_FILE_DESCRIPTOR - - try: - if not self.file_descriptors[self.file_descriptor_count]: - return self.file_descriptor_count - except KeyError: - return self.file_descriptor_count - - def _path_to_node(self, path, access_only=False, directory_only=True): - logger.debug('path: %s', path) - logger.debug('directory_only: %s', directory_only) - - parts = path.split('/') - - logger.debug('parts: %s', parts) - - node = self.index.instance_root - - if len(parts) > 1 and parts[1] != '': - obj = self.cache.get(path) - - if obj: - node_pk = obj.get('node_pk') - if node_pk: - if access_only: - return True - else: - return IndexInstanceNode.objects.get(pk=node_pk) - - document_pk = obj.get('document_pk') - if document_pk: - if access_only: - return True - else: - return Document.objects.get(pk=document_pk) - - for count, part in enumerate(parts[1:]): - try: - node = node.children.get(value=part) - except IndexInstanceNode.DoesNotExist: - logger.debug('%s does not exists', part) - - if directory_only: - return None - else: - try: - if node.index_template_node.link_documents: - result = node.documents.get(label=part) - logger.debug( - 'path %s is a valid file path', path - ) - self.cache.set( - path, {'document_pk': result.pk}, - setting_document_lookup_cache_timeout.value - ) - - return result - else: - return None - except Document.DoesNotExist: - logger.debug( - 'path %s is a file, but is not found', path - ) - return None - except MultipleObjectsReturned: - return None - except 
MultipleObjectsReturned: - return None - - self.cache.set( - path, {'node_pk': node.pk}, - setting_node_lookup_cache_timeout.value - ) - - logger.debug('node: %s', node) - logger.debug('node is root: %s', node.is_root_node()) - - return node - - def __init__(self, index_slug): - self.file_descriptor_count = MIN_FILE_DESCRIPTOR - self.file_descriptors = {} - self.cache = caches['default'] - - try: - self.index = Index.objects.get(slug=index_slug) - except Index.DoesNotExist: - print('Unknown index slug: {}.'.format(index_slug)) - exit(1) - - def access(self, path, fh=None): - result = self._path_to_node( - path=path, access_only=True, directory_only=False - ) - - if not result: - raise FuseOSError(ENOENT) - - def getattr(self, path, fh=None): - logger.debug('path: %s, fh: %s', path, fh) - - now = time() - result = self._path_to_node(path=path, directory_only=False) - - if not result: - raise FuseOSError(ENOENT) - - if isinstance(result, IndexInstanceNode): - return { - 'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now, - 'st_mtime': now, 'st_atime': now, 'st_nlink': 2 - } - else: - return { - 'st_mode': (S_IFREG | FILE_MODE), - 'st_ctime': ( - result.date_added.replace(tzinfo=None) - result.date_added.utcoffset() - datetime.datetime(1970, 1, 1) - ).total_seconds(), - 'st_mtime': ( - result.latest_version.timestamp.replace(tzinfo=None) - result.latest_version.timestamp.utcoffset() - datetime.datetime(1970, 1, 1) - ).total_seconds(), - 'st_atime': now, - 'st_size': result.size - } - - def open(self, path, flags): - result = self._path_to_node(path=path, directory_only=False) - - if isinstance(result, Document): - next_file_descriptor = self._get_next_file_descriptor() - self.file_descriptors[next_file_descriptor] = result.open() - return next_file_descriptor - else: - raise FuseOSError(ENOENT) - - def release(self, path, fh): - self.file_descriptors[fh] = None - del(self.file_descriptors[fh]) - - def read(self, path, size, offset, fh): - 
self.file_descriptors[fh].seek(offset) - return self.file_descriptors[fh].read(size) - - def readdir(self, path, fh): - logger.debug('path: %s', path) - - node = self._path_to_node(path=path, directory_only=True) - - if not node: - raise FuseOSError(ENOENT) - - yield '.' - yield '..' - - # Nodes - queryset = node.get_children().values('value').exclude( - value__contains='/' - ) - - for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1): - queryset = queryset.exclude(label=duplicate['label']) - - for child_node in queryset.values_list('value', flat=True): - yield child_node - - # Documents - if node.index_template_node.link_documents: - queryset = node.documents.values('label').exclude( - label__contains='/' - ) - - for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1): - queryset = queryset.exclude(label=duplicate['label']) - - for document_label in queryset.values_list('label', flat=True): - yield document_label - - class Command(management.BaseCommand): help = 'Mount an index as a FUSE filesystem.' 
@@ -235,7 +37,7 @@ class Command(management.BaseCommand): try: FUSE( - operations=IndexFS(index_slug=options['slug']), + operations=IndexFilesystem(index_slug=options['slug']), mountpoint=options['mount_point'], nothreads=True, foreground=True, allow_other=options['allow_other'], allow_root=options['allow_root'] diff --git a/mayan/apps/mirroring/tests/__init__.py b/mayan/apps/mirroring/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mayan/apps/mirroring/tests/literals.py b/mayan/apps/mirroring/tests/literals.py new file mode 100644 index 0000000000..83d80d2359 --- /dev/null +++ b/mayan/apps/mirroring/tests/literals.py @@ -0,0 +1,4 @@ +from __future__ import absolute_import, unicode_literals + +TEST_NODE_EXPRESSION = 'level_1' +TEST_NODE_EXPRESSION_MULTILINE = 'first\r\nsecond\r\nthird' diff --git a/mayan/apps/mirroring/tests/test_classes.py b/mayan/apps/mirroring/tests/test_classes.py new file mode 100644 index 0000000000..79914f0893 --- /dev/null +++ b/mayan/apps/mirroring/tests/test_classes.py @@ -0,0 +1,116 @@ +from __future__ import absolute_import, unicode_literals + +import hashlib + +from fuse import FuseOSError + +from django.test import override_settings + +from common.tests import BaseTestCase +from documents.tests import DocumentTestMixin + +from document_indexing.tests import DocumentIndexingTestMixin + +from ..classes import IndexFilesystem + +from .literals import ( + TEST_NODE_EXPRESSION, TEST_NODE_EXPRESSION_MULTILINE +) + + +@override_settings(OCR_AUTO_OCR=False) +class IndexFSTestCase(DocumentIndexingTestMixin, DocumentTestMixin, BaseTestCase): + auto_upload_document = False + + def test_document_access(self): + self._create_index() + + self.index.node_templates.create( + parent=self.index.template_root, expression=TEST_NODE_EXPRESSION, + link_documents=True + ) + + document = self.upload_document() + index_filesystem = IndexFilesystem(index_slug=self.index.slug) + + self.assertEqual( + index_filesystem.access( + 
'/{}/{}'.format(TEST_NODE_EXPRESSION, document.label) + ), None + ) + + def test_document_access_failure(self): + self._create_index() + + self.index.node_templates.create( + parent=self.index.template_root, expression=TEST_NODE_EXPRESSION, + link_documents=True + ) + + document = self.upload_document() + index_filesystem = IndexFilesystem(index_slug=self.index.slug) + + with self.assertRaises(FuseOSError): + index_filesystem.access( + '/{}/{}_non_valid'.format(TEST_NODE_EXPRESSION, document.label) + ) + + def test_document_open(self): + self._create_index() + + self.index.node_templates.create( + parent=self.index.template_root, expression=TEST_NODE_EXPRESSION, + link_documents=True + ) + + document = self.upload_document() + index_filesystem = IndexFilesystem(index_slug=self.index.slug) + + file_handle = index_filesystem.open( + '/{}/{}'.format(TEST_NODE_EXPRESSION, document.label), 'rb' + ) + + self.assertEqual( + hashlib.sha256( + index_filesystem.read( + path=None, size=document.size, offset=0, fh=file_handle + ) + ).hexdigest(), + document.checksum + ) + + def test_multiline_indexes(self): + self._create_index() + + self.index.node_templates.create( + parent=self.index.template_root, + expression=TEST_NODE_EXPRESSION_MULTILINE, + link_documents=True + ) + + self.upload_document() + index_filesystem = IndexFilesystem(index_slug=self.index.slug) + + self.assertEqual( + list(index_filesystem.readdir('/', ''))[2:], + [TEST_NODE_EXPRESSION_MULTILINE.replace('\r\n', ' ')] + ) + + def test_duplicated_indexes(self): + self._create_index() + + self.index.node_templates.create( + parent=self.index.template_root, expression=TEST_NODE_EXPRESSION, + link_documents=True + ) + self.index.node_templates.create( + parent=self.index.template_root, expression=TEST_NODE_EXPRESSION, + link_documents=True + ) + + self.upload_document() + index_filesystem = IndexFilesystem(index_slug=self.index.slug) + + self.assertEqual( + list(index_filesystem.readdir('/', ''))[2:], [] + )