Split the code of the mountindex command to be able to add tests. Fix the way the children of IndexInstanceNode are accessed. Fixes GitLab issue #518. Thanks to TheOneValen @TheOneValen for the report. Remove newlines from the index name levels before using them as FUSE directories. Fixed duplicated FUSE directory removal.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-10-05 21:49:52 -04:00
parent 598312c4d3
commit ffbdcef3b4
7 changed files with 354 additions and 201 deletions

View File

@@ -1,3 +1,13 @@
3.1.5 (2018-10-XX)
==================
* Consolidate some document indexing test code into a new mixin.
* Split the code of the mountindex command to be able to add tests.
* Fix the way the children of IndexInstanceNode are accessed. Fixes
GitLab issue #518. Thanks to TheOneValen @TheOneValen for the report.
* Remove newlines from the index name levels before using them as FUSE
directories.
* Fixed duplicated FUSE directory removal.
3.1.4 (2018-10-4)
=================
* Fix the link to the documentation. Closes GitLab issue #516.

View File

@@ -5,5 +5,6 @@ from django.utils.translation import ugettext_lazy as _
class MirroringApp(apps.AppConfig):
    """Django application configuration for the index mirroring (FUSE) app."""
    # NOTE(review): presumably flags to the project's test runner that this
    # app ships its own test suite — confirm against the test discovery code.
    has_tests = True
    name = 'mirroring'
    verbose_name = _('Mirroring')

View File

@@ -0,0 +1,220 @@
from __future__ import print_function, unicode_literals
import datetime
from errno import ENOENT
import logging
from stat import S_IFDIR, S_IFREG
from time import time
from fuse import FuseOSError, Operations
from django.core.cache import caches
from django.core.exceptions import MultipleObjectsReturned
from django.db.models import Count, F, Func, Value
from document_indexing.models import Index, IndexInstanceNode
from documents.models import Document
from .literals import (
MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE
)
from .settings import (
setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout
)
logger = logging.getLogger(__name__)
class IndexFilesystem(Operations):
    """
    FUSE filesystem that mirrors a document index: index instance nodes are
    exposed as directories and, where the index template links documents,
    documents are exposed as read-only files.
    """

    @staticmethod
    def _clean_queryset(queryset):
        # Remove newline carriage return to make multiline indexes
        # valid directory names
        return queryset.annotate(
            clean_value=Func(
                F('value'), Value('\r\n'), Value(' '), function='replace'
            )
        )

    def _get_next_file_descriptor(self):
        """
        Return the next free file descriptor number, wrapping around from
        MAX_FILE_DESCRIPTOR back to MIN_FILE_DESCRIPTOR.
        """
        while(True):
            self.file_descriptor_count += 1
            if self.file_descriptor_count > MAX_FILE_DESCRIPTOR:
                self.file_descriptor_count = MIN_FILE_DESCRIPTOR

            try:
                # A slot holding None (a released descriptor) can be reused.
                if not self.file_descriptors[self.file_descriptor_count]:
                    return self.file_descriptor_count
            except KeyError:
                # Slot was never assigned; it is free.
                return self.file_descriptor_count

    def _path_to_node(self, path, access_only=False, directory_only=True):
        """
        Resolve a filesystem path to an IndexInstanceNode (directory) or a
        Document (file). Returns None when the path does not resolve. With
        access_only=True, cache hits return True instead of fetching the
        object. With directory_only=True, file paths resolve to None.
        """
        logger.debug('path: %s', path)
        logger.debug('directory_only: %s', directory_only)

        parts = path.split('/')

        logger.debug('parts: %s', parts)

        node = self.index.instance_root

        # Anything other than the root path needs to be resolved.
        if len(parts) > 1 and parts[1] != '':
            # Try the cache first to avoid repeated database tree walks.
            obj = self.cache.get(path)

            if obj:
                node_pk = obj.get('node_pk')
                if node_pk:
                    if access_only:
                        return True
                    else:
                        return IndexInstanceNode.objects.get(pk=node_pk)

                document_pk = obj.get('document_pk')
                if document_pk:
                    if access_only:
                        return True
                    else:
                        return Document.objects.get(pk=document_pk)

            # Walk down the index tree one path component at a time,
            # matching against the newline-cleaned node values.
            for count, part in enumerate(parts[1:]):
                try:
                    node = IndexFilesystem._clean_queryset(node.get_children()).get(clean_value=part)
                except IndexInstanceNode.DoesNotExist:
                    logger.debug('%s does not exists', part)

                    if directory_only:
                        return None
                    else:
                        # Component is not a child node; it may be a document
                        # linked to the current node.
                        try:
                            if node.index_template_node.link_documents:
                                result = node.documents.get(label=part)
                                logger.debug(
                                    'path %s is a valid file path', path
                                )
                                self.cache.set(
                                    path, {'document_pk': result.pk},
                                    setting_document_lookup_cache_timeout.value
                                )

                                return result
                            else:
                                return None
                        except Document.DoesNotExist:
                            logger.debug(
                                'path %s is a file, but is not found', path
                            )
                            return None
                        except MultipleObjectsReturned:
                            # Ambiguous document label; treat as not found.
                            return None
                except MultipleObjectsReturned:
                    # Ambiguous directory name; treat as not found.
                    return None

            self.cache.set(
                path, {'node_pk': node.pk},
                setting_node_lookup_cache_timeout.value
            )

        logger.debug('node: %s', node)
        logger.debug('node is root: %s', node.is_root_node())

        return node

    def __init__(self, index_slug):
        """
        Initialize the filesystem for the index identified by index_slug.
        Exits the process when the slug does not match any index.
        """
        self.file_descriptor_count = MIN_FILE_DESCRIPTOR
        self.file_descriptors = {}
        self.cache = caches['default']

        try:
            self.index = Index.objects.get(slug=index_slug)
        except Index.DoesNotExist:
            print('Unknown index slug: {}.'.format(index_slug))
            exit(1)

    def access(self, path, fh=None):
        # FUSE access(2) handler; raise ENOENT when the path cannot be
        # resolved to a node or document. Returns None on success.
        result = self._path_to_node(
            path=path, access_only=True, directory_only=False
        )

        if not result:
            raise FuseOSError(ENOENT)

    def getattr(self, path, fh=None):
        """
        FUSE stat handler. Directories report the current time; files report
        the document's add date (st_ctime) and latest version timestamp
        (st_mtime) converted to UTC epoch seconds.
        """
        logger.debug('path: %s, fh: %s', path, fh)

        now = time()
        result = self._path_to_node(path=path, directory_only=False)

        if not result:
            raise FuseOSError(ENOENT)

        if isinstance(result, IndexInstanceNode):
            return {
                'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now,
                'st_mtime': now, 'st_atime': now, 'st_nlink': 2
            }
        else:
            # Convert the aware datetimes to UTC epoch seconds by stripping
            # the tzinfo and subtracting the UTC offset manually.
            return {
                'st_mode': (S_IFREG | FILE_MODE),
                'st_ctime': (
                    result.date_added.replace(tzinfo=None) - result.date_added.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_mtime': (
                    result.latest_version.timestamp.replace(tzinfo=None) - result.latest_version.timestamp.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_atime': now,
                'st_size': result.size
            }

    def open(self, path, flags):
        # Open the resolved document's file object and register it under a
        # freshly allocated descriptor number. The flags argument is unused.
        result = self._path_to_node(path=path, directory_only=False)

        if isinstance(result, Document):
            next_file_descriptor = self._get_next_file_descriptor()
            self.file_descriptors[next_file_descriptor] = result.open()
            return next_file_descriptor
        else:
            raise FuseOSError(ENOENT)

    def read(self, path, size, offset, fh):
        # Delegate to the underlying file object held for descriptor fh.
        self.file_descriptors[fh].seek(offset)
        return self.file_descriptors[fh].read(size)

    def readdir(self, path, fh):
        """
        FUSE directory listing. Yields '.', '..', the cleaned values of the
        child index nodes, and — when the node links documents — the labels
        of the node's documents. Entries containing '/' and entries whose
        cleaned value or label collides with another entry are omitted.
        """
        logger.debug('path: %s', path)

        node = self._path_to_node(path=path, directory_only=True)

        if not node:
            raise FuseOSError(ENOENT)

        yield '.'
        yield '..'

        # Index instance nodes to directories
        queryset = IndexFilesystem._clean_queryset(node.get_children()).exclude(
            clean_value__contains='/'
        ).values('clean_value')

        # Find nodes with the same resulting value and remove them
        for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
            queryset = queryset.exclude(clean_value=duplicate['clean_value'])

        for value in queryset.values_list('clean_value', flat=True):
            yield value

        # Documents
        if node.index_template_node.link_documents:
            queryset = node.documents.values('label').exclude(
                label__contains='/'
            )

            # Find duplicated document and remove them
            for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
                queryset = queryset.exclude(label=duplicate['label'])

            for document_label in queryset.values_list('label', flat=True):
                yield document_label

    def release(self, path, fh):
        # Clear the slot (so _get_next_file_descriptor can reuse it) and
        # then delete it.
        self.file_descriptors[fh] = None
        del(self.file_descriptors[fh])

View File

@@ -1,215 +1,17 @@
from __future__ import print_function, unicode_literals
import datetime
from errno import ENOENT
import logging
from stat import S_IFDIR, S_IFREG
from time import time
from fuse import FUSE, FuseOSError, Operations
from fuse import FUSE
from django.core import management
from django.core.cache import caches
from django.core.exceptions import MultipleObjectsReturned
from django.core.management.base import CommandError
from django.db.models import Count
from document_indexing.models import Index, IndexInstanceNode
from documents.models import Document
from ...literals import (
MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE
)
from ...settings import (
setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout
)
from ...classes import IndexFilesystem
logger = logging.getLogger(__name__)
class IndexFS(Operations):
    """
    FUSE filesystem that mirrors a document index: index instance nodes are
    exposed as directories and, where the index template links documents,
    documents are exposed as read-only files.
    """

    def _get_next_file_descriptor(self):
        """
        Return the next free file descriptor number, wrapping around from
        MAX_FILE_DESCRIPTOR back to MIN_FILE_DESCRIPTOR.
        """
        while(True):
            self.file_descriptor_count += 1
            if self.file_descriptor_count > MAX_FILE_DESCRIPTOR:
                self.file_descriptor_count = MIN_FILE_DESCRIPTOR

            try:
                # A slot holding None (a released descriptor) can be reused.
                if not self.file_descriptors[self.file_descriptor_count]:
                    return self.file_descriptor_count
            except KeyError:
                # Slot was never assigned; it is free.
                return self.file_descriptor_count

    def _path_to_node(self, path, access_only=False, directory_only=True):
        """
        Resolve a filesystem path to an IndexInstanceNode (directory) or a
        Document (file). Returns None when the path does not resolve. With
        access_only=True, cache hits return True instead of fetching the
        object. With directory_only=True, file paths resolve to None.
        """
        logger.debug('path: %s', path)
        logger.debug('directory_only: %s', directory_only)

        parts = path.split('/')

        logger.debug('parts: %s', parts)

        node = self.index.instance_root

        # Anything other than the root path needs to be resolved.
        if len(parts) > 1 and parts[1] != '':
            # Try the cache first to avoid repeated database tree walks.
            obj = self.cache.get(path)

            if obj:
                node_pk = obj.get('node_pk')
                if node_pk:
                    if access_only:
                        return True
                    else:
                        return IndexInstanceNode.objects.get(pk=node_pk)

                document_pk = obj.get('document_pk')
                if document_pk:
                    if access_only:
                        return True
                    else:
                        return Document.objects.get(pk=document_pk)

            # Walk down the index tree one path component at a time.
            for count, part in enumerate(parts[1:]):
                try:
                    # Use get_children() to access child nodes; accessing
                    # the related manager as `.children` is not supported by
                    # IndexInstanceNode (GitLab issue #518).
                    node = node.get_children().get(value=part)
                except IndexInstanceNode.DoesNotExist:
                    logger.debug('%s does not exists', part)

                    if directory_only:
                        return None
                    else:
                        # Component is not a child node; it may be a document
                        # linked to the current node.
                        try:
                            if node.index_template_node.link_documents:
                                result = node.documents.get(label=part)
                                logger.debug(
                                    'path %s is a valid file path', path
                                )
                                self.cache.set(
                                    path, {'document_pk': result.pk},
                                    setting_document_lookup_cache_timeout.value
                                )

                                return result
                            else:
                                return None
                        except Document.DoesNotExist:
                            logger.debug(
                                'path %s is a file, but is not found', path
                            )
                            return None
                        except MultipleObjectsReturned:
                            # Ambiguous document label; treat as not found.
                            return None
                except MultipleObjectsReturned:
                    # Ambiguous directory name; treat as not found.
                    return None

            self.cache.set(
                path, {'node_pk': node.pk},
                setting_node_lookup_cache_timeout.value
            )

        logger.debug('node: %s', node)
        logger.debug('node is root: %s', node.is_root_node())

        return node

    def __init__(self, index_slug):
        """
        Initialize the filesystem for the index identified by index_slug.
        Exits the process when the slug does not match any index.
        """
        self.file_descriptor_count = MIN_FILE_DESCRIPTOR
        self.file_descriptors = {}
        self.cache = caches['default']

        try:
            self.index = Index.objects.get(slug=index_slug)
        except Index.DoesNotExist:
            print('Unknown index slug: {}.'.format(index_slug))
            exit(1)

    def access(self, path, fh=None):
        # FUSE access(2) handler; raise ENOENT when the path cannot be
        # resolved to a node or document. Returns None on success.
        result = self._path_to_node(
            path=path, access_only=True, directory_only=False
        )

        if not result:
            raise FuseOSError(ENOENT)

    def getattr(self, path, fh=None):
        """
        FUSE stat handler. Directories report the current time; files report
        the document's add date (st_ctime) and latest version timestamp
        (st_mtime) converted to UTC epoch seconds.
        """
        logger.debug('path: %s, fh: %s', path, fh)

        now = time()
        result = self._path_to_node(path=path, directory_only=False)

        if not result:
            raise FuseOSError(ENOENT)

        if isinstance(result, IndexInstanceNode):
            return {
                'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now,
                'st_mtime': now, 'st_atime': now, 'st_nlink': 2
            }
        else:
            # Convert the aware datetimes to UTC epoch seconds by stripping
            # the tzinfo and subtracting the UTC offset manually.
            return {
                'st_mode': (S_IFREG | FILE_MODE),
                'st_ctime': (
                    result.date_added.replace(tzinfo=None) - result.date_added.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_mtime': (
                    result.latest_version.timestamp.replace(tzinfo=None) - result.latest_version.timestamp.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_atime': now,
                'st_size': result.size
            }

    def open(self, path, flags):
        # Open the resolved document's file object and register it under a
        # freshly allocated descriptor number. The flags argument is unused.
        result = self._path_to_node(path=path, directory_only=False)

        if isinstance(result, Document):
            next_file_descriptor = self._get_next_file_descriptor()
            self.file_descriptors[next_file_descriptor] = result.open()
            return next_file_descriptor
        else:
            raise FuseOSError(ENOENT)

    def release(self, path, fh):
        # Clear the slot (so _get_next_file_descriptor can reuse it) and
        # then delete it.
        self.file_descriptors[fh] = None
        del(self.file_descriptors[fh])

    def read(self, path, size, offset, fh):
        # Delegate to the underlying file object held for descriptor fh.
        self.file_descriptors[fh].seek(offset)
        return self.file_descriptors[fh].read(size)

    def readdir(self, path, fh):
        """
        FUSE directory listing. Yields '.', '..', the values of the child
        index nodes, and — when the node links documents — the labels of
        the node's documents. Entries containing '/' and entries whose
        value or label collides with another entry are omitted.
        """
        logger.debug('path: %s', path)

        node = self._path_to_node(path=path, directory_only=True)

        if not node:
            raise FuseOSError(ENOENT)

        yield '.'
        yield '..'

        # Nodes
        queryset = node.get_children().values('value').exclude(
            value__contains='/'
        )

        # Exclude duplicated node names. The queryset rows only carry the
        # 'value' key and the model field is `value`, so the lookup must use
        # 'value' — the previous `label` lookup raised here whenever a
        # duplicate actually existed.
        for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
            queryset = queryset.exclude(value=duplicate['value'])

        for child_node in queryset.values_list('value', flat=True):
            yield child_node

        # Documents
        if node.index_template_node.link_documents:
            queryset = node.documents.values('label').exclude(
                label__contains='/'
            )

            # Exclude duplicated document labels.
            for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
                queryset = queryset.exclude(label=duplicate['label'])

            for document_label in queryset.values_list('label', flat=True):
                yield document_label
class Command(management.BaseCommand):
help = 'Mount an index as a FUSE filesystem.'
@@ -235,7 +37,7 @@ class Command(management.BaseCommand):
try:
FUSE(
operations=IndexFS(index_slug=options['slug']),
operations=IndexFilesystem(index_slug=options['slug']),
mountpoint=options['mount_point'], nothreads=True, foreground=True,
allow_other=options['allow_other'],
allow_root=options['allow_root']

View File

View File

@@ -0,0 +1,4 @@
from __future__ import absolute_import, unicode_literals

# Index node template expression producing a single, plain directory level.
TEST_NODE_EXPRESSION = 'level_1'

# Expression containing carriage-return/newline pairs; used to verify that
# newlines are replaced before the value is used as a FUSE directory name.
TEST_NODE_EXPRESSION_MULTILINE = 'first\r\nsecond\r\nthird'

View File

@@ -0,0 +1,116 @@
from __future__ import absolute_import, unicode_literals
import hashlib
from fuse import FuseOSError
from django.test import override_settings
from common.tests import BaseTestCase
from documents.tests import DocumentTestMixin
from document_indexing.tests import DocumentIndexingTestMixin
from ..classes import IndexFilesystem
from .literals import (
TEST_NODE_EXPRESSION, TEST_NODE_EXPRESSION_MULTILINE
)
@override_settings(OCR_AUTO_OCR=False)
class IndexFSTestCase(DocumentIndexingTestMixin, DocumentTestMixin, BaseTestCase):
    """
    Integration tests for IndexFilesystem backed by a real document index.
    """
    # Documents are uploaded explicitly by each test, after the index node
    # template is in place.
    auto_upload_document = False

    def _create_index_template_node(self, expression):
        # Attach a document-linking node template with the given expression
        # to the index template root; shared setup for every test here.
        self.index.node_templates.create(
            parent=self.index.template_root, expression=expression,
            link_documents=True
        )

    def test_document_access(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # access() returns None on success and raises FuseOSError otherwise.
        self.assertEqual(
            index_filesystem.access(
                '/{}/{}'.format(TEST_NODE_EXPRESSION, document.label)
            ), None
        )

    def test_document_access_failure(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        with self.assertRaises(FuseOSError):
            index_filesystem.access(
                '/{}/{}_non_valid'.format(TEST_NODE_EXPRESSION, document.label)
            )

    def test_document_open(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        file_handle = index_filesystem.open(
            '/{}/{}'.format(TEST_NODE_EXPRESSION, document.label), 'rb'
        )

        # Reading through the filesystem must yield the exact stored
        # content; compare via the document's recorded checksum.
        self.assertEqual(
            hashlib.sha256(
                index_filesystem.read(
                    path=None, size=document.size, offset=0, fh=file_handle
                )
            ).hexdigest(),
            document.checksum
        )

    def test_multiline_indexes(self):
        self._create_index()
        self._create_index_template_node(
            expression=TEST_NODE_EXPRESSION_MULTILINE
        )
        self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # Skip the '.' and '..' entries; the CRLF pairs must have been
        # collapsed into spaces in the directory name.
        self.assertEqual(
            list(index_filesystem.readdir('/', ''))[2:],
            [TEST_NODE_EXPRESSION_MULTILINE.replace('\r\n', ' ')]
        )

    def test_duplicated_indexes(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # Duplicate directory names are suppressed entirely, leaving only
        # the '.' and '..' entries.
        self.assertEqual(
            list(index_filesystem.readdir('/', ''))[2:], []
        )