Split the code of the mountindex command to be able to add tests. Fix the way the children of IndexInstanceNode are accessed. Fixes GitLab issue #518. Thanks to TheOneValen @TheOneValen for the report. Remove newlines from the index name levels before using them as FUSE directories. Fixed duplicated FUSE directory removal.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-10-05 21:49:52 -04:00
parent 598312c4d3
commit ffbdcef3b4
7 changed files with 354 additions and 201 deletions

View File

@@ -1,3 +1,13 @@
3.1.5 (2018-10-XX)
==================
* Consolidate some document indexing test code into a new mixin.
* Split the code of the mountindex command to be able to add tests.
* Fix the way the children of IndexInstanceNode are accessed. Fixes
GitLab issue #518. Thanks to TheOneValen @TheOneValen for the report.
* Remove newlines from the index name levels before using them as FUSE
directories.
* Fixed duplicated FUSE directory removal.
3.1.4 (2018-10-4)
=================
* Fix the link to the documentation. Closes GitLab issue #516.

View File

@@ -5,5 +5,6 @@ from django.utils.translation import ugettext_lazy as _
class MirroringApp(apps.AppConfig):
    """Django application configuration for the index mirroring (FUSE) app."""
    # NOTE(review): presumably flags to the project's test runner that this
    # app ships its own test suite — confirm against the test discovery code.
    has_tests = True
    name = 'mirroring'
    verbose_name = _('Mirroring')

View File

@@ -0,0 +1,220 @@
from __future__ import print_function, unicode_literals
import datetime
from errno import ENOENT
import logging
from stat import S_IFDIR, S_IFREG
from time import time
from fuse import FuseOSError, Operations
from django.core.cache import caches
from django.core.exceptions import MultipleObjectsReturned
from django.db.models import Count, F, Func, Value
from document_indexing.models import Index, IndexInstanceNode
from documents.models import Document
from .literals import (
MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE
)
from .settings import (
setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout
)
logger = logging.getLogger(__name__)
class IndexFilesystem(Operations):
    """
    FUSE filesystem that mirrors a document index: index instance nodes are
    exposed as directories and, where the index template links documents,
    documents are exposed as read-only files.
    """

    @staticmethod
    def _clean_queryset(queryset):
        # Remove newline carriage return to make multiline indexes
        # valid directory names
        return queryset.annotate(
            clean_value=Func(
                F('value'), Value('\r\n'), Value(' '), function='replace'
            )
        )

    def _get_next_file_descriptor(self):
        """
        Return the next free file descriptor number, wrapping around from
        MAX_FILE_DESCRIPTOR back to MIN_FILE_DESCRIPTOR.
        """
        while(True):
            self.file_descriptor_count += 1
            if self.file_descriptor_count > MAX_FILE_DESCRIPTOR:
                self.file_descriptor_count = MIN_FILE_DESCRIPTOR

            try:
                # A slot holding None (a released descriptor) can be reused.
                if not self.file_descriptors[self.file_descriptor_count]:
                    return self.file_descriptor_count
            except KeyError:
                # Slot was never assigned; it is free.
                return self.file_descriptor_count

    def _path_to_node(self, path, access_only=False, directory_only=True):
        """
        Resolve a filesystem path to an IndexInstanceNode (directory) or a
        Document (file). Returns None when the path does not resolve. With
        access_only=True, cache hits return True instead of fetching the
        object. With directory_only=True, file paths resolve to None.
        """
        logger.debug('path: %s', path)
        logger.debug('directory_only: %s', directory_only)

        parts = path.split('/')

        logger.debug('parts: %s', parts)

        node = self.index.instance_root

        # Anything other than the root path needs to be resolved.
        if len(parts) > 1 and parts[1] != '':
            # Try the cache first to avoid repeated database tree walks.
            obj = self.cache.get(path)

            if obj:
                node_pk = obj.get('node_pk')
                if node_pk:
                    if access_only:
                        return True
                    else:
                        return IndexInstanceNode.objects.get(pk=node_pk)

                document_pk = obj.get('document_pk')
                if document_pk:
                    if access_only:
                        return True
                    else:
                        return Document.objects.get(pk=document_pk)

            # Walk down the index tree one path component at a time,
            # matching against the newline-cleaned node values.
            for count, part in enumerate(parts[1:]):
                try:
                    node = IndexFilesystem._clean_queryset(node.get_children()).get(clean_value=part)
                except IndexInstanceNode.DoesNotExist:
                    logger.debug('%s does not exists', part)

                    if directory_only:
                        return None
                    else:
                        # Component is not a child node; it may be a document
                        # linked to the current node.
                        try:
                            if node.index_template_node.link_documents:
                                result = node.documents.get(label=part)
                                logger.debug(
                                    'path %s is a valid file path', path
                                )
                                self.cache.set(
                                    path, {'document_pk': result.pk},
                                    setting_document_lookup_cache_timeout.value
                                )

                                return result
                            else:
                                return None
                        except Document.DoesNotExist:
                            logger.debug(
                                'path %s is a file, but is not found', path
                            )
                            return None
                        except MultipleObjectsReturned:
                            # Ambiguous document label; treat as not found.
                            return None
                except MultipleObjectsReturned:
                    # Ambiguous directory name; treat as not found.
                    return None

            self.cache.set(
                path, {'node_pk': node.pk},
                setting_node_lookup_cache_timeout.value
            )

        logger.debug('node: %s', node)
        logger.debug('node is root: %s', node.is_root_node())

        return node

    def __init__(self, index_slug):
        """
        Initialize the filesystem for the index identified by index_slug.
        Exits the process when the slug does not match any index.
        """
        self.file_descriptor_count = MIN_FILE_DESCRIPTOR
        self.file_descriptors = {}
        self.cache = caches['default']

        try:
            self.index = Index.objects.get(slug=index_slug)
        except Index.DoesNotExist:
            print('Unknown index slug: {}.'.format(index_slug))
            exit(1)

    def access(self, path, fh=None):
        # FUSE access(2) handler; raise ENOENT when the path cannot be
        # resolved to a node or document. Returns None on success.
        result = self._path_to_node(
            path=path, access_only=True, directory_only=False
        )

        if not result:
            raise FuseOSError(ENOENT)

    def getattr(self, path, fh=None):
        """
        FUSE stat handler. Directories report the current time; files report
        the document's add date (st_ctime) and latest version timestamp
        (st_mtime) converted to UTC epoch seconds.
        """
        logger.debug('path: %s, fh: %s', path, fh)

        now = time()
        result = self._path_to_node(path=path, directory_only=False)

        if not result:
            raise FuseOSError(ENOENT)

        if isinstance(result, IndexInstanceNode):
            return {
                'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now,
                'st_mtime': now, 'st_atime': now, 'st_nlink': 2
            }
        else:
            # Convert the aware datetimes to UTC epoch seconds by stripping
            # the tzinfo and subtracting the UTC offset manually.
            return {
                'st_mode': (S_IFREG | FILE_MODE),
                'st_ctime': (
                    result.date_added.replace(tzinfo=None) - result.date_added.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_mtime': (
                    result.latest_version.timestamp.replace(tzinfo=None) - result.latest_version.timestamp.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_atime': now,
                'st_size': result.size
            }

    def open(self, path, flags):
        # Open the resolved document's file object and register it under a
        # freshly allocated descriptor number. The flags argument is unused.
        result = self._path_to_node(path=path, directory_only=False)

        if isinstance(result, Document):
            next_file_descriptor = self._get_next_file_descriptor()
            self.file_descriptors[next_file_descriptor] = result.open()
            return next_file_descriptor
        else:
            raise FuseOSError(ENOENT)

    def read(self, path, size, offset, fh):
        # Delegate to the underlying file object held for descriptor fh.
        self.file_descriptors[fh].seek(offset)
        return self.file_descriptors[fh].read(size)

    def readdir(self, path, fh):
        """
        FUSE directory listing. Yields '.', '..', the cleaned values of the
        child index nodes, and — when the node links documents — the labels
        of the node's documents. Entries containing '/' and entries whose
        cleaned value or label collides with another entry are omitted.
        """
        logger.debug('path: %s', path)

        node = self._path_to_node(path=path, directory_only=True)

        if not node:
            raise FuseOSError(ENOENT)

        yield '.'
        yield '..'

        # Index instance nodes to directories
        queryset = IndexFilesystem._clean_queryset(node.get_children()).exclude(
            clean_value__contains='/'
        ).values('clean_value')

        # Find nodes with the same resulting value and remove them
        for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
            queryset = queryset.exclude(clean_value=duplicate['clean_value'])

        for value in queryset.values_list('clean_value', flat=True):
            yield value

        # Documents
        if node.index_template_node.link_documents:
            queryset = node.documents.values('label').exclude(
                label__contains='/'
            )

            # Find duplicated document and remove them
            for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
                queryset = queryset.exclude(label=duplicate['label'])

            for document_label in queryset.values_list('label', flat=True):
                yield document_label

    def release(self, path, fh):
        # Clear the slot (so _get_next_file_descriptor can reuse it) and
        # then delete it.
        self.file_descriptors[fh] = None
        del(self.file_descriptors[fh])

View File

@@ -1,215 +1,17 @@
from __future__ import print_function, unicode_literals
import datetime
from errno import ENOENT
import logging
from stat import S_IFDIR, S_IFREG
from time import time
from fuse import FUSE, FuseOSError, Operations
from fuse import FUSE
from django.core import management
from django.core.cache import caches
from django.core.exceptions import MultipleObjectsReturned
from django.core.management.base import CommandError
from django.db.models import Count
from document_indexing.models import Index, IndexInstanceNode
from documents.models import Document
from ...literals import (
MAX_FILE_DESCRIPTOR, MIN_FILE_DESCRIPTOR, FILE_MODE, DIRECTORY_MODE
)
from ...settings import (
setting_document_lookup_cache_timeout, setting_node_lookup_cache_timeout
)
from ...classes import IndexFilesystem
logger = logging.getLogger(__name__)
class IndexFS(Operations):
    """
    FUSE filesystem that mirrors a document index: index instance nodes are
    exposed as directories and, where the index template links documents,
    documents are exposed as read-only files.
    """

    def _get_next_file_descriptor(self):
        """
        Return the next free file descriptor number, wrapping around from
        MAX_FILE_DESCRIPTOR back to MIN_FILE_DESCRIPTOR.
        """
        while(True):
            self.file_descriptor_count += 1
            if self.file_descriptor_count > MAX_FILE_DESCRIPTOR:
                self.file_descriptor_count = MIN_FILE_DESCRIPTOR

            try:
                # A slot holding None (a released descriptor) can be reused.
                if not self.file_descriptors[self.file_descriptor_count]:
                    return self.file_descriptor_count
            except KeyError:
                # Slot was never assigned; it is free.
                return self.file_descriptor_count

    def _path_to_node(self, path, access_only=False, directory_only=True):
        """
        Resolve a filesystem path to an IndexInstanceNode (directory) or a
        Document (file). Returns None when the path does not resolve. With
        access_only=True, cache hits return True instead of fetching the
        object. With directory_only=True, file paths resolve to None.
        """
        logger.debug('path: %s', path)
        logger.debug('directory_only: %s', directory_only)

        parts = path.split('/')

        logger.debug('parts: %s', parts)

        node = self.index.instance_root

        # Anything other than the root path needs to be resolved.
        if len(parts) > 1 and parts[1] != '':
            # Try the cache first to avoid repeated database tree walks.
            obj = self.cache.get(path)

            if obj:
                node_pk = obj.get('node_pk')
                if node_pk:
                    if access_only:
                        return True
                    else:
                        return IndexInstanceNode.objects.get(pk=node_pk)

                document_pk = obj.get('document_pk')
                if document_pk:
                    if access_only:
                        return True
                    else:
                        return Document.objects.get(pk=document_pk)

            # Walk down the index tree one path component at a time.
            for count, part in enumerate(parts[1:]):
                try:
                    # Use get_children() to access child nodes; accessing
                    # the related manager as `.children` is not supported by
                    # IndexInstanceNode (GitLab issue #518).
                    node = node.get_children().get(value=part)
                except IndexInstanceNode.DoesNotExist:
                    logger.debug('%s does not exists', part)

                    if directory_only:
                        return None
                    else:
                        # Component is not a child node; it may be a document
                        # linked to the current node.
                        try:
                            if node.index_template_node.link_documents:
                                result = node.documents.get(label=part)
                                logger.debug(
                                    'path %s is a valid file path', path
                                )
                                self.cache.set(
                                    path, {'document_pk': result.pk},
                                    setting_document_lookup_cache_timeout.value
                                )

                                return result
                            else:
                                return None
                        except Document.DoesNotExist:
                            logger.debug(
                                'path %s is a file, but is not found', path
                            )
                            return None
                        except MultipleObjectsReturned:
                            # Ambiguous document label; treat as not found.
                            return None
                except MultipleObjectsReturned:
                    # Ambiguous directory name; treat as not found.
                    return None

            self.cache.set(
                path, {'node_pk': node.pk},
                setting_node_lookup_cache_timeout.value
            )

        logger.debug('node: %s', node)
        logger.debug('node is root: %s', node.is_root_node())

        return node

    def __init__(self, index_slug):
        """
        Initialize the filesystem for the index identified by index_slug.
        Exits the process when the slug does not match any index.
        """
        self.file_descriptor_count = MIN_FILE_DESCRIPTOR
        self.file_descriptors = {}
        self.cache = caches['default']

        try:
            self.index = Index.objects.get(slug=index_slug)
        except Index.DoesNotExist:
            print('Unknown index slug: {}.'.format(index_slug))
            exit(1)

    def access(self, path, fh=None):
        # FUSE access(2) handler; raise ENOENT when the path cannot be
        # resolved to a node or document. Returns None on success.
        result = self._path_to_node(
            path=path, access_only=True, directory_only=False
        )

        if not result:
            raise FuseOSError(ENOENT)

    def getattr(self, path, fh=None):
        """
        FUSE stat handler. Directories report the current time; files report
        the document's add date (st_ctime) and latest version timestamp
        (st_mtime) converted to UTC epoch seconds.
        """
        logger.debug('path: %s, fh: %s', path, fh)

        now = time()
        result = self._path_to_node(path=path, directory_only=False)

        if not result:
            raise FuseOSError(ENOENT)

        if isinstance(result, IndexInstanceNode):
            return {
                'st_mode': (S_IFDIR | DIRECTORY_MODE), 'st_ctime': now,
                'st_mtime': now, 'st_atime': now, 'st_nlink': 2
            }
        else:
            # Convert the aware datetimes to UTC epoch seconds by stripping
            # the tzinfo and subtracting the UTC offset manually.
            return {
                'st_mode': (S_IFREG | FILE_MODE),
                'st_ctime': (
                    result.date_added.replace(tzinfo=None) - result.date_added.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_mtime': (
                    result.latest_version.timestamp.replace(tzinfo=None) - result.latest_version.timestamp.utcoffset() - datetime.datetime(1970, 1, 1)
                ).total_seconds(),
                'st_atime': now,
                'st_size': result.size
            }

    def open(self, path, flags):
        # Open the resolved document's file object and register it under a
        # freshly allocated descriptor number. The flags argument is unused.
        result = self._path_to_node(path=path, directory_only=False)

        if isinstance(result, Document):
            next_file_descriptor = self._get_next_file_descriptor()
            self.file_descriptors[next_file_descriptor] = result.open()
            return next_file_descriptor
        else:
            raise FuseOSError(ENOENT)

    def release(self, path, fh):
        # Clear the slot (so _get_next_file_descriptor can reuse it) and
        # then delete it.
        self.file_descriptors[fh] = None
        del(self.file_descriptors[fh])

    def read(self, path, size, offset, fh):
        # Delegate to the underlying file object held for descriptor fh.
        self.file_descriptors[fh].seek(offset)
        return self.file_descriptors[fh].read(size)

    def readdir(self, path, fh):
        """
        FUSE directory listing. Yields '.', '..', the values of the child
        index nodes, and — when the node links documents — the labels of
        the node's documents. Entries containing '/' and entries whose
        value or label collides with another entry are omitted.
        """
        logger.debug('path: %s', path)

        node = self._path_to_node(path=path, directory_only=True)

        if not node:
            raise FuseOSError(ENOENT)

        yield '.'
        yield '..'

        # Nodes
        queryset = node.get_children().values('value').exclude(
            value__contains='/'
        )

        # Exclude duplicated node names. The queryset rows only carry the
        # 'value' key and the model field is `value`, so the lookup must use
        # 'value' — the previous `label` lookup raised here whenever a
        # duplicate actually existed.
        for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
            queryset = queryset.exclude(value=duplicate['value'])

        for child_node in queryset.values_list('value', flat=True):
            yield child_node

        # Documents
        if node.index_template_node.link_documents:
            queryset = node.documents.values('label').exclude(
                label__contains='/'
            )

            # Exclude duplicated document labels.
            for duplicate in queryset.order_by().annotate(count_id=Count('id')).filter(count_id__gt=1):
                queryset = queryset.exclude(label=duplicate['label'])

            for document_label in queryset.values_list('label', flat=True):
                yield document_label
class Command(management.BaseCommand):
help = 'Mount an index as a FUSE filesystem.'
@@ -235,7 +37,7 @@ class Command(management.BaseCommand):
try:
FUSE(
operations=IndexFS(index_slug=options['slug']),
operations=IndexFilesystem(index_slug=options['slug']),
mountpoint=options['mount_point'], nothreads=True, foreground=True,
allow_other=options['allow_other'],
allow_root=options['allow_root']

View File

View File

@@ -0,0 +1,4 @@
from __future__ import absolute_import, unicode_literals

# Index node template expression producing a single, plain directory level.
TEST_NODE_EXPRESSION = 'level_1'

# Expression containing carriage-return/newline pairs; used to verify that
# newlines are replaced before the value is used as a FUSE directory name.
TEST_NODE_EXPRESSION_MULTILINE = 'first\r\nsecond\r\nthird'

View File

@@ -0,0 +1,116 @@
from __future__ import absolute_import, unicode_literals
import hashlib
from fuse import FuseOSError
from django.test import override_settings
from common.tests import BaseTestCase
from documents.tests import DocumentTestMixin
from document_indexing.tests import DocumentIndexingTestMixin
from ..classes import IndexFilesystem
from .literals import (
TEST_NODE_EXPRESSION, TEST_NODE_EXPRESSION_MULTILINE
)
@override_settings(OCR_AUTO_OCR=False)
class IndexFSTestCase(DocumentIndexingTestMixin, DocumentTestMixin, BaseTestCase):
    """
    Integration tests for IndexFilesystem backed by a real document index.
    """
    # Documents are uploaded explicitly by each test, after the index node
    # template is in place.
    auto_upload_document = False

    def _create_index_template_node(self, expression):
        # Attach a document-linking node template with the given expression
        # to the index template root; shared setup for every test here.
        self.index.node_templates.create(
            parent=self.index.template_root, expression=expression,
            link_documents=True
        )

    def test_document_access(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # access() returns None on success and raises FuseOSError otherwise.
        self.assertEqual(
            index_filesystem.access(
                '/{}/{}'.format(TEST_NODE_EXPRESSION, document.label)
            ), None
        )

    def test_document_access_failure(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        with self.assertRaises(FuseOSError):
            index_filesystem.access(
                '/{}/{}_non_valid'.format(TEST_NODE_EXPRESSION, document.label)
            )

    def test_document_open(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        document = self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        file_handle = index_filesystem.open(
            '/{}/{}'.format(TEST_NODE_EXPRESSION, document.label), 'rb'
        )

        # Reading through the filesystem must yield the exact stored
        # content; compare via the document's recorded checksum.
        self.assertEqual(
            hashlib.sha256(
                index_filesystem.read(
                    path=None, size=document.size, offset=0, fh=file_handle
                )
            ).hexdigest(),
            document.checksum
        )

    def test_multiline_indexes(self):
        self._create_index()
        self._create_index_template_node(
            expression=TEST_NODE_EXPRESSION_MULTILINE
        )
        self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # Skip the '.' and '..' entries; the CRLF pairs must have been
        # collapsed into spaces in the directory name.
        self.assertEqual(
            list(index_filesystem.readdir('/', ''))[2:],
            [TEST_NODE_EXPRESSION_MULTILINE.replace('\r\n', ' ')]
        )

    def test_duplicated_indexes(self):
        self._create_index()
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        self._create_index_template_node(expression=TEST_NODE_EXPRESSION)
        self.upload_document()

        index_filesystem = IndexFilesystem(index_slug=self.index.slug)

        # Duplicate directory names are suppressed entirely, leaving only
        # the '.' and '..' entries.
        self.assertEqual(
            list(index_filesystem.readdir('/', ''))[2:], []
        )