Files
mayan-edms/mayan/apps/sources/models.py
2018-09-05 00:40:10 -04:00

800 lines
29 KiB
Python

from __future__ import unicode_literals
import imaplib
import json
import logging
import os
import poplib
import subprocess
import yaml
from django.core.exceptions import ValidationError
from django.core.files import File
from django.core.files.base import ContentFile
from django.db import models, transaction
from django.utils.encoding import (
force_str, force_text, python_2_unicode_compatible
)
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
from model_utils.managers import InheritanceManager
from common.compressed_files import Archive
from common.exceptions import NoMIMETypeMatch
from common.utils import TemporaryFile
from converter.models import Transformation
from djcelery.models import PeriodicTask, IntervalSchedule
from documents.models import Document, DocumentType
from documents.settings import setting_language
from metadata.api import set_bulk_metadata
from metadata.models import MetadataType
from .classes import PseudoFile, SourceUploadedFile, StagingFile
from .exceptions import SourceException
from .literals import (
DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX,
DEFAULT_METADATA_ATTACHMENT_NAME, SCANNER_ADF_MODE_CHOICES,
SCANNER_ADF_MODE_SIMPLEX, SCANNER_MODE_COLOR, SCANNER_MODE_CHOICES,
SCANNER_SOURCE_CHOICES, SCANNER_SOURCE_FLATBED,
SOURCE_CHOICES, SOURCE_CHOICE_STAGING, SOURCE_CHOICE_WATCH,
SOURCE_CHOICE_WEB_FORM, SOURCE_INTERACTIVE_UNCOMPRESS_CHOICES,
SOURCE_UNCOMPRESS_CHOICES, SOURCE_UNCOMPRESS_CHOICE_N,
SOURCE_UNCOMPRESS_CHOICE_Y, SOURCE_CHOICE_EMAIL_IMAP,
SOURCE_CHOICE_EMAIL_POP3, SOURCE_CHOICE_SANE_SCANNER,
)
from .settings import setting_scanimage_path
from .wizards import WizardStep
logger = logging.getLogger(__name__)
@python_2_unicode_compatible
class Source(models.Model):
    """
    Base model shared by every document source.

    Concrete subclasses are expected to provide ``source_type`` (read by
    ``class_fullname``) and, for ``handle_upload`` calls that omit
    ``document_type``, a ``document_type`` attribute; neither is declared on
    this class.
    """
    label = models.CharField(
        db_index=True, max_length=64, unique=True, verbose_name=_('Label')
    )
    enabled = models.BooleanField(default=True, verbose_name=_('Enabled'))

    objects = InheritanceManager()

    class Meta:
        ordering = ('label',)
        verbose_name = _('Source')
        verbose_name_plural = _('Sources')

    def __str__(self):
        return '%s' % self.label

    @classmethod
    def class_fullname(cls):
        """
        Return the SOURCE_CHOICES text that matches this class's
        ``source_type``.
        """
        return force_text(dict(SOURCE_CHOICES).get(cls.source_type))

    def clean_up_upload_file(self, upload_file_object):
        """
        Hook called with the uploaded file object; does nothing by default.
        """
        pass
        # TODO: Should raise NotImplementedError?

    def fullname(self):
        """
        Return the class full name followed by the quoted source label.
        """
        return ' '.join([self.class_fullname(), '"%s"' % self.label])

    def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None, user=None):
        """
        Handle an upload request from a file object which may be an individual
        document or a compressed file containing multiple documents.

        When ``expand`` is true the file is first opened as an archive and
        every member is uploaded as its own document, labelled with the
        member name. If the file is not a recognized archive
        (NoMIMETypeMatch) it is uploaded as a single document instead.

        Returns the list of documents that were created.
        """
        documents = []
        if not document_type:
            document_type = self.document_type

        kwargs = {
            'description': description, 'document_type': document_type,
            'label': label, 'language': language, 'user': user
        }

        if expand:
            try:
                compressed_file = Archive.open(file_object=file_object)
                for compressed_file_child in compressed_file.members():
                    # Use a distinct name for the member file so the
                    # ``file_object`` argument is still the original upload
                    # if the fallback below has to run.
                    with compressed_file.open_member(filename=compressed_file_child) as member_file_object:
                        kwargs.update(
                            {'label': force_text(compressed_file_child)}
                        )
                        documents.append(
                            self.upload_document(
                                file_object=member_file_object, **kwargs
                            )
                        )
            except NoMIMETypeMatch:
                # Not an archive: log through the module logger and upload
                # the file as a regular document.
                logger.debug('Exception: NoMIMETypeMatch')
                documents.append(
                    self.upload_document(file_object=file_object, **kwargs)
                )
        else:
            documents.append(
                self.upload_document(file_object=file_object, **kwargs)
            )

        # Return a list of newly created documents. Used by the email source
        # to assign the from and subject metadata values.
        return documents

    def get_upload_file_object(self, form_data):
        """
        Hook that receives the submitted form data; does nothing by default.
        """
        pass
        # TODO: Should raise NotImplementedError?

    def upload_document(self, file_object, document_type, description=None, label=None, language=None, querystring=None, user=None):
        """
        Upload an individual document

        The document row is created inside a transaction. The first version
        is then created from ``file_object``; if that fails the new document
        is deleted (bypassing the trash) and the exception is re-raised.
        Returns the created document.
        """
        try:
            with transaction.atomic():
                document = Document(
                    description=description or '', document_type=document_type,
                    label=label or file_object.name,
                    language=language or setting_language.value
                )
                document.save(_user=user)
        except Exception as exception:
            logger.critical(
                'Unexpected exception while trying to create new document '
                '"%s" from source "%s"; %s',
                label or file_object.name, self, exception
            )
            raise
        else:
            try:
                document_version = document.new_version(
                    file_object=file_object, _user=user,
                )

                if user:
                    document.add_as_recent_document_for_user(user)

                # Copy this source's transformations to the new pages.
                Transformation.objects.copy(
                    source=self, targets=document_version.pages.all()
                )
            except Exception as exception:
                logger.critical(
                    'Unexpected exception while trying to create version for '
                    'new document "%s" from source "%s"; %s',
                    label or file_object.name, self, exception, exc_info=True
                )
                # Do not leave a document without a version behind.
                document.delete(to_trash=False)
                raise
            else:
                WizardStep.post_upload_process(
                    document=document, querystring=querystring
                )
                return document
class InteractiveSource(Source):
    """
    Intermediate model for the interactive sources defined in this module
    (scanner, staging folder and web form).
    """
    objects = InheritanceManager()

    class Meta:
        verbose_name_plural = _('Interactive sources')
        verbose_name = _('Interactive source')
class SaneScanner(InteractiveSource):
    """
    Interactive source that obtains a document by running the executable
    configured in ``setting_scanimage_path`` and capturing its standard
    output as a TIFF file.
    """
    can_compress = False
    is_interactive = True
    source_type = SOURCE_CHOICE_SANE_SCANNER

    device_name = models.CharField(
        max_length=255,
        help_text=_('Device name as returned by the SANE backend.'),
        verbose_name=_('Device name')
    )
    mode = models.CharField(
        blank=True, choices=SCANNER_MODE_CHOICES, default=SCANNER_MODE_COLOR,
        help_text=_(
            'Selects the scan mode (e.g., lineart, monochrome, or color). '
            'If this option is not supported by your scanner, leave it blank.'
        ), max_length=16, verbose_name=_('Mode')
    )
    resolution = models.PositiveIntegerField(
        blank=True, null=True, help_text=_(
            'Sets the resolution of the scanned image in DPI (dots per inch). '
            'Typical value is 200. If this option is not supported by your '
            'scanner, leave it blank.'
        ), verbose_name=_('Resolution')
    )
    source = models.CharField(
        blank=True, choices=SCANNER_SOURCE_CHOICES, help_text=_(
            'Selects the scan source (such as a document-feeder). If this '
            'option is not supported by your scanner, leave it blank.'
        ), max_length=32, null=True, verbose_name=_('Paper source')
    )
    adf_mode = models.CharField(
        blank=True, choices=SCANNER_ADF_MODE_CHOICES,
        help_text=_(
            'Selects the document feeder mode (simplex/duplex). If this '
            'option is not supported by your scanner, leave it blank.'
        ), max_length=16, verbose_name=_('ADF mode')
    )

    objects = models.Manager()

    class Meta:
        verbose_name = _('SANE Scanner')
        verbose_name_plural = _('SANE Scanners')

    def clean_up_upload_file(self, upload_file_object):
        """Nothing to clean up for a scan."""
        pass

    def get_upload_file_object(self, form_data):
        """
        Run the scan command and return the scanned image wrapped in a
        SourceUploadedFile.

        Only the options that have a value (resolution, mode, source, ADF
        mode) are added to the command line. If the command exits with an
        error its standard error output is logged, stored as a log entry of
        this source and raised as a SourceException.
        """
        command_line = [
            setting_scanimage_path.value, '-d', self.device_name,
            '--format', 'tiff',
        ]

        if self.resolution:
            command_line.extend(
                ['--resolution', '{}'.format(self.resolution)]
            )

        if self.mode:
            command_line.extend(
                ['--mode', self.mode]
            )

        if self.source:
            command_line.extend(
                ['--source', self.source]
            )

        if self.adf_mode:
            command_line.extend(
                ['--adf-mode', self.adf_mode]
            )

        # Both temporary files are used as file-like objects (they are
        # handed to subprocess and seek()/read() is called on the second
        # one), so they are closed explicitly when no longer needed.
        temporary_file_object = TemporaryFile()
        filestderr = TemporaryFile()

        try:
            logger.debug('Scan command line: %s', command_line)
            subprocess.check_call(
                command_line, stdout=temporary_file_object, stderr=filestderr
            )
        except subprocess.CalledProcessError:
            filestderr.seek(0)
            error_message = filestderr.read()
            # The partial scan output is of no use; release it.
            temporary_file_object.close()
            logger.error(
                'Exception while scanning from source:%s ; %s', self,
                error_message
            )

            message = _('Error while scanning; %s') % error_message
            self.logs.create(message=message)
            raise SourceException(message)
        except Exception:
            # Any other failure (for example the executable is missing):
            # release the output file and let the error propagate unchanged.
            temporary_file_object.close()
            raise
        else:
            return SourceUploadedFile(
                source=self, file=PseudoFile(
                    file=temporary_file_object, name='scan {}'.format(now())
                )
            )
        finally:
            # The captured stderr is only needed while building the error
            # message above.
            filestderr.close()
class StagingFolderSource(InteractiveSource):
    """
    Interactive source backed by a server side filesystem folder.

    Unlike the web form source, which shows an HTML form so users can pick
    any file from their own computers, this source lists the files found in
    a folder on the machine where Mayan is installed. Administrators choose
    that folder when creating the staging folder, and it is then used as the
    destination of networked scanners or multifunctional copiers.

    Typical scenario: a user walks up to the networked copier, scans several
    paper documents, returns to their computer, opens Mayan and chooses to
    upload a new document using the previously defined staging folder. They
    are shown the list of scanned files with a small preview and can process
    them one by one, converting each into a Mayan EDMS document. Staging
    folders are useful when many users share a few networked scanners.
    """
    can_compress = True
    is_interactive = True
    source_type = SOURCE_CHOICE_STAGING

    folder_path = models.CharField(
        help_text=_('Server side filesystem path.'), max_length=255,
        verbose_name=_('Folder path')
    )
    preview_width = models.IntegerField(
        verbose_name=_('Preview width'),
        help_text=_('Width value to be passed to the converter backend.')
    )
    preview_height = models.IntegerField(
        verbose_name=_('Preview height'),
        help_text=_('Height value to be passed to the converter backend.'),
        blank=True, null=True
    )
    uncompress = models.CharField(
        max_length=1, choices=SOURCE_INTERACTIVE_UNCOMPRESS_CHOICES,
        verbose_name=_('Uncompress'),
        help_text=_('Whether to expand or not compressed archives.')
    )
    delete_after_upload = models.BooleanField(
        verbose_name=_('Delete after upload'),
        help_text=_(
            'Delete the file after is has been successfully uploaded.'
        ),
        default=True
    )

    objects = models.Manager()

    class Meta:
        verbose_name_plural = _('Staging folders')
        verbose_name = _('Staging folder')

    def clean_up_upload_file(self, upload_file_object):
        """
        Delete the staging file behind ``upload_file_object`` when this
        source is configured with ``delete_after_upload``. A failure to
        delete is logged and raised as a generic Exception.
        """
        if not self.delete_after_upload:
            return

        try:
            upload_file_object.extra_data.delete()
        except Exception as exception:
            logger.error(
                'Error deleting staging file: %s; %s', upload_file_object,
                exception
            )
            raise Exception(
                _('Error deleting staging file; %s') % exception
            )

    def get_file(self, *args, **kwargs):
        """Build a StagingFile bound to this staging folder."""
        return StagingFile(staging_folder=self, *args, **kwargs)

    def get_files(self):
        """
        Generator of StagingFile instances, one per regular file found in
        ``folder_path``, in sorted order of their case-normalized names.
        An OSError is logged and raised as a generic Exception.
        """
        try:
            file_names = []
            for name in os.listdir(self.folder_path):
                # Skip anything that is not a regular file.
                if os.path.isfile(os.path.join(self.folder_path, name)):
                    file_names.append(os.path.normcase(name))

            file_names.sort()

            for entry in file_names:
                yield self.get_file(filename=entry)
        except OSError as exception:
            logger.error(
                'Unable get list of staging files from source: %s; %s', self,
                exception
            )
            raise Exception(
                _('Unable get list of staging files: %s') % exception
            )

    def get_upload_file_object(self, form_data):
        """
        Resolve the staging file selected in the form (``staging_file_id``)
        and wrap it in a SourceUploadedFile, keeping the staging file itself
        as ``extra_data``.
        """
        selected_file = self.get_file(
            encoded_filename=form_data['staging_file_id']
        )
        file_object = selected_file.as_file()

        return SourceUploadedFile(
            source=self, file=file_object, extra_data=selected_file
        )
class WebFormSource(InteractiveSource):
    """
    Interactive source presented as an HTML form with a drag and drop window
    that opens a file browser on the user's computer. Users control live
    which documents they want to upload. It is useful when admins want to
    let users upload any kind of file as documents from their own computers,
    for example when each user has their own scanner.
    """
    can_compress = True
    is_interactive = True
    source_type = SOURCE_CHOICE_WEB_FORM

    # TODO: unify uncompress as an InteractiveSource field
    uncompress = models.CharField(
        max_length=1, verbose_name=_('Uncompress'),
        choices=SOURCE_INTERACTIVE_UNCOMPRESS_CHOICES,
        help_text=_('Whether to expand or not compressed archives.')
    )

    objects = models.Manager()

    class Meta:
        verbose_name_plural = _('Web forms')
        verbose_name = _('Web form')

    # Default path
    def get_upload_file_object(self, form_data):
        """Wrap the file submitted in the form in a SourceUploadedFile."""
        submitted_file = form_data['file']
        return SourceUploadedFile(source=self, file=submitted_file)
class OutOfProcessSource(Source):
    """
    Intermediate model for the non interactive sources of this module.
    """
    is_interactive = False

    objects = models.Manager()

    class Meta:
        verbose_name_plural = _('Out of process')
        verbose_name = _('Out of process')
class IntervalBaseModel(OutOfProcessSource):
    """
    Non interactive source that is checked periodically.

    Saving an instance (re)creates a PeriodicTask named after the source
    primary key that runs ``sources.tasks.task_check_interval_source`` every
    ``interval`` seconds; deleting the instance removes that task.
    """
    interval = models.PositiveIntegerField(
        verbose_name=_('Interval'), default=DEFAULT_INTERVAL,
        help_text=_('Interval in seconds between checks for new documents.')
    )
    document_type = models.ForeignKey(
        DocumentType, on_delete=models.CASCADE,
        verbose_name=_('Document type'),
        help_text=_(
            'Assign a document type to documents uploaded from this source.'
        )
    )
    uncompress = models.CharField(
        max_length=1, verbose_name=_('Uncompress'),
        choices=SOURCE_UNCOMPRESS_CHOICES,
        help_text=_('Whether to expand or not, compressed archives.')
    )

    objects = models.Manager()

    class Meta:
        verbose_name_plural = _('Interval sources')
        verbose_name = _('Interval source')

    def _get_periodic_task_name(self, pk=None):
        """
        Name of the periodic task for the given primary key, defaulting to
        this instance's own primary key.
        """
        source_pk = pk or self.pk
        return 'check_interval_source-%i' % source_pk

    def _delete_periodic_task(self, pk=None):
        """
        Remove the periodic task of this source. A missing task only
        produces a warning.
        """
        task_name = self._get_periodic_task_name(pk)

        try:
            periodic_task = PeriodicTask.objects.get(name=task_name)
            interval_instance = periodic_task.interval

            sibling_task_ids = list(
                interval_instance.periodictask_set.values_list(
                    'id', flat=True
                )
            )

            if sibling_task_ids != [periodic_task.pk]:
                # Other tasks still use this interval; remove only the task.
                periodic_task.delete()
            else:
                # Only delete the interval if nobody else is using it
                # NOTE(review): presumably the task is removed along with
                # its interval by cascade - confirm against the djcelery
                # models.
                interval_instance.delete()
        except PeriodicTask.DoesNotExist:
            logger.warning(
                'Tried to delete non existant periodic task "%s"',
                task_name
            )

    def delete(self, *args, **kwargs):
        """
        Delete the source, then its periodic task. The primary key is read
        before the delete call and handed to the task cleanup.
        """
        original_pk = self.pk
        super(IntervalBaseModel, self).delete(*args, **kwargs)
        self._delete_periodic_task(original_pk)

    def save(self, *args, **kwargs):
        """
        Save the source and register its periodic task, replacing the
        previous task when the source already existed.
        """
        already_existed = bool(self.pk)
        super(IntervalBaseModel, self).save(*args, **kwargs)

        if already_existed:
            self._delete_periodic_task()

        # Create a new interval or reuse someone else's
        schedule, _created = IntervalSchedule.objects.get_or_create(
            every=self.interval, period='seconds'
        )

        task_kwargs = json.dumps({'source_id': self.pk})
        PeriodicTask.objects.create(
            name=self._get_periodic_task_name(),
            interval=schedule,
            task='sources.tasks.task_check_interval_source',
            kwargs=task_kwargs
        )
class EmailBaseModel(IntervalBaseModel):
    """
    POP3 email and IMAP email sources are non-interactive sources that
    periodically fetch emails from an email account using either the POP3 or
    IMAP email protocol. These sources are useful when users need to scan
    documents outside their office, they can photograph a paper document with
    their phones and send the image to a designated email that is setup as a
    Mayan POP3 or IMAP source. Mayan will periodically download the emails
    and process them as Mayan documents.
    """
    host = models.CharField(max_length=128, verbose_name=_('Host'))
    ssl = models.BooleanField(default=True, verbose_name=_('SSL'))
    port = models.PositiveIntegerField(blank=True, null=True, help_text=_(
        'Typical choices are 110 for POP3, 995 for POP3 over SSL, 143 for '
        'IMAP, 993 for IMAP over SSL.'), verbose_name=_('Port')
    )
    username = models.CharField(max_length=96, verbose_name=_('Username'))
    password = models.CharField(max_length=96, verbose_name=_('Password'))
    metadata_attachment_name = models.CharField(
        default=DEFAULT_METADATA_ATTACHMENT_NAME,
        help_text=_(
            'Name of the attachment that will contains the metadata type '
            'names and value pairs to be assigned to the rest of the '
            'downloaded attachments. Note: This attachment has to be the '
            'first attachment.'
        ), max_length=128, verbose_name=_('Metadata attachment name')
    )
    subject_metadata_type = models.ForeignKey(
        blank=True, help_text=_(
            'Select a metadata type valid for the document type selected in '
            'which to store the email\'s subject.'
        ), on_delete=models.CASCADE, null=True, related_name='email_subject',
        to=MetadataType, verbose_name=_('Subject metadata type')
    )
    from_metadata_type = models.ForeignKey(
        blank=True, help_text=_(
            'Select a metadata type valid for the document type selected in '
            'which to store the email\'s "from" value.'
        ), on_delete=models.CASCADE, null=True, related_name='email_from',
        to=MetadataType, verbose_name=_('From metadata type')
    )
    store_body = models.BooleanField(
        default=True, help_text=_(
            'Store the body of the email as a text document.'
        ), verbose_name=_('Store email body')
    )

    objects = models.Manager()

    class Meta:
        verbose_name = _('Email source')
        verbose_name_plural = _('Email sources')

    def clean(self):
        """
        Validate that the selected subject and "from" metadata types belong
        to the metadata types of the selected document type.

        Raises ValidationError keyed by the offending field.
        """
        if self.subject_metadata_type:
            if self.subject_metadata_type.pk not in self.document_type.metadata.values_list('metadata_type', flat=True):
                raise ValidationError(
                    {
                        'subject_metadata_type': _(
                            'Subject metadata type "%(metadata_type)s" is not '
                            'valid for the document type: %(document_type)s'
                        ) % {
                            'metadata_type': self.subject_metadata_type,
                            'document_type': self.document_type
                        }
                    }
                )

        if self.from_metadata_type:
            if self.from_metadata_type.pk not in self.document_type.metadata.values_list('metadata_type', flat=True):
                raise ValidationError(
                    {
                        'from_metadata_type': _(
                            '"From" metadata type "%(metadata_type)s" is not '
                            'valid for the document type: %(document_type)s'
                        ) % {
                            'metadata_type': self.from_metadata_type,
                            'document_type': self.document_type
                        }
                    }
                )

    @staticmethod
    def process_message(source, message_text, message_properties=None):
        """
        Parse an email message and upload its parts as documents of
        ``source``.

        ``message_properties`` carries the 'Subject' and 'From' values down
        the recursion so nested parts reuse the values of the enclosing
        message. Attachments and inline parts are uploaded as documents,
        except for the one named ``source.metadata_attachment_name`` which
        is parsed as YAML. The message body is uploaded as a document only
        when ``source.store_body`` is enabled.
        """
        from flanker import mime

        # NOTE(review): ``counter`` is never incremented, so every unnamed
        # attachment gets the label "attachment-1" - confirm intent.
        counter = 1
        message = mime.from_string(force_str(message_text))
        # NOTE(review): this dictionary is rebuilt on every (recursive)
        # call, so values loaded from the metadata attachment are not
        # visible when sibling parts are processed - confirm intent.
        metadata_dictionary = {}

        if not message_properties:
            message_properties = {}

        message_properties['Subject'] = message_properties.get(
            'Subject', message.headers.get('Subject')
        )

        message_properties['From'] = message_properties.get(
            'From', message.headers.get('From')
        )

        if source.subject_metadata_type:
            metadata_dictionary[
                source.subject_metadata_type.name
            ] = message_properties.get('Subject')

        if source.from_metadata_type:
            metadata_dictionary[
                source.from_metadata_type.name
            ] = message_properties.get('From')

        # Messages are tree based, do nested processing of message parts until
        # a message with no children is found, then work our way up.
        if message.parts:
            for part in message.parts:
                EmailBaseModel.process_message(
                    source=source, message_text=part.to_string(),
                    message_properties=message_properties
                )
            return

        # Treat inlines as attachments, both are extracted and saved as
        # documents
        if message.is_attachment() or message.is_inline():
            label = message.detected_file_name or 'attachment-{}'.format(counter)
            with ContentFile(content=message.body, name=label) as file_object:
                if label == source.metadata_attachment_name:
                    metadata_dictionary = yaml.safe_load(
                        file_object.read()
                    )
                    logger.debug(
                        'Got metadata dictionary: %s', metadata_dictionary
                    )
                else:
                    documents = source.handle_upload(
                        document_type=source.document_type,
                        file_object=file_object, expand=(
                            source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
                        )
                    )
                    if metadata_dictionary:
                        for document in documents:
                            set_bulk_metadata(
                                document=document,
                                metadata_dictionary=metadata_dictionary
                            )
            return

        # If it is not an attachment then it should be a body message part.
        # Another option is to use message.is_body()
        if not source.store_body:
            # Honor the "Store email body" option of the source.
            logger.debug('Skipping email body, store_body is disabled')
            return

        if message.detected_content_type == 'text/html':
            label = 'email_body.html'
        else:
            label = 'email_body.txt'

        with ContentFile(content=message.body, name=label) as file_object:
            # ``expand`` is a boolean flag; the body is never an archive.
            documents = source.handle_upload(
                document_type=source.document_type,
                file_object=file_object,
                expand=False
            )
            if metadata_dictionary:
                for document in documents:
                    set_bulk_metadata(
                        document=document,
                        metadata_dictionary=metadata_dictionary
                    )
class IMAPEmail(EmailBaseModel):
    """
    Email source that fetches messages from a mailbox using IMAP.
    """
    source_type = SOURCE_CHOICE_EMAIL_IMAP

    mailbox = models.CharField(
        default=DEFAULT_IMAP_MAILBOX,
        help_text=_('IMAP Mailbox from which to check for messages.'),
        max_length=64, verbose_name=_('Mailbox')
    )

    objects = models.Manager()

    class Meta:
        verbose_name = _('IMAP email')
        verbose_name_plural = _('IMAP email')

    # http://www.doughellmann.com/PyMOTW/imaplib/
    def check_source(self):
        """
        Fetch every message not flagged as deleted from the configured
        mailbox, process it as documents and flag it as deleted.

        The deleted messages are expunged once, after all messages have been
        processed. The session is always logged out, even when processing
        fails.
        """
        logger.debug('Starting IMAP email fetch')
        logger.debug('host: %s', self.host)
        logger.debug('ssl: %s', self.ssl)

        # ``port`` is optional on the model; fall back to the protocol
        # default instead of handing ``None`` to imaplib.
        if self.ssl:
            mailbox = imaplib.IMAP4_SSL(
                self.host, self.port or imaplib.IMAP4_SSL_PORT
            )
        else:
            mailbox = imaplib.IMAP4(
                self.host, self.port or imaplib.IMAP4_PORT
            )

        try:
            mailbox.login(self.username, self.password)
            mailbox.select(self.mailbox)

            status, data = mailbox.search(None, 'NOT', 'DELETED')
            messages_info = data[0].split() if data and data[0] else []
            logger.debug('messages count: %s', len(messages_info))

            for message_number in messages_info:
                logger.debug('message_number: %s', message_number)
                status, message_data = mailbox.fetch(
                    message_number, '(RFC822)'
                )
                EmailBaseModel.process_message(
                    source=self, message_text=message_data[0][1]
                )
                mailbox.store(message_number, '+FLAGS', '\\Deleted')

            # Expunge only after the loop: expunging renumbers the remaining
            # messages, which would invalidate the sequence numbers obtained
            # from the search above.
            mailbox.expunge()
            mailbox.close()
        finally:
            mailbox.logout()
class POP3Email(EmailBaseModel):
    """
    Email source that fetches messages from an account using POP3.
    """
    source_type = SOURCE_CHOICE_EMAIL_POP3

    timeout = models.PositiveIntegerField(
        default=DEFAULT_POP3_TIMEOUT, verbose_name=_('Timeout')
    )

    objects = models.Manager()

    class Meta:
        verbose_name = _('POP email')
        verbose_name_plural = _('POP email')

    def check_source(self):
        """
        Retrieve every message of the account, process it as documents and
        mark it for deletion.

        The session is always ended with QUIT, even when processing fails.
        ``timeout`` is only applied to non SSL connections.
        """
        logger.debug('Starting POP3 email fetch')
        logger.debug('host: %s', self.host)
        logger.debug('ssl: %s', self.ssl)

        # ``port`` is optional on the model; fall back to the protocol
        # default instead of handing ``None`` to poplib.
        if self.ssl:
            mailbox = poplib.POP3_SSL(
                self.host, self.port or poplib.POP3_SSL_PORT
            )
        else:
            mailbox = poplib.POP3(
                self.host, self.port or poplib.POP3_PORT,
                timeout=self.timeout
            )

        try:
            mailbox.getwelcome()
            mailbox.user(self.username)
            mailbox.pass_(self.password)
            messages_info = mailbox.list()

            logger.debug('messages_info:')
            logger.debug(messages_info)
            logger.debug('messages count: %s', len(messages_info[1]))

            for message_info in messages_info[1]:
                message_number, message_size = message_info.split()
                logger.debug('message_number: %s', message_number)
                logger.debug('message_size: %s', message_size)

                # Use an integer message number so the command is formatted
                # the same whether the listing came back as bytes or text.
                message_index = int(message_number)

                # retr() returns the message as a list of byte lines; join
                # them as bytes to avoid an implicit text conversion.
                complete_message = b'\n'.join(
                    mailbox.retr(message_index)[1]
                )

                EmailBaseModel.process_message(
                    source=self, message_text=complete_message
                )
                mailbox.dele(message_index)
        finally:
            mailbox.quit()
class WatchFolderSource(IntervalBaseModel):
    """
    The watch folder is another non-interactive source that like the email
    source, works by periodically checking and processing documents. This
    source instead of using an email account, monitors a filesystem folder.
    Administrators can define watch folders, examples /home/mayan/watch_bills
    or /home/mayan/watch_invoices and users just need to copy the documents
    they want to upload as a bill or invoice to the respective filesystem
    folder. Mayan will periodically scan these filesystem locations and
    upload the files as documents, deleting them if configured.
    """
    source_type = SOURCE_CHOICE_WATCH

    folder_path = models.CharField(
        help_text=_('Server side filesystem path.'), max_length=255,
        verbose_name=_('Folder path')
    )

    objects = models.Manager()

    class Meta:
        verbose_name = _('Watch folder')
        verbose_name_plural = _('Watch folders')

    def check_source(self):
        """
        Upload every regular file found in ``folder_path`` as a document
        and remove it from the folder afterwards.

        A file is only removed after its upload finished without raising.
        """
        # Force self.folder_path to unicode to avoid os.listdir returning
        # str for non-latin filenames, gh-issue #163
        folder_path = force_text(self.folder_path)

        for file_name in os.listdir(folder_path):
            full_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(full_path):
                continue

            with File(file=open(full_path, mode='rb')) as file_object:
                self.handle_upload(
                    file_object=file_object,
                    expand=(self.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y),
                    label=file_name
                )

            # Remove the file only once it has been closed; deleting a file
            # that is still open fails on some platforms.
            os.unlink(full_path)
class SourceLog(models.Model):
    """
    Timestamped message attached to a source, reachable from the source
    through the ``logs`` related name.
    """
    source = models.ForeignKey(
        to=Source, related_name='logs', on_delete=models.CASCADE,
        verbose_name=_('Source')
    )
    datetime = models.DateTimeField(
        verbose_name=_('Date time'), editable=False, auto_now_add=True
    )
    message = models.TextField(
        verbose_name=_('Message'), editable=False, blank=True
    )

    class Meta:
        # Newest entries first.
        ordering = ('-datetime',)
        get_latest_by = 'datetime'
        verbose_name_plural = _('Log entries')
        verbose_name = _('Log entry')