diff --git a/README.md b/README.md index 3012b25ce2..0f72eba7a5 100755 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Executables: * ImageMagick - Convert, Edit, Or Compose Bitmap Images * tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. +* libmagic License ------- diff --git a/apps/documents/models.py b/apps/documents/models.py index b525750523..0cf858fd69 100755 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -3,6 +3,7 @@ import os import mimetypes from datetime import datetime import sys +from python_magic import magic from django.conf import settings from django.db import models @@ -34,10 +35,7 @@ def get_filename_from_uuid(instance, filename, directory=STORAGE_DIRECTORY_NAME) return '%s/%s' % (directory, instance.uuid) def populate_file_extension_and_mimetype(instance, filename): - # First populate the file extension and mimetype - instance.file_mimetype, encoding = mimetypes.guess_type(filename) - if not instance.file_mimetype: - instance.file_mimetype = u'unknown' + # First populate the file extension filename, extension = os.path.splitext(filename) instance.file_filename = filename #remove prefix '.' @@ -59,6 +57,7 @@ class Document(models.Model): file = models.FileField(upload_to=get_filename_from_uuid, storage=STORAGE_BACKEND(), verbose_name=_(u'file')) uuid = models.CharField(max_length=48, default=UUID_FUNCTION(), blank=True, editable=False) file_mimetype = models.CharField(max_length=64, default='', editable=False) + file_mime_encoding = models.CharField(max_length=64, default='', editable=False) #FAT filename can be up to 255 using LFN file_filename = models.CharField(max_length=64, default='', editable=False) file_extension = models.CharField(max_length=16, default='', editable=False) @@ -78,6 +77,21 @@ class Document(models.Model): def get_fullname(self): return os.extsep.join([self.file_filename, self.file_extension]) + def update_mimetype(self): + try: + mime = magic.Magic(mime=True) + self.file_mimetype = mime.from_buffer(self.read()) + mime_encoding = magic.Magic(mime_encoding=True) + self.file_mime_encoding = mime_encoding.from_buffer(self.read()) + except: + self.file_mimetype = u'unknown' + self.file_mime_encoding = u'unknown' + finally: + self.save() + + def read(self, count=1024): + return self.file.storage.open(self.file.url).read(count) + @models.permalink def get_absolute_url(self): return ('document_view', [self.id]) diff --git a/apps/documents/views.py b/apps/documents/views.py index 921c9c7cba..5570c1ba1e 100755 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -116,6 +116,7 @@ def upload_document_with_type(request, document_type_id, multiple=True): if local_form.is_valid(): instance = local_form.save() instance.update_checksum() + instance.update_mimetype() if 'document_type_available_filenames' in local_form.cleaned_data: if local_form.cleaned_data['document_type_available_filenames']: instance.file_filename = local_form.cleaned_data['document_type_available_filenames'].filename @@ -147,6 +148,7 @@ def upload_document_with_type(request, document_type_id, multiple=True): document = Document(file=staging_file.upload(), document_type=document_type) document.save() document.update_checksum() + document.update_mimetype() except Exception, e: messages.error(request, e) else: @@ -224,6 +226,7 @@ def document_view(request, document_id): {'label':_(u'Filename'), 'field':'file_filename'}, {'label':_(u'File extension'), 'field':'file_extension'}, {'label':_(u'File mimetype'), 'field':'file_mimetype'}, + {'label':_(u'File mime encoding'), 'field':'file_mime_encoding'}, {'label':_(u'File size'), 'field':lambda x: pretty_size(x.file.storage.size(x.file.path)) if x.exists() else '-'}, {'label':_(u'Exists in storage'), 'field':'exists'}, {'label':_(u'Date added'), 'field':lambda x: x.date_added.date()}, diff --git a/docs/Changelog.txt b/docs/Changelog.txt new file mode 100644 index 0000000000..6a21606454 --- /dev/null +++ b/docs/Changelog.txt @@ -0,0 +1,2 @@ +* Added python-magic for smarter MIME type detection (https://github.com/ahupp/python-magic) +* Added a new Document model field: file_mime_encoding diff --git a/modules/__init__.py b/modules/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/modules/python_magic/README b/modules/python_magic/README new file mode 100644 index 0000000000..5b438b7db8 --- /dev/null +++ b/modules/python_magic/README @@ -0,0 +1,57 @@ += python-magic = + +Adam Hupp + +Distributed under the PSF License: http://www.python.org/psf/license/ + +python-magic is a simple wrapper for libmagic. libmagic identifies +file types according to their headers. It is the core of the Unix +"file" command. + += Installation = + +To build and install run: + +# python setup.py install + += Installation on Win32 = + +You need magic1.dll from http://gnuwin32.sourceforge.net/, grab the +binaries and dependencies ZIP-file, extract magic1.dll, regex2.dll +and zlib1.dll and put it in C:\Windows\System32. You also need a +magic file from Linux, compatible with file version 5.0. + +To build and install run: + +# python setup.py install + += Example Usage = + +>>> import magic +>>> m = magic.Magic() +>>> m.from_file("testdata/test.pdf") +'PDF document, version 1.2' +>>> m.from_buffer(open("testdata/test.pdf").read(1024)) +'PDF document, version 1.2' + +# For MIME types +>>> mime = magic.Magic(mime=True) +>>> mime.from_file("testdata/test.pdf") +'application/pdf' +>>> + +# For MIME encoding +>>> mime_encoding = magic.Magic(mime_encoding=True) +>>> mime_encoding.from_file("testdata/text-iso8859-1.txt") +'iso-8859-1' +>>> + += Contributors = + +Thanks to these folks on github who submitted features and bugfixes. + +NicolasDelaby +lukenowak +FlaPer87 +SimpleSeb +tehmaze diff --git a/modules/python_magic/__init__.py b/modules/python_magic/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/modules/python_magic/magic.py b/modules/python_magic/magic.py new file mode 100644 index 0000000000..53a91fd473 --- /dev/null +++ b/modules/python_magic/magic.py @@ -0,0 +1,230 @@ +""" +magic is a wrapper around the libmagic file identification library. + +See README for more information. + +Usage: + +>>> import magic +>>> magic.from_file("testdata/test.pdf") +'PDF document, version 1.2' +>>> magic.from_file("testdata/test.pdf", mime=True) +'application/pdf' +>>> magic.from_buffer(open("testdata/test.pdf").read(1024)) +'PDF document, version 1.2' +>>> + + +""" + +import os.path +import ctypes +import ctypes.util + +from ctypes import c_char_p, c_int, c_size_t, c_void_p + +class MagicException(Exception): pass + +class Magic: + """ + Magic is a wrapper around the libmagic C library. + + """ + + def __init__(self, mime=False, magic_file=None, mime_encoding=False): + """ + Create a new libmagic wrapper. + + mime - if True, mimetypes are returned instead of textual descriptions + mime_encoding - if True, codec is returned + magic_file - use a mime database other than the system default + + """ + flags = MAGIC_NONE + if mime: + flags |= MAGIC_MIME + elif mime_encoding: + flags |= MAGIC_MIME_ENCODING + + self.cookie = magic_open(flags) + + magic_load(self.cookie, magic_file) + + + def from_buffer(self, buf): + """ + Identify the contents of `buf` + """ + return magic_buffer(self.cookie, buf) + + def from_file(self, filename): + """ + Identify the contents of file `filename` + raises IOError if the file does not exist + """ + + if not os.path.exists(filename): + raise IOError("File does not exist: " + filename) + + return magic_file(self.cookie, filename) + + def __del__(self): + if self.cookie: + magic_close(self.cookie) + self.cookie = None + +_magic_mime = None +_magic = None + +def _get_magic_mime(): + global _magic_mime + if not _magic_mime: + _magic_mime = Magic(mime=True) + return _magic_mime + +def _get_magic(): + global _magic + if not _magic: + _magic = Magic() + return _magic + +def _get_magic_type(mime): + if mime: + return _get_magic_mime() + else: + return _get_magic() + +def from_file(filename, mime=False): + m = _get_magic_type(mime) + return m.from_file(filename) + +def from_buffer(buffer, mime=False): + m = _get_magic_type(mime) + return m.from_buffer(buffer) + + + + +libmagic = None +# Let's try to find magic or magic1 +dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1') + +# This is necessary because find_library returns None if it doesn't find the library +if dll: + libmagic = ctypes.CDLL(dll) + +if not libmagic or not libmagic._name: + import sys + platform_to_lib = {'darwin': '/opt/local/lib/libmagic.dylib', + 'win32': 'magic1.dll'} + if sys.platform in platform_to_lib: + try: + libmagic = ctypes.CDLL(platform_to_lib[sys.platform]) + except OSError: + pass + +if not libmagic or not libmagic._name: + # It is better to raise an ImportError since we are importing magic module + raise ImportError('failed to find libmagic. Check your installation') + +magic_t = ctypes.c_void_p + +def errorcheck(result, func, args): + err = magic_error(args[0]) + if err is not None: + raise MagicException(err) + else: + return result + +magic_open = libmagic.magic_open +magic_open.restype = magic_t +magic_open.argtypes = [c_int] + +magic_close = libmagic.magic_close +magic_close.restype = None +magic_close.argtypes = [magic_t] + +magic_error = libmagic.magic_error +magic_error.restype = c_char_p +magic_error.argtypes = [magic_t] + +magic_errno = libmagic.magic_errno +magic_errno.restype = c_int +magic_errno.argtypes = [magic_t] + +magic_file = libmagic.magic_file +magic_file.restype = c_char_p +magic_file.argtypes = [magic_t, c_char_p] +magic_file.errcheck = errorcheck + + +_magic_buffer = libmagic.magic_buffer +_magic_buffer.restype = c_char_p +_magic_buffer.argtypes = [magic_t, c_void_p, c_size_t] +_magic_buffer.errcheck = errorcheck + + +def magic_buffer(cookie, buf): + return _magic_buffer(cookie, buf, len(buf)) + + +magic_load = libmagic.magic_load +magic_load.restype = c_int +magic_load.argtypes = [magic_t, c_char_p] +magic_load.errcheck = errorcheck + +magic_setflags = libmagic.magic_setflags +magic_setflags.restype = c_int +magic_setflags.argtypes = [magic_t, c_int] + +magic_check = libmagic.magic_check +magic_check.restype = c_int +magic_check.argtypes = [magic_t, c_char_p] + +magic_compile = libmagic.magic_compile +magic_compile.restype = c_int +magic_compile.argtypes = [magic_t, c_char_p] + + + +MAGIC_NONE = 0x000000 # No flags + +MAGIC_DEBUG = 0x000001 # Turn on debugging + +MAGIC_SYMLINK = 0x000002 # Follow symlinks + +MAGIC_COMPRESS = 0x000004 # Check inside compressed files + +MAGIC_DEVICES = 0x000008 # Look at the contents of devices + +MAGIC_MIME = 0x000010 # Return a mime string + +MAGIC_MIME_ENCODING = 0x000400 # Return the MIME encoding + +MAGIC_CONTINUE = 0x000020 # Return all matches + +MAGIC_CHECK = 0x000040 # Print warnings to stderr + +MAGIC_PRESERVE_ATIME = 0x000080 # Restore access time on exit + +MAGIC_RAW = 0x000100 # Don't translate unprintable chars + +MAGIC_ERROR = 0x000200 # Handle ENOENT etc as real errors + +MAGIC_NO_CHECK_COMPRESS = 0x001000 # Don't check for compressed files + +MAGIC_NO_CHECK_TAR = 0x002000 # Don't check for tar files + +MAGIC_NO_CHECK_SOFT = 0x004000 # Don't check magic entries + +MAGIC_NO_CHECK_APPTYPE = 0x008000 # Don't check application type + +MAGIC_NO_CHECK_ELF = 0x010000 # Don't check for elf details + +MAGIC_NO_CHECK_ASCII = 0x020000 # Don't check for ascii files + +MAGIC_NO_CHECK_TROFF = 0x040000 # Don't check ascii/troff + +MAGIC_NO_CHECK_FORTRAN = 0x080000 # Don't check ascii/fortran + +MAGIC_NO_CHECK_TOKENS = 0x100000 # Don't check ascii/tokens diff --git a/modules/python_magic/setup.py b/modules/python_magic/setup.py new file mode 100644 index 0000000000..7666efa0dd --- /dev/null +++ b/modules/python_magic/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup, Extension +#from distutils.core import setup, Extension + +setup(name='python-magic', + description='File type identification using libmagic', + author='Adam Hupp', + author_email='adam@hupp.org', + url="http://github.com/ahupp/python-magic", + version='0.4.0', + py_modules=['magic'], + long_description="""This module uses ctypes to access the libmagic file type +identification library. It makes use of the local magic database and +supports both textual and MIME-type output. +""", + keywords="mime magic file", + license="PSF", + ) diff --git a/modules/python_magic/test.py b/modules/python_magic/test.py new file mode 100644 index 0000000000..d822f68194 --- /dev/null +++ b/modules/python_magic/test.py @@ -0,0 +1,65 @@ + +import os.path +import unittest +import random +from StringIO import StringIO +from os import path +from magic import Magic, MagicException + +testfile = [ + ("magic.pyc", "python 2.4 byte-compiled", "application/octet-stream"), + ("test.pdf", "PDF document, version 1.2", "application/pdf"), + ("test.gz", 'gzip compressed data, was "test", from Unix, last modified: ' + 'Sat Jun 28 18:32:52 2008', "application/x-gzip"), + ("text.txt", "ASCII text", "text/plain"), + ] + +testFileEncoding = [('text-iso8859-1.txt', 'iso-8859-1')] + +class TestMagic(unittest.TestCase): + + mime = False + + def setUp(self): + self.m = Magic(mime=self.mime) + + def testFileTypes(self): + for filename, desc, mime in testfile: + filename = path.join(path.dirname(__file__), + "testdata", + filename) + if self.mime: + target = mime + else: + target = desc + + self.assertEqual(target, self.m.from_buffer(open(filename).read(1024))) + self.assertEqual(target, self.m.from_file(filename), filename) + + + def testErrors(self): + self.assertRaises(IOError, self.m.from_file, "nonexistent") + self.assertRaises(MagicException, Magic, magic_file="noneexistent") + os.environ['MAGIC'] = '/nonexistetn' + self.assertRaises(MagicException, Magic) + del os.environ['MAGIC'] + +class TestMagicMime(TestMagic): + mime = True + +class TestMagicMimeEncoding(unittest.TestCase): + def setUp(self): + self.m = Magic(mime_encoding=True) + + def testFileEncoding(self): + for filename, encoding in testFileEncoding: + filename = path.join(path.dirname(__file__), + "testdata", + filename) + self.assertEqual(encoding, self.m.from_buffer(open(filename).read(1024))) + self.assertEqual(encoding, self.m.from_file(filename), filename) + + +if __name__ == '__main__': + unittest.main() + diff --git a/modules/python_magic/testdata/test.gz b/modules/python_magic/testdata/test.gz new file mode 100644 index 0000000000..5d847dd99d Binary files /dev/null and b/modules/python_magic/testdata/test.gz differ diff --git a/modules/python_magic/testdata/test.pdf b/modules/python_magic/testdata/test.pdf new file mode 100644 index 0000000000..b986617b22 --- /dev/null +++ b/modules/python_magic/testdata/test.pdf @@ -0,0 +1,199 @@ +%PDF-1.2 +7 0 obj +[5 0 R/XYZ 111.6 757.86] +endobj +13 0 obj +<< +/Title(About this document) +/A<< +/S/GoTo +/D(subsection.1.1) +>> +/Parent 12 0 R +/Next 14 0 R +>> +endobj +15 0 obj +<< +/Title(Compiling with GHC) +/A<< +/S/GoTo +/D(subsubsection.1.2.1) +>> +/Parent 14 0 R +/Next 16 0 R +>> +endobj +16 0 obj +<< +/Title(Compiling with Hugs) +/A<< +/S/GoTo +/D(subsubsection.1.2.2) +>> +/Parent 14 0 R +/Prev 15 0 R +>> +endobj +14 0 obj +<< +/Title(Compatibility) +/A<< +/S/GoTo +/D(subsection.1.2) +>> +/Parent 12 0 R +/Prev 13 0 R +/First 15 0 R +/Last 16 0 R +/Count -2 +/Next 17 0 R +>> +endobj +17 0 obj +<< +/Title(Reporting bugs) +/A<< +/S/GoTo +/D(subsection.1.3) +>> +/Parent 12 0 R +/Prev 14 0 R +/Next 18 0 R +>> +endobj +18 0 obj +<< +/Title(History) +/A<< +/S/GoTo +/D(subsection.1.4) +>> +/Parent 12 0 R +/Prev 17 0 R +/Next 19 0 R +>> +endobj +19 0 obj +<< +/Title(License) +/A<< +/S/GoTo +/D(subsection.1.5) +>> +/Parent 12 0 R +/Prev 18 0 R +>> +endobj +12 0 obj +<< +/Title(Introduction) +/A<< +/S/GoTo +/D(section.1) +>> +/Parent 11 0 R +/First 13 0 R +/Last 19 0 R +/Count -5 +/Next 20 0 R +>> +endobj +21 0 obj +<< +/Title(Running a parser) +/A<< +/S/GoTo +/D(subsection.2.1) +>> +/Parent 20 0 R +/Next 22 0 R +>> +endobj +22 0 obj +<< +/Title(Sequence and choice) +/A<< +/S/GoTo +/D(subsection.2.2) +>> +/Parent 20 0 R +/Prev 21 0 R +/Next 23 0 R +>> +endobj +23 0 obj +<< +/Title(Predictive parsers) +/A<< +/S/GoTo +/D(subsection.2.3) +>> +/Parent 20 0 R +/Prev 22 0 R +/Next 24 0 R +>> +endobj +24 0 obj +<< +/Title(Adding semantics) +/A<< +/S/GoTo +/D(subsection.2.4) +>> +/Parent 20 0 R +/Prev 23 0 R +/Next 25 0 R +>> +endobj +25 0 obj +<< +/Title(Sequences and seperators) +/A<< +/S/GoTo +/D(subsection.2.5) +>> +/Parent 20 0 R +/Prev 24 0 R +/Next 26 0 R +>> +endobj +26 0 obj +<< +/Title(Improving error messages) +/A<< +/S/GoTo +/D(subsection.2.6) +>> +/Parent 20 0 R +/Prev 25 0 R +/Next 27 0 R +>> +endobj +27 0 obj +<< +/Title(Expressions) +/A<< +/S/GoTo +/D(subsection.2.7) +>> +/Parent 20 0 R +/Prev 26 0 R +/Next 28 0 R +>> +endobj +28 0 obj +<< +/Title(Lexical analysis) +/A<< +/S/GoTo +/D(subsection.2.8) +>> +/Parent 20 0 R +/Prev 27 0 R +/Next 29 0 R +>> +endobj +30 0 obj +<< +/Title(Lexeme parsers \ No newline at end of file diff --git a/modules/python_magic/testdata/text-iso8859-1.txt b/modules/python_magic/testdata/text-iso8859-1.txt new file mode 100644 index 0000000000..524a1d0cf1 --- /dev/null +++ b/modules/python_magic/testdata/text-iso8859-1.txt @@ -0,0 +1,2 @@ +This is a web page encoded in iso-8859-1 +יטאשפגןמ diff --git a/modules/python_magic/testdata/text.txt b/modules/python_magic/testdata/text.txt new file mode 100644 index 0000000000..476f506d8c --- /dev/null +++ b/modules/python_magic/testdata/text.txt @@ -0,0 +1,2 @@ +Hello, World! +