Added python-magic for smarter MIME type detection and a new document field: file_mime_encoding
This commit is contained in:
57
modules/python_magic/README
Normal file
57
modules/python_magic/README
Normal file
@@ -0,0 +1,57 @@
|
||||
= python-magic =
|
||||
|
||||
Adam Hupp <adam at hupp.org>
|
||||
|
||||
Distributed under the PSF License: http://www.python.org/psf/license/
|
||||
|
||||
python-magic is a simple wrapper for libmagic. libmagic identifies
|
||||
file types according to their headers. It is the core of the Unix
|
||||
"file" command.
|
||||
|
||||
= Installation =
|
||||
|
||||
To build and install run:
|
||||
|
||||
# python setup.py install
|
||||
|
||||
= Installation on Win32 =
|
||||
|
||||
You need magic1.dll from http://gnuwin32.sourceforge.net/, grab the
|
||||
binaries and dependencies ZIP-file, extract magic1.dll, regex2.dll
|
||||
and zlib1.dll and put them in C:\Windows\System32. You also need a
|
||||
magic file from Linux, compatible with file version 5.0.
|
||||
|
||||
To build and install run:
|
||||
|
||||
# python setup.py install
|
||||
|
||||
= Example Usage =
|
||||
|
||||
>>> import magic
|
||||
>>> m = magic.Magic()
|
||||
>>> m.from_file("testdata/test.pdf")
|
||||
'PDF document, version 1.2'
|
||||
>>> m.from_buffer(open("testdata/test.pdf").read(1024))
|
||||
'PDF document, version 1.2'
|
||||
|
||||
# For MIME types
|
||||
>>> mime = magic.Magic(mime=True)
|
||||
>>> mime.from_file("testdata/test.pdf")
|
||||
'application/pdf'
|
||||
>>>
|
||||
|
||||
# For MIME encoding
|
||||
>>> mime_encoding = magic.Magic(mime_encoding=True)
|
||||
>>> mime_encoding.from_file("testdata/text-iso8859-1.txt")
|
||||
'iso-8859-1'
|
||||
>>>
|
||||
|
||||
= Contributors =
|
||||
|
||||
Thanks to these folks on github who submitted features and bugfixes.
|
||||
|
||||
NicolasDelaby
|
||||
lukenowak
|
||||
FlaPer87
|
||||
SimpleSeb
|
||||
tehmaze
|
||||
0
modules/python_magic/__init__.py
Normal file
0
modules/python_magic/__init__.py
Normal file
230
modules/python_magic/magic.py
Normal file
230
modules/python_magic/magic.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
magic is a wrapper around the libmagic file identification library.
|
||||
|
||||
See README for more information.
|
||||
|
||||
Usage:
|
||||
|
||||
>>> import magic
|
||||
>>> magic.from_file("testdata/test.pdf")
|
||||
'PDF document, version 1.2'
|
||||
>>> magic.from_file("testdata/test.pdf", mime=True)
|
||||
'application/pdf'
|
||||
>>> magic.from_buffer(open("testdata/test.pdf").read(1024))
|
||||
'PDF document, version 1.2'
|
||||
>>>
|
||||
|
||||
|
||||
"""
|
||||
|
||||
import os.path
|
||||
import ctypes
|
||||
import ctypes.util
|
||||
|
||||
from ctypes import c_char_p, c_int, c_size_t, c_void_p
|
||||
|
||||
class MagicException(Exception):
    """Raised when libmagic reports an error or a handle cannot be created."""


class Magic:
    """
    Magic is a wrapper around the libmagic C library.

    Each instance owns one libmagic handle (`self.cookie`) which is closed
    when the instance is destroyed.
    """

    def __init__(self, mime=False, magic_file=None, mime_encoding=False):
        """
        Create a new libmagic wrapper.

        mime - if True, mimetypes are returned instead of textual descriptions
        mime_encoding - if True, codec is returned; ignored when `mime` is
                        also True, because `mime` takes precedence
        magic_file - use a mime database other than the system default

        raises MagicException if no handle could be created or the magic
        database could not be loaded
        """
        # Assigned before anything can fail so __del__ always finds it.
        self.cookie = None

        flags = MAGIC_NONE
        if mime:
            flags |= MAGIC_MIME
        elif mime_encoding:
            flags |= MAGIC_MIME_ENCODING

        cookie = magic_open(flags)
        if not cookie:
            # A NULL handle must never be handed to the other libmagic calls.
            raise MagicException("failed to create a libmagic handle")
        self.cookie = cookie

        magic_load(self.cookie, magic_file)

    def from_buffer(self, buf):
        """
        Identify the contents of `buf`
        """
        return magic_buffer(self.cookie, buf)

    def from_file(self, filename):
        """
        Identify the contents of file `filename`
        raises IOError if the file does not exist
        """
        if not os.path.exists(filename):
            raise IOError("File does not exist: " + filename)

        return magic_file(self.cookie, filename)

    def __del__(self):
        # `cookie` is absent when __init__ never ran, and module globals such
        # as magic_close may already be cleared during interpreter shutdown.
        cookie = getattr(self, "cookie", None)
        if cookie and magic_close:
            magic_close(cookie)
        self.cookie = None
|
||||
|
||||
# Lazily created, module-wide Magic instances shared by the helper functions.
_magic_mime = None
_magic = None


def _get_magic_mime():
    """Return the shared MIME-mode Magic instance, creating it on first use."""
    global _magic_mime
    if _magic_mime:
        return _magic_mime
    _magic_mime = Magic(mime=True)
    return _magic_mime
|
||||
|
||||
def _get_magic():
    """Return the shared default Magic instance, creating it on first use."""
    global _magic
    if _magic:
        return _magic
    _magic = Magic()
    return _magic
|
||||
|
||||
def _get_magic_type(mime):
    """Return the cached MIME detector when `mime` is true, else the default."""
    getter = _get_magic_mime if mime else _get_magic
    return getter()
|
||||
|
||||
def from_file(filename, mime=False):
    """
    Identify the file `filename` using a shared Magic instance.

    mime - if True, return the MIME type instead of a textual description
    """
    detector = _get_magic_type(mime)
    return detector.from_file(filename)
|
||||
|
||||
def from_buffer(buffer, mime=False):
    """
    Identify the contents of `buffer` using a shared Magic instance.

    mime - if True, return the MIME type instead of a textual description
    """
    detector = _get_magic_type(mime)
    return detector.from_buffer(buffer)
|
||||
|
||||
|
||||
|
||||
|
||||
libmagic = None
# Let's try to find magic or magic1
dll = ctypes.util.find_library('magic') or ctypes.util.find_library('magic1')

# This is necessary because find_library returns None if it doesn't find the library
if dll:
    try:
        libmagic = ctypes.CDLL(dll)
    except OSError:
        # Found but not loadable: fall through to the platform fallbacks so
        # the failure still surfaces as the ImportError raised below.
        libmagic = None

if not libmagic or not libmagic._name:
    import sys
    # Well-known locations tried when find_library comes up empty.
    platform_to_lib = {'darwin': '/opt/local/lib/libmagic.dylib',
                       'win32': 'magic1.dll'}
    if sys.platform in platform_to_lib:
        try:
            libmagic = ctypes.CDLL(platform_to_lib[sys.platform])
        except OSError:
            # Nothing at the fallback location either; reported below.
            pass

if not libmagic or not libmagic._name:
    # It is better to raise an ImportError since we are importing magic module
    raise ImportError('failed to find libmagic. Check your installation')

# Opaque libmagic handle type used in every prototype.
magic_t = ctypes.c_void_p
|
||||
|
||||
def errorcheck(result, func, args):
    """
    ctypes `errcheck` hook for the libmagic calls that install it.

    Asks libmagic for the error text of the handle passed as the first
    argument; raises MagicException with that text if there is one,
    otherwise hands `result` back unchanged.
    """
    message = magic_error(args[0])
    if message is None:
        return result
    raise MagicException(message)
|
||||
|
||||
def _bind(symbol, restype, argtypes, errcheck=None):
    """Look up `symbol` in libmagic and attach its ctypes prototype."""
    func = getattr(libmagic, symbol)
    func.restype = restype
    func.argtypes = argtypes
    if errcheck is not None:
        func.errcheck = errcheck
    return func


# Handle lifecycle and error reporting.
magic_open = _bind('magic_open', magic_t, [c_int])
magic_close = _bind('magic_close', None, [magic_t])
magic_error = _bind('magic_error', c_char_p, [magic_t])
magic_errno = _bind('magic_errno', c_int, [magic_t])

# Identification calls; failures are turned into MagicException by errorcheck.
magic_file = _bind('magic_file', c_char_p, [magic_t, c_char_p], errorcheck)
_magic_buffer = _bind('magic_buffer', c_char_p,
                      [magic_t, c_void_p, c_size_t], errorcheck)


def magic_buffer(cookie, buf):
    """Identify `buf`, passing its length to libmagic alongside the data."""
    return _magic_buffer(cookie, buf, len(buf))


# Database management and flag handling.
magic_load = _bind('magic_load', c_int, [magic_t, c_char_p], errorcheck)
magic_setflags = _bind('magic_setflags', c_int, [magic_t, c_int])
magic_check = _bind('magic_check', c_int, [magic_t, c_char_p])
magic_compile = _bind('magic_compile', c_int, [magic_t, c_char_p])
|
||||
|
||||
|
||||
|
||||
# Flag bits accepted by magic_open / magic_setflags, in ascending bit order.

# -- general behaviour --
MAGIC_NONE = 0x000000            # No flags
MAGIC_DEBUG = 0x000001           # Turn on debugging
MAGIC_SYMLINK = 0x000002         # Follow symlinks
MAGIC_COMPRESS = 0x000004        # Check inside compressed files
MAGIC_DEVICES = 0x000008         # Look at the contents of devices
MAGIC_MIME = 0x000010            # Return a mime string
MAGIC_CONTINUE = 0x000020        # Return all matches
MAGIC_CHECK = 0x000040           # Print warnings to stderr
MAGIC_PRESERVE_ATIME = 0x000080  # Restore access time on exit
MAGIC_RAW = 0x000100             # Don't translate unprintable chars
MAGIC_ERROR = 0x000200           # Handle ENOENT etc as real errors
MAGIC_MIME_ENCODING = 0x000400   # Return the MIME encoding

# -- checks that can be switched off --
MAGIC_NO_CHECK_COMPRESS = 0x001000  # Don't check for compressed files
MAGIC_NO_CHECK_TAR = 0x002000       # Don't check for tar files
MAGIC_NO_CHECK_SOFT = 0x004000      # Don't check magic entries
MAGIC_NO_CHECK_APPTYPE = 0x008000   # Don't check application type
MAGIC_NO_CHECK_ELF = 0x010000       # Don't check for elf details
MAGIC_NO_CHECK_ASCII = 0x020000     # Don't check for ascii files
MAGIC_NO_CHECK_TROFF = 0x040000     # Don't check ascii/troff
MAGIC_NO_CHECK_FORTRAN = 0x080000   # Don't check ascii/fortran
MAGIC_NO_CHECK_TOKENS = 0x100000    # Don't check ascii/tokens
|
||||
17
modules/python_magic/setup.py
Normal file
17
modules/python_magic/setup.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from setuptools import setup, Extension
|
||||
#from distutils.core import setup, Extension
|
||||
|
||||
# Package metadata for the libmagic ctypes wrapper (single module: magic.py).
setup(
    name='python-magic',
    version='0.4.0',
    description='File type identification using libmagic',
    long_description="""This module uses ctypes to access the libmagic file type
identification library. It makes use of the local magic database and
supports both textual and MIME-type output.
""",
    keywords="mime magic file",
    author='Adam Hupp',
    author_email='adam@hupp.org',
    url="http://github.com/ahupp/python-magic",
    license="PSF",
    py_modules=['magic'],
)
|
||||
65
modules/python_magic/test.py
Normal file
65
modules/python_magic/test.py
Normal file
@@ -0,0 +1,65 @@
|
||||
|
||||
import os.path
|
||||
import unittest
|
||||
import random
|
||||
from StringIO import StringIO
|
||||
from os import path
|
||||
from magic import Magic, MagicException
|
||||
|
||||
testfile = [
|
||||
("magic.pyc", "python 2.4 byte-compiled", "application/octet-stream"),
|
||||
("test.pdf", "PDF document, version 1.2", "application/pdf"),
|
||||
("test.gz", 'gzip compressed data, was "test", from Unix, last modified: '
|
||||
'Sat Jun 28 18:32:52 2008', "application/x-gzip"),
|
||||
("text.txt", "ASCII text", "text/plain"),
|
||||
]
|
||||
|
||||
testFileEncoding = [('text-iso8859-1.txt', 'iso-8859-1')]
|
||||
|
||||
class TestMagic(unittest.TestCase):
    """Check the textual descriptions reported for the sample files."""

    # Subclasses set this to True to exercise MIME-type output instead.
    mime = False

    def setUp(self):
        self.m = Magic(mime=self.mime)

    def testFileTypes(self):
        for filename, desc, mime in testfile:
            filename = path.join(path.dirname(__file__),
                                 "testdata",
                                 filename)
            if self.mime:
                target = mime
            else:
                target = desc

            # Read in binary mode (several fixtures are not text) and close
            # the handle deterministically instead of leaking it.
            with open(filename, "rb") as handle:
                head = handle.read(1024)

            self.assertEqual(target, self.m.from_buffer(head))
            self.assertEqual(target, self.m.from_file(filename), filename)

    def testErrors(self):
        self.assertRaises(IOError, self.m.from_file, "nonexistent")
        self.assertRaises(MagicException, Magic, magic_file="noneexistent")

        # Point MAGIC at a bogus database, then always put the environment
        # back (including any value that was set before the test ran).
        previous = os.environ.get('MAGIC')
        os.environ['MAGIC'] = '/nonexistetn'
        try:
            self.assertRaises(MagicException, Magic)
        finally:
            if previous is None:
                del os.environ['MAGIC']
            else:
                os.environ['MAGIC'] = previous
|
||||
|
||||
class TestMagicMime(TestMagic):
    """Re-run the TestMagic cases with MIME-type output enabled."""

    mime = True
|
||||
|
||||
class TestMagicMimeEncoding(unittest.TestCase):
    """Check the encoding names reported when mime_encoding=True."""

    def setUp(self):
        self.m = Magic(mime_encoding=True)

    def testFileEncoding(self):
        for filename, encoding in testFileEncoding:
            filename = path.join(path.dirname(__file__),
                                 "testdata",
                                 filename)

            # Binary mode keeps the non-ASCII fixture bytes untouched, and
            # the with-block closes the handle instead of leaking it.
            with open(filename, "rb") as handle:
                head = handle.read(1024)

            self.assertEqual(encoding, self.m.from_buffer(head))
            self.assertEqual(encoding, self.m.from_file(filename), filename)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
BIN
modules/python_magic/testdata/test.gz
vendored
Normal file
BIN
modules/python_magic/testdata/test.gz
vendored
Normal file
Binary file not shown.
199
modules/python_magic/testdata/test.pdf
vendored
Normal file
199
modules/python_magic/testdata/test.pdf
vendored
Normal file
@@ -0,0 +1,199 @@
|
||||
%PDF-1.2
|
||||
7 0 obj
|
||||
[5 0 R/XYZ 111.6 757.86]
|
||||
endobj
|
||||
13 0 obj
|
||||
<<
|
||||
/Title(About this document)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.1.1)
|
||||
>>
|
||||
/Parent 12 0 R
|
||||
/Next 14 0 R
|
||||
>>
|
||||
endobj
|
||||
15 0 obj
|
||||
<<
|
||||
/Title(Compiling with GHC)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsubsection.1.2.1)
|
||||
>>
|
||||
/Parent 14 0 R
|
||||
/Next 16 0 R
|
||||
>>
|
||||
endobj
|
||||
16 0 obj
|
||||
<<
|
||||
/Title(Compiling with Hugs)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsubsection.1.2.2)
|
||||
>>
|
||||
/Parent 14 0 R
|
||||
/Prev 15 0 R
|
||||
>>
|
||||
endobj
|
||||
14 0 obj
|
||||
<<
|
||||
/Title(Compatibility)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.1.2)
|
||||
>>
|
||||
/Parent 12 0 R
|
||||
/Prev 13 0 R
|
||||
/First 15 0 R
|
||||
/Last 16 0 R
|
||||
/Count -2
|
||||
/Next 17 0 R
|
||||
>>
|
||||
endobj
|
||||
17 0 obj
|
||||
<<
|
||||
/Title(Reporting bugs)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.1.3)
|
||||
>>
|
||||
/Parent 12 0 R
|
||||
/Prev 14 0 R
|
||||
/Next 18 0 R
|
||||
>>
|
||||
endobj
|
||||
18 0 obj
|
||||
<<
|
||||
/Title(History)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.1.4)
|
||||
>>
|
||||
/Parent 12 0 R
|
||||
/Prev 17 0 R
|
||||
/Next 19 0 R
|
||||
>>
|
||||
endobj
|
||||
19 0 obj
|
||||
<<
|
||||
/Title(License)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.1.5)
|
||||
>>
|
||||
/Parent 12 0 R
|
||||
/Prev 18 0 R
|
||||
>>
|
||||
endobj
|
||||
12 0 obj
|
||||
<<
|
||||
/Title(Introduction)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(section.1)
|
||||
>>
|
||||
/Parent 11 0 R
|
||||
/First 13 0 R
|
||||
/Last 19 0 R
|
||||
/Count -5
|
||||
/Next 20 0 R
|
||||
>>
|
||||
endobj
|
||||
21 0 obj
|
||||
<<
|
||||
/Title(Running a parser)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.1)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Next 22 0 R
|
||||
>>
|
||||
endobj
|
||||
22 0 obj
|
||||
<<
|
||||
/Title(Sequence and choice)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.2)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 21 0 R
|
||||
/Next 23 0 R
|
||||
>>
|
||||
endobj
|
||||
23 0 obj
|
||||
<<
|
||||
/Title(Predictive parsers)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.3)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 22 0 R
|
||||
/Next 24 0 R
|
||||
>>
|
||||
endobj
|
||||
24 0 obj
|
||||
<<
|
||||
/Title(Adding semantics)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.4)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 23 0 R
|
||||
/Next 25 0 R
|
||||
>>
|
||||
endobj
|
||||
25 0 obj
|
||||
<<
|
||||
/Title(Sequences and seperators)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.5)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 24 0 R
|
||||
/Next 26 0 R
|
||||
>>
|
||||
endobj
|
||||
26 0 obj
|
||||
<<
|
||||
/Title(Improving error messages)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.6)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 25 0 R
|
||||
/Next 27 0 R
|
||||
>>
|
||||
endobj
|
||||
27 0 obj
|
||||
<<
|
||||
/Title(Expressions)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.7)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 26 0 R
|
||||
/Next 28 0 R
|
||||
>>
|
||||
endobj
|
||||
28 0 obj
|
||||
<<
|
||||
/Title(Lexical analysis)
|
||||
/A<<
|
||||
/S/GoTo
|
||||
/D(subsection.2.8)
|
||||
>>
|
||||
/Parent 20 0 R
|
||||
/Prev 27 0 R
|
||||
/Next 29 0 R
|
||||
>>
|
||||
endobj
|
||||
30 0 obj
|
||||
<<
|
||||
/Title(Lexeme parsers
|
||||
2
modules/python_magic/testdata/text-iso8859-1.txt
vendored
Normal file
2
modules/python_magic/testdata/text-iso8859-1.txt
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
This is a web page encoded in iso-8859-1
|
||||
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
||||
2
modules/python_magic/testdata/text.txt
vendored
Normal file
2
modules/python_magic/testdata/text.txt
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
Hello, World!
|
||||
|
||||
Reference in New Issue
Block a user