diff --git a/README.md b/README.md index e1e21d1eae..662d33ab2a 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ Python: * django-mptt - Utilities for implementing a modified pre-order traversal tree in django * python-magic - A python wrapper for libmagic * django-taggit - Simple tagging for django +* slate - The simplest way to extract text from PDFs in Python + Execute pip install -r requirements/production.txt to install the python/django dependencies automatically. diff --git a/apps/ocr/parsers/__init__.py b/apps/ocr/parsers/__init__.py new file mode 100644 index 0000000000..815e868747 --- /dev/null +++ b/apps/ocr/parsers/__init__.py @@ -0,0 +1,40 @@ +import codecs +import os +import subprocess +import tempfile +import sys + +import slate + +from django.utils.translation import ugettext as _ + +from ocr.parsers.exceptions import ParserError, ParserUnknownFile + +mimetype_registry = {} + + +def register_parser(mimetype, function): + mimetype_registry[mimetype] = {'function': function} + + +def pdf_parser(document_page): + fd = document_page.document.open() + pdf_pages = slate.PDF(fd) + fd.close() + + if pdf_pages[document_page.page_number - 1] == '\x0c': + raise ParserError + + document_page.content = pdf_pages[document_page.page_number - 1] + document_page.page_label = _(u'Text extracted from PDF') + document_page.save() + + +def parse_document_page(document_page): + try: + mimetype_registry[document_page.document.file_mimetype]['function'](document_page) + except KeyError: + raise ParserUnknownFile + + +register_parser('application/pdf', pdf_parser) diff --git a/apps/ocr/parsers/exceptions.py b/apps/ocr/parsers/exceptions.py new file mode 100644 index 0000000000..e06875f222 --- /dev/null +++ b/apps/ocr/parsers/exceptions.py @@ -0,0 +1,10 @@ +class ParserError(Exception): + """ + Raised when a text parser fails to understand a file it has been passed + or the resulting parsed text is invalid + """ + pass + + +class ParserUnknownFile(Exception): + pass diff 
--git a/requirements/development.txt b/requirements/development.txt index 00def8c63a..3acf630b4e 100644 --- a/requirements/development.txt +++ b/requirements/development.txt @@ -9,3 +9,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7 diff --git a/requirements/production.txt b/requirements/production.txt index 1f1d3a0881..02219abaee 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -6,3 +6,5 @@ django-celery==2.2.2 django-sentry==1.6.0 django-taggit==0.9.3 -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt +slate==0.3 +PIL==1.1.7