Added the first text parser backend (PDF) and updated the requirements files and README

This commit is contained in:
Roberto Rosario
2011-07-18 04:06:59 -04:00
parent 5bfd607b31
commit d566dfbb1d
5 changed files with 56 additions and 0 deletions

View File

@@ -18,6 +18,8 @@ Python:
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
* python-magic - A python wrapper for libmagic
* django-taggit - Simple tagging for django
* slate - The simplest way to extract text from PDFs in Python
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.

View File

@@ -0,0 +1,40 @@
import codecs
import os
import subprocess
import tempfile
import sys
import slate
from django.utils.translation import ugettext as _
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
# Maps a mimetype string to a dictionary whose 'function' key holds the
# parser callable registered for that mimetype.
mimetype_registry = {}


def register_parser(mimetype, function):
    """
    Associate ``function`` with ``mimetype`` in ``mimetype_registry``.

    An entry already stored under the same mimetype is replaced.
    """
    entry = {'function': function}
    mimetype_registry[mimetype] = entry
def pdf_parser(document_page):
    """
    Extract the text of one PDF page and store it on ``document_page``.

    The owning document is opened and handed to ``slate.PDF``; the entry
    at index ``document_page.page_number - 1`` becomes the page content,
    the page label is set to a translated marker string and the page is
    saved.

    Raises ``ParserError`` when the requested page is missing from the
    slate result or when its text is just a form feed character.
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Release the file handle even when slate fails to parse the PDF.
        fd.close()

    try:
        # Look the page up once; page_number is presumably 1-based.
        page_text = pdf_pages[document_page.page_number - 1]
    except IndexError:
        # slate produced fewer pages than this document page refers to;
        # report it through the parser's own exception type.
        raise ParserError

    if page_text == '\x0c':
        # Only a form feed came back - presumably the page has no
        # extractable text, so let the caller deal with it.
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document.

    The parser is looked up in ``mimetype_registry`` under
    ``document_page.document.file_mimetype`` and called with
    ``document_page``.

    Raises ``ParserUnknownFile`` when no parser is registered for that
    mimetype. Exceptions raised by the parser itself propagate unchanged.
    """
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    # Called outside the ``try`` so that a KeyError raised inside the parser
    # is not mistaken for an unregistered mimetype.
    parser(document_page)
# Make the PDF backend the parser for 'application/pdf' documents.
register_parser(mimetype='application/pdf', function=pdf_parser)

View File

@@ -0,0 +1,10 @@
class ParserError(Exception):
    """
    Signals that a text parser could not make sense of the file it was
    given, or that the text it produced is not usable.
    """
class ParserUnknownFile(Exception):
    """
    Signals that no text parser is registered for a file's mimetype.
    """

View File

@@ -9,3 +9,5 @@ django-celery==2.2.2
django-sentry==1.6.0 django-sentry==1.6.0
django-taggit==0.9.3 django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7

View File

@@ -6,3 +6,5 @@ django-celery==2.2.2
django-sentry==1.6.0 django-sentry==1.6.0
django-taggit==0.9.3 django-taggit==0.9.3
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt -e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
slate==0.3
PIL==1.1.7