Added the first text parser backend (PDF) and updated the requirements files and README
This commit is contained in:
@@ -18,6 +18,8 @@ Python:
|
|||||||
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
|
* django-mptt - Utilities for implementing a modified pre-order traversal tree in django
|
||||||
* python-magic - A python wrapper for libmagic
|
* python-magic - A python wrapper for libmagic
|
||||||
* django-taggit - Simple tagging for django
|
* django-taggit - Simple tagging for django
|
||||||
|
* slate - The simplest way to extract text from PDFs in Python
|
||||||
|
|
||||||
|
|
||||||
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.
|
Execute pip install -r requirements/production.txt to install the python/django dependencies automatically.
|
||||||
|
|
||||||
|
|||||||
40
apps/ocr/parsers/__init__.py
Normal file
40
apps/ocr/parsers/__init__.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
import codecs
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import slate
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext as _
|
||||||
|
|
||||||
|
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||||
|
|
||||||
|
# Maps a mimetype string to a dict whose 'function' key holds the parser
# callable registered for that mimetype (filled in by register_parser)
mimetype_registry = {}
|
||||||
|
|
||||||
|
|
||||||
|
def register_parser(mimetype, function):
    """
    Associate a parser callable with a mimetype.

    The callable is stored in ``mimetype_registry`` under ``mimetype``,
    wrapped in a dict with the single key ``'function'``.  Registering
    the same mimetype again replaces the previous entry.
    """
    entry = {'function': function}
    mimetype_registry[mimetype] = entry
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_parser(document_page):
    """
    Extract the text of one PDF page and store it on the page object.

    Opens the parent document, parses it with slate, copies the text found
    at index ``document_page.page_number - 1`` into
    ``document_page.content``, sets ``document_page.page_label`` and saves
    the page.

    Raises ParserError when the extracted page text is nothing but a form
    feed character.
    """
    fd = document_page.document.open()
    try:
        pdf_pages = slate.PDF(fd)
    finally:
        # Release the file even when slate fails to parse the document
        fd.close()

    # Look the page up once; page_number is offset by one to index the
    # list of pages returned by slate
    page_text = pdf_pages[document_page.page_number - 1]

    # A lone form feed means there is no usable text on this page
    if page_text == '\x0c':
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_document_page(document_page):
    """
    Run the parser registered for the mimetype of the page's document.

    The parser is looked up in ``mimetype_registry`` using
    ``document_page.document.file_mimetype`` and called with
    ``document_page``.

    Raises ParserUnknownFile when no parser is registered for that
    mimetype.  Exceptions raised by the parser itself propagate unchanged.
    """
    try:
        # Only the registry lookup is guarded, so that a KeyError raised
        # inside a parser is not misreported as an unknown file type
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    parser(document_page)
|
||||||
|
|
||||||
|
|
||||||
|
# Register the PDF parser for 'application/pdf' when this module is imported
register_parser('application/pdf', pdf_parser)
|
||||||
10
apps/ocr/parsers/exceptions.py
Normal file
10
apps/ocr/parsers/exceptions.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
class ParserError(Exception):
    """
    Raised when a text parser cannot make sense of the file it has been
    given, or when the text it extracted is not valid
    """
|
||||||
|
|
||||||
|
|
||||||
|
class ParserUnknownFile(Exception):
    """
    Raised when no parser is registered for the mimetype of a document
    """
|
||||||
@@ -9,3 +9,5 @@ django-celery==2.2.2
|
|||||||
django-sentry==1.6.0
|
django-sentry==1.6.0
|
||||||
django-taggit==0.9.3
|
django-taggit==0.9.3
|
||||||
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
||||||
|
slate==0.3
|
||||||
|
PIL==1.1.7
|
||||||
|
|||||||
@@ -6,3 +6,5 @@ django-celery==2.2.2
|
|||||||
django-sentry==1.6.0
|
django-sentry==1.6.0
|
||||||
django-taggit==0.9.3
|
django-taggit==0.9.3
|
||||||
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
-e git://github.com/django-mptt/django-mptt.git@0af02a95877041b2fd6d458bd95413dc1666c321#egg=django-mptt
|
||||||
|
slate==0.3
|
||||||
|
PIL==1.1.7
|
||||||
|
|||||||
Reference in New Issue
Block a user