Merge branch 'feature/text_renderer_parser' into development

This commit is contained in:
Roberto Rosario
2012-07-28 05:51:23 -04:00
10 changed files with 163 additions and 32 deletions

View File

@@ -31,6 +31,7 @@
font-size: 1.2em;
margin: 0;
padding: 1px 0;
font-weight: bold;
}
.debug {

89
apps/common/textparser.py Normal file
View File

@@ -0,0 +1,89 @@
import codecs
import io

from pygments import highlight
from pygments.formatters import ImageFormatter
from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound
# Default view port size, in characters per line and lines per page
DEFAULT_PAGE_WIDTH = 70
DEFAULT_PAGE_HEIGHT = 57
# Default value handed to the image formatter's ``line_number_pad`` option
DEFAULT_LINE_NUMBER_PAD = 19
# Number of characters requested from the file on every read
CHUNKSIZE = 1024
NEWLINE = u'\n'
SPACE = u' '
# Mimetypes that callers route through the TextParser
TEXT_PARSER_MIMETYPES = ['text/plain', 'text/x-python', 'text/html', 'text/x-shellscript']


class TextParser(object):
    """
    Split a UTF-8 text file into fixed size pages and, optionally, render
    those pages as syntax highlighted images using pygments
    """

    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
        """
        Render an input text file into an imaginary squared view port
        (terminal window), returning a list of pages which are themselves
        a list of lines

        filename: path of the UTF-8 encoded text file to read
        page_width: maximum number of characters per line, longer lines
        are wrapped onto the following line
        page_height: maximum number of lines per page
        fill_last_page: pad the last page with single space lines until
        it holds page_height lines

        At least one page is always returned, an empty file produces a
        single page with no lines (or only filler lines).
        """
        pages = []
        page = []
        line = []

        # io.open decodes the text and translates '\r\n' and '\r' into
        # '\n'; codecs.open always reads in binary mode so it never did
        with io.open(filename, 'r', encoding='utf-8') as descriptor:
            chunk = descriptor.read(CHUNKSIZE)
            while chunk:
                for letter in chunk:
                    # Wrap lazily: a full line is only closed when another
                    # character arrives, so a line of exactly page_width
                    # characters is not followed by a phantom empty line
                    if letter == NEWLINE or len(line) >= page_width:
                        page.append(u''.join(line))
                        line = []
                        if len(page) >= page_height:
                            pages.append(page)
                            page = []

                    if letter != NEWLINE:
                        line.append(letter)

                chunk = descriptor.read(CHUNKSIZE)

        # Keep the last line even when the file has no trailing newline
        if line:
            page.append(u''.join(line))

        # Append any final partial page; skip it when it is empty unless
        # it is the only page (empty file)
        if page or not pages:
            # Fill any final partial page with empty lines
            if fill_last_page:
                page.extend([SPACE] * (page_height - len(page)))

            pages.append(page)

        return pages

    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
        """
        Turn a list of pages and lines into an image representation,
        selecting the best lexer possible based on the filename and
        contents

        page_number: 1 based number of the single page to render; when it
        is None (or any other false value) every page is rendered
        lexer: pygments lexer instance to use; when not given one is
        chosen from the filename, then from the file contents, finally
        falling back to the plain TextLexer

        Returns the output of pygments' highlight() for the requested
        page, or a list with one such output per page.  Raises IndexError
        if page_number is beyond the last page.
        """
        pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)

        if not lexer:
            # Read entire file to guess the lexer
            with io.open(filename, 'r', encoding='utf-8') as descriptor:
                file_data = descriptor.read()

            try:
                lexer = get_lexer_for_filename(filename, file_data)
            except ClassNotFound:
                try:
                    lexer = guess_lexer(file_data)
                except ClassNotFound:
                    lexer = TextLexer()

        def render_page(page, page_index):
            # Line numbers continue from the previous page
            formatter = ImageFormatter(line_number_start=page_index * page_height + 1, line_numbers=line_numbers, line_number_pad=line_number_pad)
            return highlight(u'\n'.join(page), lexer, formatter)

        if page_number:
            # Render a single page into image
            return render_page(pages[page_number - 1], page_number - 1)

        # Render all pages into image
        return [render_page(page, page_index) for page_index, page in enumerate(pages)]

View File

@@ -88,12 +88,13 @@ class TextAreaDiv(forms.widgets.Widget):
def render(self, name, value, attrs=None):
if value is None:
value = ''
final_attrs = self.build_attrs(attrs, name=name)
result = mark_safe(u'<div%s>%s</div>' % (flatatt(final_attrs),
conditional_escape(force_unicode(value))))
value = u''
return mark_safe(result.replace('\n', '<br>'))
flat_attrs = flatatt(self.build_attrs(attrs, name=name))
content = conditional_escape(force_unicode(value))
# Not needed for <pre> - .replace(u'\n', u'<br>').replace(u' ', u'&nbsp;')
result = u'<pre%s>%s</pre>' % (flat_attrs, content)
return mark_safe(result)
# From: http://www.peterbe.com/plog/emailinput-html5-django

View File

@@ -7,6 +7,8 @@ import logging
from django.utils.encoding import smart_str
from common.conf.settings import TEMPORARY_DIRECTORY
from common.textparser import TextParser, TEXT_PARSER_MIMETYPES
from mimetype.api import get_mimetype
from .literals import (DEFAULT_PAGE_NUMBER,
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
@@ -21,6 +23,8 @@ from .exceptions import OfficeConversionError, UnknownFileFormat
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
logger = logging.getLogger(__name__)
text_parser = TextParser()
TEXT_PARSER_FILE_SUFFIX = '_text_parser'
def cache_cleanup(input_filepath, *args, **kwargs):
@@ -55,7 +59,22 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
if os.path.exists(output_filepath):
return output_filepath
if office_converter:
if not mimetype:
with open(input_filepath, 'rb') as descriptor:
mimetype2, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
logger.debug('mimetype: %s' % mimetype)
if mimetype in TEXT_PARSER_MIMETYPES:
logger.debug('creating page image with TextParser')
parser_output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([input_filepath, str(page), TEXT_PARSER_FILE_SUFFIX]))
logger.debug('parser_output_filepath: %s', parser_output_filepath)
with open(parser_output_filepath, 'wb') as descriptor:
descriptor.write(text_parser.render_to_image(input_filepath, page_number=page))
input_filepath = parser_output_filepath
mimetype = 'image/png'
elif office_converter:
try:
office_converter.convert(input_filepath, mimetype=mimetype)
if office_converter.exists:
@@ -102,6 +121,15 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
def get_page_count(input_filepath):
# Try to determine the page count first with the TextParser
with open(input_filepath, 'rb') as descriptor:
mimetype, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
logger.debug('mimetype: %s' % mimetype)
if mimetype in TEXT_PARSER_MIMETYPES:
logger.debug('getting page count with text parser')
parser = TextParser()
return len(parser.render_to_viewport(input_filepath))
logger.debug('office_converter: %s' % office_converter)
if office_converter:
try:

View File

@@ -1,22 +0,0 @@
"""
This file demonstrates two different styles of tests (one doctest and one
unittest). These will both pass when you run "manage.py test".
Replace these with more appropriate tests for your application.
"""
from django.test import TestCase
class SimpleTest(TestCase):
    """
    Demonstration unittest style test case.
    """

    def test_basic_addition(self):
        """
        Tests that 1 + 1 always equals 2.
        """
        # assertEqual is the supported spelling of the deprecated
        # failUnlessEqual alias
        self.assertEqual(1 + 1, 2)
__test__ = {"doctest": """
Another way to test that 1 + 1 is equal to 2.
>>> 1 + 1 == 2
True
"""}

View File

@@ -89,6 +89,8 @@ class DocumentPageForm_edit(forms.ModelForm):
'page_label',
'content',
]
self.fields['content'].widget.attrs.update({'class': 'text_area_div'})
page_image = forms.CharField(
required=False, widget=DocumentPageImageWidget()
)

View File

@@ -41,7 +41,7 @@ Setting(
namespace=namespace,
name='AUTOMATIC_OCR',
global_name='OCR_AUTOMATIC_OCR',
default=False,
default=True,
description=_(u'Automatically queue newly created documents for OCR.')
)

View File

@@ -12,6 +12,7 @@ from converter.exceptions import OfficeConversionError
from documents.utils import document_save_to_temp_dir
from common.utils import copyfile
from common.conf.settings import TEMPORARY_DIRECTORY
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
from ocr.conf.settings import PDFTOTEXT_PATH
@@ -165,5 +166,29 @@ class PopplerParser(Parser):
document_page.save()
class TextParser(Parser):
    """
    Parser that fills a document page's content with the matching page of
    plain text produced by common.textparser.TextParser
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of a single page and save it on document_page

        document_page: page object whose page_number selects the text
        page; its content and page_label are overwritten and saved
        descriptor: optional source of the file contents; when given it
        is copied to a temporary file, otherwise the page's document is
        saved to the temporary directory first

        Raises IndexError if page_number is beyond the last text page.
        """
        # Local import: only the fix below needs it
        import os

        logger.debug('parsing with TextParser')
        page_number = int(document_page.page_number)

        if descriptor:
            destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
            # mkstemp returns an already open OS level handle; the copy
            # below works on the path, so close the handle instead of
            # leaking one descriptor per parsed page
            os.close(destination_descriptor)
            copyfile(descriptor, temp_filepath)
            document_file = temp_filepath
        else:
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing text page %s', page_number)

        # NOTE(review): the temporary file is not removed here - confirm
        # whether the caller or a cleanup task is expected to do it
        parser = OriginalTextParser()
        pages = parser.render_to_viewport(filename=document_file)
        # Page numbers are 1 based, the list of pages is 0 based
        document_page.content = '\n'.join(pages[page_number - 1])
        document_page.page_label = _(u'Text extracted from file')
        document_page.save()
# Register the parser classes to use for each group of mimetypes
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
register_parser(mimetypes=TEXT_PARSER_MIMETYPES, parsers=[TextParser])
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])

View File

@@ -13,11 +13,17 @@ class StylingNode(Node):
for field_name, field in form.fields.items():
if isinstance(field.widget, forms.widgets.TextInput):
field.widget.attrs['class'] = u'text_field'
# Don't overwrite any existing CSS class, append
css_class = field.widget.attrs.get('class', u'text_field')
field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
elif isinstance(field.widget, forms.widgets.PasswordInput):
field.widget.attrs['class'] = u'text_field'
# Don't overwrite any existing CSS class, append
css_class = field.widget.attrs.get('class', u'text_field')
field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
elif isinstance(field.widget, forms.widgets.Textarea):
field.widget.attrs['class'] = u'text_area'
# Don't overwrite any existing CSS class, append
css_class = field.widget.attrs.get('class', u'text_area')
field.widget.attrs['class'] = u' '.join([css_class, 'text_area'])
context[self.form_name] = form
return ''

View File

@@ -71,3 +71,4 @@ GitPython==0.3.2.RC1
# Misc
elementtree==1.2.7-20070827-preview
Pygments==1.5