Merge branch 'feature/text_renderer_parser' into development
This commit is contained in:
@@ -31,6 +31,7 @@
|
||||
font-size: 1.2em;
|
||||
margin: 0;
|
||||
padding: 1px 0;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.debug {
|
||||
|
||||
89
apps/common/textparser.py
Normal file
89
apps/common/textparser.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import codecs
import io

from pygments import highlight
from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound
from pygments.formatters import ImageFormatter
|
||||
|
||||
# Default viewport geometry, in characters per line and lines per page
DEFAULT_PAGE_WIDTH = 70
DEFAULT_PAGE_HEIGHT = 57
# Default value handed to ImageFormatter's line_number_pad option
DEFAULT_LINE_NUMBER_PAD = 19
# Number of characters requested from the file on every read
CHUNKSIZE = 1024
NEWLINE = u'\n'
SPACE = u' '

# Mimetypes that the TextParser is able to handle
TEXT_PARSER_MIMETYPES = ['text/plain', 'text/x-python', 'text/html', 'text/x-shellscript']


class TextParser(object):
    """
    Paginate text files into fixed size pages and render those pages as
    syntax highlighted images
    """

    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
        """
        Render an input text file into an imaginary squared view port
        (terminal window), returning a list of pages which are
        themselves a list of lines

        filename: path of the UTF-8 encoded text file to paginate
        page_width: maximum number of characters per line, longer lines
        are wrapped
        page_height: maximum number of lines per page
        fill_last_page: pad the last page with single space lines until
        it is page_height lines long

        At least one page is always returned, even for an empty file
        """
        pages = []
        page = []
        line = []

        # io.open translates every newline flavor to '\n' on Python 2.6+
        # and Python 3, without depending on the 'U' mode flag that
        # newer Python 3 releases reject
        with io.open(filename, 'r', encoding='utf-8') as descriptor:
            for chunk in iter(lambda: descriptor.read(CHUNKSIZE), u''):
                for letter in chunk:
                    if letter != NEWLINE:
                        line.append(letter)

                    # A line ends on an explicit newline or when it has
                    # reached the width of the view port
                    if letter == NEWLINE or len(line) >= page_width:
                        page.append(u''.join(line))
                        line = []

                        if len(page) >= page_height:
                            pages.append(page)
                            page = []

        # Flush the trailing text of a file that doesn't end in a newline
        if line:
            page.append(u''.join(line))

        # Append the final partial page; an empty one is only kept when
        # it is the sole page, so that empty files still produce a page
        if page or not pages:
            if fill_last_page:
                # Pad up to the requested height, not the default one
                page.extend([SPACE] * (page_height - len(page)))

            pages.append(page)

        return pages

    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
        """
        Turn a text file into a list of pages and lines and produce an
        image representation, selecting the best lexer possible based
        on the filename and contents

        page_number: 1 based number of the single page to render; when
        omitted (or 0) every page is rendered
        lexer: pygments lexer instance to use, guessed when not given

        Returns the formatter output of the requested page, or a list
        with the output of every page when no page_number is given.
        Raises IndexError if page_number is beyond the last page
        """
        pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)

        if not lexer:
            lexer = self._guess_lexer(filename)

        if page_number:
            # Render a single page into image
            return self._render_page(pages[page_number - 1], page_number - 1, page_height, lexer, line_numbers, line_number_pad)

        # Render all pages into images
        return [
            self._render_page(page, page_index, page_height, lexer, line_numbers, line_number_pad)
            for page_index, page in enumerate(pages)
        ]

    def _guess_lexer(self, filename):
        """
        Pick a lexer from the filename, then from the file contents,
        falling back to the plain text lexer
        """
        # Read entire file to guess the lexer
        with codecs.open(filename, 'r', 'utf-8') as descriptor:
            file_data = descriptor.read()

        try:
            return get_lexer_for_filename(filename, file_data)
        except ClassNotFound:
            try:
                return guess_lexer(file_data)
            except ClassNotFound:
                return TextLexer()

    def _render_page(self, page, page_index, page_height, lexer, line_numbers, line_number_pad):
        """
        Highlight the lines of one page, numbering them according to
        the 0 based position of the page in the document
        """
        formatter = ImageFormatter(
            line_number_start=page_index * page_height + 1,
            line_numbers=line_numbers,
            line_number_pad=line_number_pad
        )
        return highlight(u'\n'.join(page), lexer, formatter)
|
||||
@@ -88,12 +88,13 @@ class TextAreaDiv(forms.widgets.Widget):
|
||||
|
||||
def render(self, name, value, attrs=None):
|
||||
if value is None:
|
||||
value = ''
|
||||
final_attrs = self.build_attrs(attrs, name=name)
|
||||
result = mark_safe(u'<div%s>%s</div>' % (flatatt(final_attrs),
|
||||
conditional_escape(force_unicode(value))))
|
||||
value = u''
|
||||
|
||||
return mark_safe(result.replace('\n', '<br>'))
|
||||
flat_attrs = flatatt(self.build_attrs(attrs, name=name))
|
||||
content = conditional_escape(force_unicode(value))
|
||||
# Not needed for <pre> - .replace(u'\n', u'<br>').replace(u' ', u'&nbsp;')
|
||||
result = u'<pre%s>%s</pre>' % (flat_attrs, content)
|
||||
return mark_safe(result)
|
||||
|
||||
|
||||
# From: http://www.peterbe.com/plog/emailinput-html5-django
|
||||
|
||||
@@ -7,6 +7,8 @@ import logging
|
||||
from django.utils.encoding import smart_str
|
||||
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from common.textparser import TextParser, TEXT_PARSER_MIMETYPES
|
||||
from mimetype.api import get_mimetype
|
||||
|
||||
from .literals import (DEFAULT_PAGE_NUMBER,
|
||||
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
|
||||
@@ -21,6 +23,8 @@ from .exceptions import OfficeConversionError, UnknownFileFormat
|
||||
HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
text_parser = TextParser()
|
||||
TEXT_PARSER_FILE_SUFFIX = '_text_parser'
|
||||
|
||||
|
||||
def cache_cleanup(input_filepath, *args, **kwargs):
|
||||
@@ -55,7 +59,22 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
|
||||
if os.path.exists(output_filepath):
|
||||
return output_filepath
|
||||
|
||||
if office_converter:
|
||||
if not mimetype:
|
||||
with open(input_filepath, 'rb') as descriptor:
|
||||
mimetype2, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
|
||||
|
||||
logger.debug('mimetype: %s' % mimetype)
|
||||
|
||||
if mimetype in TEXT_PARSER_MIMETYPES:
|
||||
logger.debug('creating page image with TextParser')
|
||||
parser_output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([input_filepath, str(page), TEXT_PARSER_FILE_SUFFIX]))
|
||||
logger.debug('parser_output_filepath: %s', parser_output_filepath)
|
||||
with open(parser_output_filepath, 'wb') as descriptor:
|
||||
descriptor.write(text_parser.render_to_image(input_filepath, page_number=page))
|
||||
|
||||
input_filepath = parser_output_filepath
|
||||
mimetype = 'image/png'
|
||||
elif office_converter:
|
||||
try:
|
||||
office_converter.convert(input_filepath, mimetype=mimetype)
|
||||
if office_converter.exists:
|
||||
@@ -102,6 +121,15 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
|
||||
|
||||
|
||||
def get_page_count(input_filepath):
|
||||
# Try to determine the page count first with the TextParser
|
||||
with open(input_filepath, 'rb') as descriptor:
|
||||
mimetype, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
|
||||
logger.debug('mimetype: %s' % mimetype)
|
||||
if mimetype in TEXT_PARSER_MIMETYPES:
|
||||
logger.debug('getting page count with text parser')
|
||||
parser = TextParser()
|
||||
return len(parser.render_to_viewport(input_filepath))
|
||||
|
||||
logger.debug('office_converter: %s' % office_converter)
|
||||
if office_converter:
|
||||
try:
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
"""
|
||||
This file demonstrates two different styles of tests (one doctest and one
|
||||
unittest). These will both pass when you run "manage.py test".
|
||||
|
||||
Replace these with more appropriate tests for your application.
|
||||
"""
|
||||
|
||||
from django.test import TestCase
|
||||
|
||||
class SimpleTest(TestCase):
|
||||
def test_basic_addition(self):
|
||||
"""
|
||||
Tests that 1 + 1 always equals 2.
|
||||
"""
|
||||
self.failUnlessEqual(1 + 1, 2)
|
||||
|
||||
__test__ = {"doctest": """
|
||||
Another way to test that 1 + 1 is equal to 2.
|
||||
|
||||
>>> 1 + 1 == 2
|
||||
True
|
||||
"""}
|
||||
@@ -89,6 +89,8 @@ class DocumentPageForm_edit(forms.ModelForm):
|
||||
'page_label',
|
||||
'content',
|
||||
]
|
||||
self.fields['content'].widget.attrs.update({'class': 'text_area_div'})
|
||||
|
||||
page_image = forms.CharField(
|
||||
required=False, widget=DocumentPageImageWidget()
|
||||
)
|
||||
|
||||
@@ -41,7 +41,7 @@ Setting(
|
||||
namespace=namespace,
|
||||
name='AUTOMATIC_OCR',
|
||||
global_name='OCR_AUTOMATIC_OCR',
|
||||
default=False,
|
||||
default=True,
|
||||
description=_(u'Automatically queue newly created documents for OCR.')
|
||||
)
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ from converter.exceptions import OfficeConversionError
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
from common.utils import copyfile
|
||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
|
||||
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
from ocr.conf.settings import PDFTOTEXT_PATH
|
||||
@@ -165,5 +166,29 @@ class PopplerParser(Parser):
|
||||
document_page.save()
|
||||
|
||||
|
||||
class TextParser(Parser):
    """
    Parser that fills the content of a document page with the text of
    the matching page of a text file, as paginated by the common
    TextParser
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of the page and store it in the document page

        document_page: page instance whose content and page_label are
        updated and saved
        descriptor: optional source handed to copyfile to create the
        working copy; when omitted the document itself is saved to the
        temporary directory

        Raises IndexError if the file has fewer pages than
        document_page.page_number
        """
        logger.debug('parsing with TextParser')
        page_number = document_page.page_number

        if descriptor:
            # Reserve a unique path and release its handle right away,
            # the descriptor returned by mkstemp was never closed
            with tempfile.NamedTemporaryFile(dir=TEMPORARY_DIRECTORY, delete=False) as temporary_file:
                document_file = temporary_file.name

            copyfile(descriptor, document_file)
        else:
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing text page %s', page_number)

        parser = OriginalTextParser()
        pages = parser.render_to_viewport(filename=document_file)

        # Page numbers are 1 based, the list of pages is 0 based
        document_page.content = '\n'.join(pages[int(page_number) - 1])
        document_page.page_label = _(u'Text extracted from file')
        document_page.save()
|
||||
|
||||
|
||||
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
|
||||
register_parser(mimetypes=TEXT_PARSER_MIMETYPES, parsers=[TextParser])
|
||||
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])
|
||||
|
||||
@@ -13,11 +13,17 @@ class StylingNode(Node):
|
||||
for field_name, field in form.fields.items():
|
||||
|
||||
if isinstance(field.widget, forms.widgets.TextInput):
|
||||
field.widget.attrs['class'] = u'text_field'
|
||||
# Don't overwrite any existing CSS class, append
|
||||
css_class = field.widget.attrs.get('class', u'text_field')
|
||||
field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
|
||||
elif isinstance(field.widget, forms.widgets.PasswordInput):
|
||||
field.widget.attrs['class'] = u'text_field'
|
||||
# Don't overwrite any existing CSS class, append
|
||||
css_class = field.widget.attrs.get('class', u'text_field')
|
||||
field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
|
||||
elif isinstance(field.widget, forms.widgets.Textarea):
|
||||
field.widget.attrs['class'] = u'text_area'
|
||||
# Don't overwrite any existing CSS class, append
|
||||
css_class = field.widget.attrs.get('class', u'text_area')
|
||||
field.widget.attrs['class'] = u' '.join([css_class, 'text_area'])
|
||||
|
||||
context[self.form_name] = form
|
||||
return ''
|
||||
|
||||
@@ -71,3 +71,4 @@ GitPython==0.3.2.RC1
|
||||
# Misc
|
||||
|
||||
elementtree==1.2.7-20070827-preview
|
||||
Pygments==1.5
|
||||
|
||||
Reference in New Issue
Block a user