Merge branch 'feature/text_renderer_parser' into development

2012-07-28 05:51:23 -04:00
parent 64c7804611 4c71139055
commit 03fa9f9fb7
10 changed files with 163 additions and 32 deletions
--- a/apps/common/static/css/override.css
+++ b/apps/common/static/css/override.css
@@ -31,6 +31,7 @@
    font-size: 1.2em;
    margin: 0;
    padding: 1px 0;
+    font-weight: bold;
 }

 .debug {
--- a/apps/common/textparser.py
+++ b/apps/common/textparser.py
@@ -0,0 +1,89 @@
+import codecs
+
+from pygments import highlight
+from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound
+from pygments.formatters import ImageFormatter
+
+DEFAULT_PAGE_WIDTH = 70
+DEFAULT_PAGE_HEIGHT = 57
+DEFAULT_LINE_NUMBER_PAD = 19
+CHUNKSIZE = 1024
+NEWLINE = u'\n'
+SPACE = u' '
+
+TEXT_PARSER_MIMETYPES = ['text/plain' ,'text/x-python', 'text/html', 'text/x-shellscript']
+
+
+class TextParser(object):
+    """Paginate a UTF-8 text file and render its pages with Pygments."""
+
+    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
+        """
+        Render an input text file into an imaginary rectangular view port
+        (terminal window) of page_width columns by page_height rows.
+
+        Lines longer than page_width are wrapped.  Returns a list of pages,
+        each a list of unicode lines; at least one page is always returned.
+        If fill_last_page is true the last page is padded with SPACE lines
+        up to page_height rows.
+        """
+        pages = []
+        page = []
+        line = []
+        with codecs.open(filename, 'rU', 'utf-8') as descriptor:
+            chunk = descriptor.read(CHUNKSIZE)
+            while chunk:
+                for letter in chunk:
+                    if letter != NEWLINE:
+                        line.append(letter)
+                        if len(line) < page_width:
+                            continue
+                    # A newline or a full row ends the current line
+                    page.append(u''.join(line))
+                    line = []
+                    if len(page) >= page_height:
+                        pages.append(page)
+                        page = []
+                chunk = descriptor.read(CHUNKSIZE)
+        # Keep trailing text that is not terminated by a newline
+        if line:
+            page.append(u''.join(line))
+        # Emit the last page unless the text ended exactly on a page boundary;
+        # an empty file still yields a single page
+        if page or not pages:
+            if fill_last_page:
+                # Pad with blank rows up to the requested page height
+                page.extend([SPACE] * (page_height - len(page)))
+            pages.append(page)
+        return pages
+
+    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
+        """
+        Turn the pages of a text file into image representations, selecting
+        the best lexer possible based on the filename and contents.
+        Returns the highlight() output for page page_number (1-based) when
+        it is given, otherwise a list with the output for every page.
+        """
+        pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)
+        if not lexer:
+            # Read entire file to guess the lexer
+            with codecs.open(filename, 'r', 'utf-8') as descriptor:
+                file_data = descriptor.read()
+            try:
+                lexer = get_lexer_for_filename(filename, file_data)
+            except ClassNotFound:
+                try:
+                    lexer = guess_lexer(file_data)
+                except ClassNotFound:
+                    lexer = TextLexer()
+
+        def render_page(page, index):
+            # index is 0-based; line numbers continue across pages
+            formatter = ImageFormatter(line_number_start=index * page_height + 1, line_numbers=line_numbers, line_number_pad=line_number_pad)
+            return highlight(u'\n'.join(page), lexer, formatter)
+
+        if page_number:
+            # Render a single page into image
+            return render_page(pages[page_number - 1], page_number - 1)
+        # Render all pages into images
+        return [render_page(page, index) for index, page in enumerate(pages)]
--- a/apps/common/widgets.py
+++ b/apps/common/widgets.py
@@ -88,12 +88,13 @@ class TextAreaDiv(forms.widgets.Widget):

    def render(self, name, value, attrs=None):
        if value is None:
-            value = ''
-        final_attrs = self.build_attrs(attrs, name=name)
-        result = mark_safe(u'<div%s>%s</div>' % (flatatt(final_attrs),
-            conditional_escape(force_unicode(value))))
+            value = u''

-        return mark_safe(result.replace('\n', '<br>'))
+        flat_attrs = flatatt(self.build_attrs(attrs, name=name))
+        content = conditional_escape(force_unicode(value))
+        # Not needed for <pre> - .replace(u'\n', u'<br>').replace(u' ', u'&nbsp;')
+        result = u'<pre%s>%s</pre>' % (flat_attrs, content)
+        return mark_safe(result)


 # From: http://www.peterbe.com/plog/emailinput-html5-django
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -7,6 +7,8 @@ import logging
 from django.utils.encoding import smart_str

 from common.conf.settings import TEMPORARY_DIRECTORY
+from common.textparser import TextParser, TEXT_PARSER_MIMETYPES
+from mimetype.api import get_mimetype

 from .literals import (DEFAULT_PAGE_NUMBER,
    DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT)
@@ -21,6 +23,8 @@ from .exceptions import OfficeConversionError, UnknownFileFormat
 HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()

 logger = logging.getLogger(__name__)
+text_parser = TextParser()
+TEXT_PARSER_FILE_SUFFIX = '_text_parser'


 def cache_cleanup(input_filepath, *args, **kwargs):
@@ -55,7 +59,22 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
    if os.path.exists(output_filepath):
        return output_filepath

-    if office_converter:
+    if not mimetype:
+        # Detect the mimetype when the caller did not supply one
+        with open(input_filepath, 'rb') as descriptor:
+            mimetype, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
+
+    logger.debug('mimetype: %s' % mimetype)
+
+    if mimetype in TEXT_PARSER_MIMETYPES:
+        logger.debug('creating page image with TextParser')
+        parser_output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([input_filepath, str(page), TEXT_PARSER_FILE_SUFFIX]))
+        logger.debug('parser_output_filepath: %s', parser_output_filepath)
+        with open(parser_output_filepath, 'wb') as descriptor:
+            descriptor.write(text_parser.render_to_image(input_filepath, page_number=page))
+        input_filepath = parser_output_filepath
+        mimetype = 'image/png'
+    elif office_converter:
        try:
            office_converter.convert(input_filepath, mimetype=mimetype)
            if office_converter.exists:
@@ -102,6 +121,15 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=


 def get_page_count(input_filepath):
+    # Try to determine the page count first with the TextParser
+    with open(input_filepath, 'rb') as descriptor:
+        mimetype, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True)
+        logger.debug('mimetype: %s' % mimetype)
+        if mimetype in TEXT_PARSER_MIMETYPES:
+            logger.debug('getting page count with text parser')
+            parser = TextParser()
+            return len(parser.render_to_viewport(input_filepath))
+  
    logger.debug('office_converter: %s' % office_converter)
    if office_converter:
        try:
--- a/apps/converter/tests.py
+++ b/apps/converter/tests.py
@@ -1,22 +0,0 @@
-"""
-This file demonstrates two different styles of tests (one doctest and one
-unittest). These will both pass when you run "manage.py test".
-
-Replace these with more appropriate tests for your application.
-"""
-
-from django.test import TestCase
-
-class SimpleTest(TestCase):
-    def test_basic_addition(self):
-        """
-        Tests that 1 + 1 always equals 2.
-        """
-        self.failUnlessEqual(1 + 1, 2)
-
-__test__ = {"doctest": """
-Another way to test that 1 + 1 is equal to 2.
-
->>> 1 + 1 == 2
-True
-"""}
--- a/apps/documents/forms.py
+++ b/apps/documents/forms.py
@@ -89,6 +89,8 @@ class DocumentPageForm_edit(forms.ModelForm):
            'page_label',
            'content',
        ]
+        self.fields['content'].widget.attrs.update({'class': 'text_area_div'})
+        
    page_image = forms.CharField(
        required=False, widget=DocumentPageImageWidget()
    )
--- a/apps/ocr/conf/settings.py
+++ b/apps/ocr/conf/settings.py
@@ -41,7 +41,7 @@ Setting(
    namespace=namespace,
    name='AUTOMATIC_OCR',
    global_name='OCR_AUTOMATIC_OCR',
-    default=False,
+    default=True,
    description=_(u'Automatically queue newly created documents for OCR.')
 )

--- a/apps/ocr/parsers/init.py
+++ b/apps/ocr/parsers/init.py
@@ -12,6 +12,7 @@ from converter.exceptions import OfficeConversionError
 from documents.utils import document_save_to_temp_dir
 from common.utils import copyfile
 from common.conf.settings import TEMPORARY_DIRECTORY
+from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES

 from ocr.parsers.exceptions import ParserError, ParserUnknownFile
 from ocr.conf.settings import PDFTOTEXT_PATH
@@ -165,5 +166,29 @@ class PopplerParser(Parser):
        document_page.save()


+class TextParser(Parser):
+    """Parser that stores one viewport page of a text file as page content."""
+    def parse(self, document_page, descriptor=None):
+        """Save the text of document_page's page, read from descriptor if given."""
+        import os  # local import: only needed to close the mkstemp() handle
+        logger.debug('parsing with TextParser')
+        if descriptor:
+            destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
+            # mkstemp() returns an open OS-level handle; close it to avoid a leak
+            os.close(destination_descriptor)
+            copyfile(descriptor, temp_filepath)
+            document_file = temp_filepath
+        else:
+            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
+        logger.debug('document_file: %s', document_file)
+        logger.debug('parsing text page %s', document_page.page_number)
+        pages = OriginalTextParser().render_to_viewport(filename=document_file)
+        # Page numbers start at 1 while the page list is indexed from 0
+        document_page.content = u'\n'.join(pages[int(document_page.page_number) - 1])
+        document_page.page_label = _(u'Text extracted from file')
+        document_page.save()
+        
+
 register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
+register_parser(mimetypes=TEXT_PARSER_MIMETYPES, parsers=[TextParser])
 register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])
--- a/apps/web_theme/templatetags/styling.py
+++ b/apps/web_theme/templatetags/styling.py
@@ -13,11 +13,17 @@ class StylingNode(Node):
        for field_name, field in form.fields.items():

            if isinstance(field.widget, forms.widgets.TextInput):
-                field.widget.attrs['class'] = u'text_field'
+                # Don't overwrite any existing CSS class, append
+                css_class = field.widget.attrs.get('class', u'text_field')
+                field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
            elif isinstance(field.widget, forms.widgets.PasswordInput):
-                field.widget.attrs['class'] = u'text_field'
+                # Don't overwrite any existing CSS class, append
+                css_class = field.widget.attrs.get('class', u'text_field')
+                field.widget.attrs['class'] = u' '.join([css_class, 'text_field'])
            elif isinstance(field.widget, forms.widgets.Textarea):
-                field.widget.attrs['class'] = u'text_area'
+                # Don't overwrite any existing CSS class, append
+                css_class = field.widget.attrs.get('class', u'text_area')
+                field.widget.attrs['class'] = u' '.join([css_class, 'text_area'])

        context[self.form_name] = form
        return ''
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -71,3 +71,4 @@ GitPython==0.3.2.RC1
 # Misc

 elementtree==1.2.7-20070827-preview
+Pygments==1.5