import io

# Default size, in characters, of the imaginary view port
DEFAULT_PAGE_WIDTH = 70
DEFAULT_PAGE_HEIGHT = 57
DEFAULT_LINE_NUMBER_PAD = 19
# No longer used by TextParser (files are read line by line); kept so that
# existing references to the constant keep working
CHUNKSIZE = 1024
NEWLINE = u'\n'
SPACE = u' '

TEXT_PARSER_MIMETYPES = ['text/plain', 'text/x-python', 'text/html', 'text/x-shellscript']


class TextParser(object):
    """
    Split a UTF-8 text file into fixed size pages and, optionally, render
    those pages as syntax highlighted images
    """

    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
        """
        Render an input text file into an imaginary squared view port
        (terminal window), returning a list of pages which are themselves
        a list of lines

        filename: path of the UTF-8 encoded text file to read
        page_width: maximum number of characters per view port row, longer
            lines are wrapped into several rows
        page_height: maximum number of rows per page
        fill_last_page: when True, pad the last page with rows holding a
            single space until it is page_height rows long

        Always returns at least one page, an empty file produces a single
        (empty or padded) page.  Raises ValueError when page_width or
        page_height is smaller than 1.
        """
        if page_width < 1 or page_height < 1:
            raise ValueError('page_width and page_height must be positive integers')

        pages = []
        page = []
        # io.open in text mode translates '\r\n' and '\r' into '\n', which
        # codecs.open never did because it always opens files in binary mode
        with io.open(filename, 'r', encoding='utf-8') as descriptor:
            for text_line in descriptor:
                if text_line.endswith(NEWLINE):
                    text_line = text_line[:-1]

                # Wrap the line; an empty line still takes up one row.  A
                # line of exactly page_width characters is a single row, it
                # is not followed by a spurious empty one
                rows = [
                    text_line[start:start + page_width]
                    for start in range(0, len(text_line), page_width)
                ] or [u'']

                for row in rows:
                    page.append(row)
                    if len(page) >= page_height:
                        pages.append(page)
                        page = []

        # Keep the trailing partial page; an empty trailing page is only
        # kept when it is the only page (empty input file)
        if page or not pages:
            if fill_last_page:
                # Pad up to the requested height, not the default one
                page.extend([SPACE] * (page_height - len(page)))
            pages.append(page)

        return pages

    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
        """
        Turn the view port pages of a text file into an image
        representation, selecting the best lexer possible based on the
        filename and contents

        page_number: 1 based number of the single page to render; when it
            is None (or 0) every page is rendered
        lexer: pygments lexer instance to use instead of guessing one
        line_numbers, line_number_pad: passed to the pygments ImageFormatter

        Returns the output of pygments' highlight() for the requested page,
        or a list with one such output per page when no page was requested.
        Line numbers continue across pages and count view port rows.
        Raises IndexError when page_number is beyond the last page.
        """
        # Imported here so that only image rendering, and not page counting
        # with render_to_viewport, depends on pygments
        from pygments import highlight
        from pygments.formatters import ImageFormatter
        from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound

        pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)

        if not lexer:
            # Read entire file to guess the lexer
            with io.open(filename, 'r', encoding='utf-8') as descriptor:
                file_data = descriptor.read()

            try:
                lexer = get_lexer_for_filename(filename, file_data)
            except ClassNotFound:
                try:
                    lexer = guess_lexer(file_data)
                except ClassNotFound:
                    # Nothing matched, render as plain text
                    lexer = TextLexer()

        def render_page(page_index):
            # page_index is 0 based, line numbers start at 1
            formatter = ImageFormatter(
                line_number_start=page_index * page_height + 1,
                line_numbers=line_numbers,
                line_number_pad=line_number_pad
            )
            return highlight(NEWLINE.join(pages[page_index]), lexer, formatter)

        if page_number:
            # Render a single page into image
            return render_page(page_number - 1)

        # Render all pages into image
        return [render_page(page_index) for page_index in range(len(pages))]
')) + flat_attrs = flatatt(self.build_attrs(attrs, name=name)) + content = conditional_escape(force_unicode(value)) + # Not needed for
 - .replace(u'\n', u'
').replace(u' ', u' ') + result = u'%s
' % (flat_attrs, content) + return mark_safe(result) # From: http://www.peterbe.com/plog/emailinput-html5-django diff --git a/apps/converter/api.py b/apps/converter/api.py index 80001292cf..d88445586d 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -7,6 +7,8 @@ import logging from django.utils.encoding import smart_str from common.conf.settings import TEMPORARY_DIRECTORY +from common.textparser import TextParser, TEXT_PARSER_MIMETYPES +from mimetype.api import get_mimetype from .literals import (DEFAULT_PAGE_NUMBER, DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT) @@ -21,6 +23,8 @@ from .exceptions import OfficeConversionError, UnknownFileFormat HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() logger = logging.getLogger(__name__) +text_parser = TextParser() +TEXT_PARSER_FILE_SUFFIX = '_text_parser' def cache_cleanup(input_filepath, *args, **kwargs): @@ -55,7 +59,22 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype= if os.path.exists(output_filepath): return output_filepath - if office_converter: + if not mimetype: + with open(input_filepath, 'rb') as descriptor: + mimetype2, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True) + + logger.debug('mimetype: %s' % mimetype) + + if mimetype in TEXT_PARSER_MIMETYPES: + logger.debug('creating page image with TextParser') + parser_output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([input_filepath, str(page), TEXT_PARSER_FILE_SUFFIX])) + logger.debug('parser_output_filepath: %s', parser_output_filepath) + with open(parser_output_filepath, 'wb') as descriptor: + descriptor.write(text_parser.render_to_image(input_filepath, page_number=page)) + + input_filepath = parser_output_filepath + mimetype = 'image/png' + elif office_converter: try: office_converter.convert(input_filepath, mimetype=mimetype) if office_converter.exists: @@ -102,6 +121,15 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype= def 
get_page_count(input_filepath): + # Try to determine the page count first with the TextParser + with open(input_filepath, 'rb') as descriptor: + mimetype, encoding = get_mimetype(descriptor, input_filepath, mimetype_only=True) + logger.debug('mimetype: %s' % mimetype) + if mimetype in TEXT_PARSER_MIMETYPES: + logger.debug('getting page count with text parser') + parser = TextParser() + return len(parser.render_to_viewport(input_filepath)) + logger.debug('office_converter: %s' % office_converter) if office_converter: try: diff --git a/apps/converter/tests.py b/apps/converter/tests.py deleted file mode 100644 index 3b31148896..0000000000 --- a/apps/converter/tests.py +++ /dev/null @@ -1,22 +0,0 @@ -""" -This file demonstrates two different styles of tests (one doctest and one -unittest). These will both pass when you run "manage.py test". - -Replace these with more appropriate tests for your application. -""" - -from django.test import TestCase - -class SimpleTest(TestCase): - def test_basic_addition(self): - """ - Tests that 1 + 1 always equals 2. - """ - self.failUnlessEqual(1 + 1, 2) - -__test__ = {"doctest": """ -Another way to test that 1 + 1 is equal to 2. 
class TextParser(Parser):
    """
    Parser that stores one view port page of a text file as the content
    of a document page
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of the page matching document_page.page_number
        and save it, together with a page label, on document_page

        document_page: the page object to update; its page_number selects
            the (1 based) view port page to read
        descriptor: optional source to parse instead of the stored
            document; it is copied to a temporary file first

        Raises IndexError when the file has fewer view port pages than
        document_page.page_number.
        """
        # Local import, only needed to manage the temporary copy below
        import os

        logger.debug('parsing with TextParser')
        page_number = int(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                temp_handle, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                # mkstemp returns an already open OS level handle which is
                # not needed here; close it so it is not leaked
                os.close(temp_handle)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing text page %s' % page_number)

            pages = OriginalTextParser().render_to_viewport(filename=document_file)

            document_page.content = u'\n'.join(pages[page_number - 1])
            document_page.page_label = _(u'Text extracted from file')
            document_page.save()
        finally:
            # Only remove the copy created by this method; the file returned
            # by document_save_to_temp_dir is not owned by this parser
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    logger.debug('unable to remove temporary file: %s', temp_filepath)