diff --git a/apps/common/textparser.py b/apps/common/textparser.py
index 55963da71d..9f1df5661b 100644
--- a/apps/common/textparser.py
+++ b/apps/common/textparser.py
@@ -1,7 +1,8 @@
 import codecs
+import logging
 
 from pygments import highlight
-from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound
+from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound, get_lexer_for_mimetype
 from pygments.formatters import ImageFormatter
 
 DEFAULT_PAGE_WIDTH = 70
@@ -13,6 +14,8 @@ SPACE = u' '
 
 TEXT_PARSER_MIMETYPES = ['text/plain' ,'text/x-python', 'text/html', 'text/x-shellscript']
 
+logger = logging.getLogger(__name__)
+
 
 class TextParser(object):
     def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
@@ -56,26 +59,51 @@ class TextParser(object):
 
         return pages
 
-    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
+    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD, mimetype=None, actual_filename=None):
         """
-        Turn a list of pages and lines and product and image representation,
-        selecting the best parser possible based on the filename and contents
+        Turn a list of pages and lines into an image representation,
+        selecting the best lexer possible based on, in order, the mimetype,
+        the filename and the contents.
+
+        mimetype: optional; when given and known to Pygments it selects the
+        lexer directly, otherwise detection falls back to the filename
+        actual_filename: optional; used instead of filename for extension
+        based lexer detection
         """
         pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)
-        
+
+        if not lexer and mimetype:
+            try:
+                lexer = get_lexer_for_mimetype(mimetype)
+            except ClassNotFound:
+                # Unknown mimetype: fall through to filename/content detection
+                logger.debug('no lexer found for mimetype: %s', mimetype)
+            else:
+                logger.debug('get_lexer_for_mimetype returned: %s', lexer)
+
         if not lexer:
             # Read entire file to guess the lexer
             with codecs.open(filename, 'r', 'utf-8') as descriptor:
+                logger.debug('guessing lexer for file: %s', filename)
                 file_data = descriptor.read()
-                if not lexer:
-                    try:
-                        lexer = get_lexer_for_filename(filename, file_data)
-                    except ClassNotFound, err:
-                        try:
-                            lexer = guess_lexer(file_data)
-                        except ClassNotFound:
-                            lexer = TextLexer()
+
+            try:
+                # actual_filename, when given, takes precedence over the path that was read
+                lexer = get_lexer_for_filename(actual_filename or filename, file_data)
+            except ClassNotFound as err:
+                logger.debug('get_lexer_for_filename error: %s', err)
+                try:
+                    lexer = guess_lexer(file_data)
+                except ClassNotFound:
+                    logger.debug('unable to guess lexer')
+                    lexer = TextLexer()
+                else:
+                    logger.debug('guess_lexer returned: %s', lexer)
+            else:
+                logger.debug('get_lexer_for_filename returned: %s', lexer)
 
+        logger.debug('lexer: %s', lexer)
+
         if page_number:
             # Render a single page into image
             return highlight(u'\n'.join(pages[page_number - 1]), lexer, ImageFormatter(line_number_start=(page_number - 1) * page_height + 1, line_numbers=line_numbers, line_number_pad=line_number_pad))
diff --git a/apps/converter/api.py b/apps/converter/api.py
index d88445586d..56555eff0c 100644
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -70,7 +70,7 @@ def convert(input_filepath, output_filepath=None, cleanup_files=False, mimetype=
         parser_output_filepath = os.path.join(TEMPORARY_DIRECTORY, u''.join([input_filepath, str(page), TEXT_PARSER_FILE_SUFFIX]))
         logger.debug('parser_output_filepath: %s', parser_output_filepath)
         with open(parser_output_filepath, 'wb') as descriptor:
-            descriptor.write(text_parser.render_to_image(input_filepath, page_number=page))
+            descriptor.write(text_parser.render_to_image(input_filepath, mimetype=mimetype, page_number=page))
 
         input_filepath = parser_output_filepath
         mimetype = 'image/png'