# Default view port geometry, in characters per line and lines per page.
DEFAULT_PAGE_WIDTH = 70
DEFAULT_PAGE_HEIGHT = 57
DEFAULT_LINE_NUMBER_PAD = 19
# Number of characters read from the input file per read() call.
CHUNKSIZE = 1024
NEWLINE = '\n'
SPACE = ' '

TEXT_PARSER_MIMETYPES = ['text/plain']


class TextParser(object):
    """Paginate plain text files and render the pages with Pygments."""

    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, fill_last_page=False):
        """
        Render an input text file into an imaginary rectangular view port
        (terminal window).

        Lines longer than ``page_width`` characters are wrapped and a new
        page is started every ``page_height`` lines.

        filename: path of the text file to read.
        page_width: maximum number of characters per line.
        page_height: maximum number of lines per page.
        fill_last_page: when True, pad the final partial page with
            single-space lines until it holds ``page_height`` lines.

        Returns a list of pages, each page being a list of line strings
        without their newline characters.  At least one page is always
        returned, so an empty file yields ``[[]]`` (or one page made only
        of filler lines when ``fill_last_page`` is set).
        """
        import sys

        # Python 2 needs the 'U' flag to get universal newline translation;
        # Python 3 text mode translates newlines by default and rejects
        # the flag starting with version 3.11.
        mode = 'rU' if sys.version_info[0] < 3 else 'r'

        pages = []
        page = []
        line = []
        with open(filename, mode) as descriptor:
            chunk = descriptor.read(CHUNKSIZE)
            while chunk:
                for letter in chunk:
                    if letter != NEWLINE:
                        line.append(letter)

                    # A line ends on an explicit newline or when it reaches
                    # the right edge of the view port
                    if letter == NEWLINE or len(line) >= page_width:
                        page.append(''.join(line))
                        line = []
                        if len(page) >= page_height:
                            pages.append(page)
                            page = []

                chunk = descriptor.read(CHUNKSIZE)

        # Flush a trailing line that was not terminated by a newline,
        # otherwise the end of the file would be silently dropped
        if line:
            page.append(''.join(line))
            if len(page) >= page_height:
                pages.append(page)
                page = []

        # Append the final partial page; an empty one is only kept when it
        # is the sole page, so callers can always address page number 1
        if page or not pages:
            if fill_last_page:
                # Pad up to the requested page height, not the module default
                page.extend([SPACE] * (page_height - len(page)))
            pages.append(page)

        return pages

    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH, page_height=DEFAULT_PAGE_HEIGHT, page_number=None, lexer=None, line_numbers=True, line_number_pad=DEFAULT_LINE_NUMBER_PAD):
        """
        Turn the pages of a text file into image representations,
        selecting the best lexer possible based on the filename and the
        file contents when none is given.

        filename: path of the text file to render.
        page_width, page_height: view port geometry passed on to
            ``render_to_viewport``.
        page_number: 1-based number of the single page to render; when
            it is None (or 0) every page is rendered.
        lexer: Pygments lexer instance to use; guessed when not given.
        line_numbers, line_number_pad: passed on to ``ImageFormatter``.

        Returns the output of ``highlight`` for the selected page, or a
        list with one such output per page when no page is selected.
        Raises IndexError when ``page_number`` is beyond the last page.
        """
        pages = self.render_to_viewport(filename, page_width, page_height, fill_last_page=True)

        if not lexer:
            # Read entire file to guess the lexer
            with open(filename, 'rb') as descriptor:
                file_data = descriptor.read()

            try:
                lexer = get_lexer_for_filename(filename, file_data)
            except ClassNotFound:
                # The filename gave no hint, fall back to content analysis
                try:
                    lexer = guess_lexer(file_data)
                except ClassNotFound:
                    lexer = TextLexer()

        if page_number:
            # Render a single page into image, numbering its lines as a
            # continuation of the preceding pages
            formatter = ImageFormatter(line_number_start=(page_number - 1) * page_height + 1, line_numbers=line_numbers, line_number_pad=line_number_pad)
            return highlight('\n'.join(pages[page_number - 1]), lexer, formatter)

        # Render all pages into images
        output = []
        for page_index, page in enumerate(pages):
            formatter = ImageFormatter(line_number_start=page_index * page_height + 1, line_numbers=line_numbers, line_number_pad=line_number_pad)
            output.append(highlight('\n'.join(page), lexer, formatter))

        return output
class TextParser(Parser):
    """Parser that fills a document page with text taken from a plain text file."""

    def parse(self, document_page, descriptor=None):
        """
        Store the text of one view port page as the content of
        ``document_page``.

        document_page: page object whose ``page_number`` selects the page
            to extract; its ``content`` and ``page_label`` are set and the
            object is saved.
        descriptor: optional source handed to ``copyfile`` to populate a
            temporary file; when omitted the document is saved to the
            temporary directory with ``document_save_to_temp_dir``.

        Raises IndexError when the file has fewer pages than the page
        number of ``document_page``.
        """
        import os

        logger.debug('parsing with TextParser')
        pagenum = str(document_page.page_number)

        if descriptor:
            destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
            # mkstemp returns an already open OS-level handle that is not
            # used here; close it so that it does not leak on every call
            os.close(destination_descriptor)
            copyfile(descriptor, temp_filepath)
            document_file = temp_filepath
        else:
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

        logger.debug('document_file: %s', document_file)

        logger.debug('parsing text page %s', pagenum)

        parser = OriginalTextParser()

        # Pages are numbered from 1 while the viewport list is 0-based
        pages = parser.render_to_viewport(filename=document_file)
        document_page.content = '\n'.join(pages[int(pagenum) - 1])
        document_page.page_label = _(u'Text extracted from file')
        document_page.save()