Add text parser and render using Pygments
This commit is contained in:
98
apps/common/textparser.py
Normal file
98
apps/common/textparser.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
from pygments import highlight
|
||||||
|
from pygments.lexers import TextLexer, guess_lexer, get_lexer_for_filename, ClassNotFound
|
||||||
|
from pygments.formatters import ImageFormatter
|
||||||
|
|
||||||
|
# Viewport geometry used when the caller does not supply its own.
DEFAULT_PAGE_WIDTH = 70  # characters per viewport line
DEFAULT_PAGE_HEIGHT = 57  # viewport lines per page
# Forwarded to ImageFormatter(line_number_pad=...) by render_to_image().
DEFAULT_LINE_NUMBER_PAD = 19
# Number of characters requested from the file per read() call.
CHUNKSIZE = 1024
NEWLINE = '\n'
SPACE = ' '

TEXT_PARSER_MIMETYPES = ['text/plain']


class TextParser(object):
    """
    Split a text file into fixed-size pages and optionally render those
    pages as images through Pygments.
    """

    def render_to_viewport(self, filename, page_width=DEFAULT_PAGE_WIDTH,
                           page_height=DEFAULT_PAGE_HEIGHT,
                           fill_last_page=False):
        """
        Render an input text file into an imaginary rectangular view port
        (terminal window), returning a list of pages which are themselves
        lists of lines.

        A line ends at every newline character and also as soon as it
        holds ``page_width`` characters; a page ends as soon as it holds
        ``page_height`` lines.

        Parameters:
            filename: path of the text file to read.
            page_width: maximum number of characters per line.
            page_height: maximum number of lines per page.
            fill_last_page: when true, pad the final partial page with
                single-space lines until it has ``page_height`` lines.

        Returns:
            A list of pages, each page being a list of strings without
            their trailing newline. An empty file yields one page (empty,
            or made only of filler lines when ``fill_last_page`` is set).
        """
        import sys

        # The 'U' flag was removed from open() in Python 3.11; Python 3
        # text mode already translates newlines, so the flag is only
        # needed (and only accepted) on Python 2.
        mode = 'r' if sys.version_info[0] >= 3 else 'rU'

        pages = []
        page = []
        line = []

        with open(filename, mode) as descriptor:
            chunk = descriptor.read(CHUNKSIZE)
            while chunk:
                for letter in chunk:
                    is_newline = letter == NEWLINE
                    if not is_newline:
                        line.append(letter)

                    # Break the line on an explicit newline or when the
                    # viewport width has been reached (hard wrap).
                    if is_newline or len(line) >= page_width:
                        page.append(''.join(line))
                        line = []

                        if len(page) >= page_height:
                            pages.append(page)
                            page = []

                chunk = descriptor.read(CHUNKSIZE)

        # Text after the last newline is still a line; keep it instead of
        # dropping it when the file does not end with a newline.
        if line:
            page.append(''.join(line))
            if len(page) >= page_height:
                pages.append(page)
                page = []

        # Emit the final partial page. An empty trailing page is only kept
        # when it is the sole page, so an empty file still yields one page
        # while a file ending exactly on a page boundary gains no blank one.
        if page or not pages:
            if fill_last_page:
                # Pad against the requested height, not the module default.
                page.extend([SPACE] * (page_height - len(page)))
            pages.append(page)

        return pages

    def _render_page(self, page, page_index, page_height, lexer,
                     line_numbers, line_number_pad):
        """
        Highlight one page (a list of lines) into an image, numbering its
        lines as a continuation of the preceding pages.
        """
        formatter = ImageFormatter(
            line_number_start=page_index * page_height + 1,
            line_numbers=line_numbers,
            line_number_pad=line_number_pad
        )
        return highlight('\n'.join(page), lexer, formatter)

    def render_to_image(self, filename, page_width=DEFAULT_PAGE_WIDTH,
                        page_height=DEFAULT_PAGE_HEIGHT, page_number=None,
                        lexer=None, line_numbers=True,
                        line_number_pad=DEFAULT_LINE_NUMBER_PAD):
        """
        Turn the pages of a text file into image representations,
        selecting the best lexer possible based on the filename and
        contents when none is given.

        Parameters:
            filename: path of the text file to render.
            page_width, page_height: viewport size, passed on to
                render_to_viewport().
            page_number: 1-based page to render; any false value (None
                or 0) renders every page.
            lexer: Pygments lexer to use; when false, one is chosen from
                the filename, then from the content, then TextLexer.
            line_numbers, line_number_pad: forwarded to ImageFormatter.

        Returns:
            The result of Pygments' highlight() for the requested page,
            or a list with one such result per page when no page number
            is given.

        Raises:
            IndexError: when ``page_number`` is beyond the last page.
        """
        pages = self.render_to_viewport(
            filename, page_width, page_height, fill_last_page=True
        )

        if not lexer:
            # Read entire file to guess the lexer
            with open(filename, 'rb') as descriptor:
                file_data = descriptor.read()

            try:
                lexer = get_lexer_for_filename(filename, file_data)
            except ClassNotFound:
                try:
                    lexer = guess_lexer(file_data)
                except ClassNotFound:
                    lexer = TextLexer()

        if page_number:
            # Render a single page into image
            return self._render_page(
                pages[page_number - 1], page_number - 1, page_height,
                lexer, line_numbers, line_number_pad
            )

        # Render all pages into images
        return [
            self._render_page(
                page, page_index, page_height, lexer, line_numbers,
                line_number_pad
            )
            for page_index, page in enumerate(pages)
        ]
|
||||||
|
|
||||||
|
|
||||||
|
# parser = TextParser()
|
||||||
|
# page_num = 1
|
||||||
|
# #for result in parser.render('docwrap.py'):#, 80):
|
||||||
|
# #for result in parser.render_to_image('input.txt'):#, 80):
|
||||||
|
# for result in parser.render_to_image('../apps/documents/views.py'):#, 80):
|
||||||
|
# FILE = open('page%d' % page_num, 'wb')
|
||||||
|
# FILE.write(result)
|
||||||
|
# FILE.close()
|
||||||
|
# page_num += 1
|
||||||
@@ -12,6 +12,7 @@ from converter.exceptions import OfficeConversionError
|
|||||||
from documents.utils import document_save_to_temp_dir
|
from documents.utils import document_save_to_temp_dir
|
||||||
from common.utils import copyfile
|
from common.utils import copyfile
|
||||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||||
|
from common.textparser import TextParser as OriginalTextParser
|
||||||
|
|
||||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||||
from ocr.conf.settings import PDFTOTEXT_PATH
|
from ocr.conf.settings import PDFTOTEXT_PATH
|
||||||
@@ -165,5 +166,29 @@ class PopplerParser(Parser):
|
|||||||
document_page.save()
|
document_page.save()
|
||||||
|
|
||||||
|
|
||||||
|
class TextParser(Parser):
    """
    Parser that fills a document page with the matching page of text
    taken from a plain text file.
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of ``document_page`` and save it on the page.

        Parameters:
            document_page: page object to update; its ``page_number``
                selects which rendered page is stored in ``content``.
            descriptor: optional source handed to ``copyfile``; when
                given, its data is copied to a temporary file and parsed
                from there, otherwise the document is saved to the
                temporary directory with document_save_to_temp_dir().

        Raises:
            IndexError: when the file has fewer pages than
                ``document_page.page_number``.
        """
        import os

        logger.debug('parsing with TextParser')
        page_number = int(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                # mkstemp() returns an already open OS-level handle that is
                # not used below; close it so it does not leak.
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing text page %s', page_number)

            parser = OriginalTextParser()
            pages = parser.render_to_viewport(filename=document_file)
        finally:
            # Only the scratch copy created here is removed; the file
            # returned by document_save_to_temp_dir() is left alone.
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    logger.debug('unable to remove temporary file: %s', temp_filepath)

        document_page.content = '\n'.join(pages[page_number - 1])
        document_page.page_label = _(u'Text extracted from file')
        document_page.save()
|
||||||
|
|
||||||
|
|
||||||
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
|
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
|
||||||
|
# Plain text documents are handled by the TextParser class defined above.
register_parser(mimetypes=[u'text/plain'], parsers=[TextParser])
|
||||||
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])
|
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])
|
||||||
|
|||||||
@@ -71,3 +71,4 @@ GitPython==0.3.2.RC1
|
|||||||
# Misc
|
# Misc
|
||||||
|
|
||||||
elementtree==1.2.7-20070827-preview
|
elementtree==1.2.7-20070827-preview
|
||||||
|
Pygments==1.5
|
||||||
|
|||||||
Reference in New Issue
Block a user