Text parsers and OCR backends are now used in tandem for each document.

This commit is contained in:
Roberto Rosario
2015-08-08 04:49:08 -04:00
parent cf00ba2c40
commit bec85f38f4
9 changed files with 115 additions and 18 deletions

View File

@@ -38,7 +38,7 @@ class Parser(object):
).append(parser_class)
@classmethod
def process_document_version(cls, document_version):
def parse_document_version(cls, document_version):
try:
for parser_class in cls._registry[document_version.mimetype]:
try:
@@ -56,6 +56,24 @@ class Parser(object):
except KeyError:
raise NoMIMETypeMatch
@classmethod
def parse_document_page(cls, document_page):
    """
    Parse a single document page with the first parser that succeeds.

    The parser classes registered for the MIME type of the page's
    document version are tried in registration order; the first one
    whose ``process_document_page`` does not raise ``ParserError`` wins.

    Raises ``NoMIMETypeMatch`` when no parser is registered for the MIME
    type, or when every registered parser raised ``ParserError``.
    """
    mimetype = document_page.document_version.mimetype

    # Keep the KeyError guard on the registry lookup only, so that a
    # KeyError raised inside a parser is not misreported as a missing
    # MIME type.
    try:
        parser_classes = cls._registry[mimetype]
    except KeyError:
        raise NoMIMETypeMatch

    for parser_class in parser_classes:
        try:
            parser = parser_class()
            parser.process_document_page(document_page)
        except ParserError:
            # If parser raises error, try next parser in the list
            logger.debug(
                'Parser %s failed for document page: %s',
                parser_class, document_page
            )
        else:
            # If parser was successful there is no need to try
            # others in the list for this mimetype
            return

    raise NoMIMETypeMatch('Parser MIME type list exhausted')
def process_document_version(self, document_version):
logger.info('Starting parsing for document version: %s', document_version)
logger.debug('document version: %d', document_version.pk)
@@ -139,10 +157,14 @@ class PopplerParser(Parser):
raise ParserError
output = proc.stdout.read()
if output == b'\x0c':
logger.debug('Parser didn\'t return any output')
return ''
if output[-3:] == b'\x0a\x0a\x0c':
return output[:-3]
return output