Convert and cache office documents at the document version level for faster page image retrieval

This commit is contained in:
Roberto Rosario
2015-06-24 01:04:35 -04:00
parent 3d68e79654
commit b18888b3f7
7 changed files with 116 additions and 69 deletions

View File

@@ -39,12 +39,8 @@ class Python(ConverterBase):
new_file_object, input_filepath = tempfile.mkstemp()
if self.soffice_file_object:
os.write(new_file_object, self.soffice_file_object.read())
self.soffice_file_object.close()
else:
os.write(new_file_object, self.file_object.read())
self.file_object.seek(0)
os.write(new_file_object, self.file_object.read())
self.file_object.seek(0)
os.close(new_file_object)
@@ -57,6 +53,8 @@ class Python(ConverterBase):
fs_cleanup(input_filepath)
def get_page_count(self):
super(Python, self).get_page_count()
page_count = 1
if self.mime_type == 'application/pdf':
@@ -64,25 +62,24 @@ class Python(ConverterBase):
try:
pages = slate.PDF(self.file_object)
except Exception as exception:
logger.error('slate exception; %s', exception)
return 1
# TODO: Maybe return UnknownFileFormat to display proper unknown file format message in document description
logger.error('Slate exception; %s', exception)
raise
else:
return len(pages)
finally:
self.file_object.seek(0)
else:
try:
image = Image.open(self.file_object)
finally:
self.file_object.seek(0)
try:
image = Image.open(self.file_object)
finally:
self.file_object.seek(0)
try:
while True:
image.seek(image.tell() + 1)
page_count += 1
except EOFError:
# end of sequence
pass
try:
while True:
image.seek(image.tell() + 1)
page_count += 1
except EOFError:
# end of sequence
pass
return page_count
return page_count