Files
mayan-edms/mayan/apps/ocr/managers.py
Roberto Rosario ff03ea07ca Add support for appending pages
Add version upload form checkbox.
Add the append_pages keyword argument.

Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
2019-10-10 02:40:45 -04:00

126 lines
4.3 KiB
Python

from __future__ import unicode_literals
import logging
import sys
import traceback
from django.apps import apps
from django.conf import settings
from django.db import models, transaction
from mayan.apps.documents.literals import DOCUMENT_IMAGE_TASK_TIMEOUT
from mayan.apps.documents.tasks import (
task_generate_document_version_page_image
)
from .events import (
event_ocr_document_content_deleted, event_ocr_document_version_finish
)
from .runtime import ocr_backend
from .signals import post_document_version_ocr
logger = logging.getLogger(__name__)
class DocumentVesionPageOCRContentManager(models.Manager):
    """
    Manager with helpers to erase and to generate the OCR content of
    document version pages.
    """

    def delete_content_for(self, document, user=None):
        """
        Delete the OCR content entries of every page of `document` and
        commit the OCR content deleted event, all inside one transaction.

        `user` is passed as the actor of the event and `document` as its
        target.
        """
        with transaction.atomic():
            for page in document.pages.all():
                page_content_queryset = self.filter(
                    document_version_page=page.content_object
                )
                page_content_queryset.delete()

            event_ocr_document_content_deleted.commit(
                actor=user, target=document
            )

    def process_document_version_page(self, document_version_page):
        """
        Run the OCR backend on a single document version page and save the
        resulting text in its DocumentVersionPageOCRContent entry.

        The page image is produced by the page image generation task. This
        method blocks until the task returns the cache filename or until
        DOCUMENT_IMAGE_TASK_TIMEOUT expires.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

        # Resolved at call time through the app registry instead of being
        # imported at module level.
        DocumentVersionPageOCRContent = apps.get_model(
            app_label='ocr', model_name='DocumentVersionPageOCRContent'
        )

        image_task = task_generate_document_version_page_image.apply_async(
            kwargs={'document_version_page_id': document_version_page.pk}
        )

        # Wait for the task result, which is used below as the name of the
        # cached image file.
        cache_filename = image_task.get(
            timeout=DOCUMENT_IMAGE_TASK_TIMEOUT, disable_sync_subtasks=False
        )

        cache_file = document_version_page.cache_partition.get_file(
            filename=cache_filename
        )

        with cache_file.open() as file_object:
            # Only the instance is needed, the "created" flag is discarded.
            page_content = DocumentVersionPageOCRContent.objects.get_or_create(
                document_version_page=document_version_page
            )[0]

            page_content.content = ocr_backend.execute(
                file_object=file_object,
                language=document_version_page.document.language
            )
            page_content.save()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

    def process_document_version(self, document_version):
        """
        Run OCR on every page of `document_version`.

        Any exception raised while processing the pages is logged and stored
        as an entry in `document_version.ocr_errors`; it is not re-raised.
        When settings.DEBUG is true the stored entry also includes the
        formatted traceback.

        When all the pages are processed without error, the previous error
        entries are deleted, the OCR finished event is committed and the
        post_document_version_ocr signal is sent.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        try:
            for page in document_version.pages.all():
                self.process_document_version_page(document_version_page=page)
        except Exception as exception:
            logger.error(
                'OCR error for document version: %d; %s', document_version.pk,
                exception
            )

            if settings.DEBUG:
                # Build a report with the exception summary followed by
                # the formatted traceback entries.
                exception_class, exception_value, exception_tb = sys.exc_info()
                report_lines = [
                    '%s: %s' % (exception_class.__name__, exception_value)
                ]
                report_lines.extend(traceback.format_tb(exception_tb))
                error_result = '\n'.join(report_lines)
            else:
                error_result = exception

            document_version.ocr_errors.create(result=error_result)
            # Failure path ends here; the success steps below are skipped.
            return

        logger.info(
            'OCR complete for document version: %s', document_version
        )

        # A clean run makes the errors of previous attempts obsolete.
        document_version.ocr_errors.all().delete()

        event_ocr_document_version_finish.commit(
            action_object=document_version.document,
            target=document_version
        )

        post_document_version_ocr.send(
            sender=document_version.__class__, instance=document_version
        )
class DocumentTypeSettingsManager(models.Manager):
    """
    Manager that looks up settings entries through the natural key of the
    document type they belong to.
    """

    def get_by_natural_key(self, document_type_natural_key):
        """
        Return the entry whose document type matches
        `document_type_natural_key`.

        Raises this manager's model DoesNotExist when no document type
        matches the natural key.
        """
        DocumentType = apps.get_model(
            app_label='documents', model_name='DocumentType'
        )
        document_type_manager = DocumentType.objects

        try:
            matching_type = document_type_manager.get_by_natural_key(
                document_type_natural_key
            )
        except DocumentType.DoesNotExist:
            # Surface the failure as this model's own exception class
            # instead of the DocumentType one.
            raise self.model.DoesNotExist
        else:
            return self.get(document_type__pk=matching_type.pk)