Documents now have their own dedicated DocumentPage submodel. The old DocumentPage is now called DocumentVersionPage. This allows mappings between document pages and document version pages, which in turn allows renumbering and appending pages. DocumentPages have a content_object to map them to any other object. For now they only map to DocumentVersionPages. A new option was added to the version upload form to append the pages of the new version. A new view was added to just append new pages; it wraps the new document version upload form and hides the append pages checkbox, which is set to True. Add a new action, reset_pages, to reset the pages of the document to those of the latest version. Still missing: tests for appending, and checks for the proper content_object in OCR and document parsing. Author: Roberto Rosario <roberto.rosario@mayan-edms.com> Date: Thu Oct 11 12:00:25 2019 -0400
126 lines
4.3 KiB
Python
126 lines
4.3 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import logging
|
|
import sys
|
|
import traceback
|
|
|
|
from django.apps import apps
|
|
from django.conf import settings
|
|
from django.db import models, transaction
|
|
|
|
from mayan.apps.documents.literals import DOCUMENT_IMAGE_TASK_TIMEOUT
|
|
from mayan.apps.documents.tasks import (
|
|
task_generate_document_version_page_image
|
|
)
|
|
|
|
from .events import (
|
|
event_ocr_document_content_deleted, event_ocr_document_version_finish
|
|
)
|
|
from .runtime import ocr_backend
|
|
from .signals import post_document_version_ocr
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DocumentVesionPageOCRContentManager(models.Manager):
    """
    Manager that runs the OCR backend over document version pages and
    removes the OCR content associated with a document.

    NOTE(review): the class name is misspelled ("Vesion"). It is left
    untouched because code outside this module may reference it.
    """

    def delete_content_for(self, document, user=None):
        """
        Delete the OCR content rows of every page of `document` and commit
        the "OCR document content deleted" event.

        The deletions and the event commit share one atomic transaction.
        `user`, when given, is recorded as the actor of the event.
        """
        with transaction.atomic():
            for page in document.pages.all():
                # OCR content rows are matched against the object the
                # document page maps to through `content_object`.
                mapped_page = page.content_object
                self.filter(document_version_page=mapped_page).delete()

            event_ocr_document_content_deleted.commit(
                target=document, actor=user
            )

    def process_document_version_page(self, document_version_page):
        """
        OCR a single document version page.

        Launches the page image generation task, waits for the resulting
        cache filename, feeds the cached file to the OCR backend and saves
        the backend output as the content of the page's OCR content row,
        creating that row when it does not exist yet.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

        # Resolved at call time through the app registry instead of a
        # module level import.
        ocr_content_model = apps.get_model(
            app_label='ocr', model_name='DocumentVersionPageOCRContent'
        )

        image_task = task_generate_document_version_page_image.apply_async(
            kwargs={'document_version_page_id': document_version_page.pk}
        )

        # Block until the image task returns the name of the cached file.
        cache_filename = image_task.get(
            disable_sync_subtasks=False, timeout=DOCUMENT_IMAGE_TASK_TIMEOUT
        )

        cache_file = document_version_page.cache_partition.get_file(
            filename=cache_filename
        )

        with cache_file.open() as file_object:
            ocr_content, _created = ocr_content_model.objects.get_or_create(
                document_version_page=document_version_page
            )
            ocr_content.content = ocr_backend.execute(
                file_object=file_object,
                language=document_version_page.document.language
            )
            ocr_content.save()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

    def process_document_version(self, document_version):
        """
        OCR every page of `document_version`.

        On failure the error is logged and stored in the version's
        `ocr_errors`; the stored text includes the formatted traceback
        only when `settings.DEBUG` is enabled. On success the previously
        stored errors are deleted, the "OCR finished" event is committed
        and the `post_document_version_ocr` signal is sent.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        try:
            for page in document_version.pages.all():
                self.process_document_version_page(document_version_page=page)
        except Exception as exception:
            logger.error(
                'OCR error for document version: %d; %s', document_version.pk,
                exception
            )

            if not settings.DEBUG:
                document_version.ocr_errors.create(result=exception)
            else:
                # In debug mode store the exception summary followed by
                # the formatted traceback entries.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                report = ['%s: %s' % (exc_type.__name__, exc_value)]
                report.extend(traceback.format_tb(exc_traceback))
                document_version.ocr_errors.create(result='\n'.join(report))

            # The failure has been recorded; skip the success path below.
            return

        logger.info('OCR complete for document version: %s', document_version)

        # A clean run invalidates any previously recorded errors.
        document_version.ocr_errors.all().delete()

        event_ocr_document_version_finish.commit(
            action_object=document_version.document,
            target=document_version
        )

        post_document_version_ocr.send(
            sender=document_version.__class__, instance=document_version
        )
|
|
|
|
|
|
class DocumentTypeSettingsManager(models.Manager):
    """
    Manager that looks up document type settings through the natural key
    of the document type they belong to.
    """

    def get_by_natural_key(self, document_type_natural_key):
        """
        Return the instance whose document type is identified by
        `document_type_natural_key`.

        Raises the `DoesNotExist` exception of this manager's model when
        no document type matches the natural key.
        """
        # Resolved through the app registry instead of a module level
        # import.
        document_type_model = apps.get_model(
            app_label='documents', model_name='DocumentType'
        )

        try:
            matched_type = document_type_model.objects.get_by_natural_key(
                document_type_natural_key
            )
        except document_type_model.DoesNotExist:
            # Report the miss using this manager's own model exception.
            raise self.model.DoesNotExist
        else:
            return self.get(document_type__pk=matched_type.pk)
|