Refactor the OCR app. Remove document parsing. Move OCR processing to

the model manager. Add submit and finish events.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-23 02:04:57 -04:00
parent 2052caada4
commit 317d07a355
20 changed files with 309 additions and 497 deletions

View File

@@ -3,12 +3,12 @@ from __future__ import unicode_literals
from django.contrib import admin
from .models import (
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
DocumentPageOCRContent, DocumentTypeSettings, DocumentVersionOCRError
)
@admin.register(DocumentPageContent)
class DocumentPageContentAdmin(admin.ModelAdmin):
@admin.register(DocumentPageOCRContent)
class DocumentPageOCRContentAdmin(admin.ModelAdmin):
list_display = ('document_page',)

View File

@@ -6,9 +6,9 @@ from rest_framework.response import Response
from documents.models import Document, DocumentPage, DocumentVersion
from rest_api.permissions import MayanPermission
from .models import DocumentPageContent
from .models import DocumentPageOCRContent
from .permissions import permission_ocr_content_view, permission_ocr_document
from .serializers import DocumentPageContentSerializer
from .serializers import DocumentPageOCRContentSerializer
class APIDocumentOCRView(generics.GenericAPIView):
@@ -67,7 +67,7 @@ class APIDocumentVersionOCRView(generics.GenericAPIView):
return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentPageContentView(generics.RetrieveAPIView):
class APIDocumentPageOCRContentView(generics.RetrieveAPIView):
"""
Returns the OCR content of the selected document page.
---
@@ -82,7 +82,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
'GET': (permission_ocr_content_view,),
}
permission_classes = (MayanPermission,)
serializer_class = DocumentPageContentSerializer
serializer_class = DocumentPageOCRContentSerializer
queryset = DocumentPage.objects.all()
def retrieve(self, request, *args, **kwargs):
@@ -90,8 +90,8 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
try:
ocr_content = instance.ocr_content
except DocumentPageContent.DoesNotExist:
ocr_content = DocumentPageContent.objects.none()
except DocumentPageOCRContent.DoesNotExist:
ocr_content = DocumentPageOCRContent.objects.none()
serializer = self.get_serializer(ocr_content)
return Response(serializer.data)

View File

@@ -1,11 +1,13 @@
from __future__ import unicode_literals
from datetime import timedelta
import logging
from kombu import Exchange, Queue
from django.apps import apps
from django.db.models.signals import post_save
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
from acls import ModelPermission
@@ -21,7 +23,10 @@ from mayan.celery import app
from navigation import SourceColumn
from rest_api.classes import APIEndPoint
from .handlers import initialize_new_ocr_settings, post_version_upload_ocr
from .events import event_ocr_document_version_submit
from .handlers import (
handler_initialize_new_ocr_settings, handler_ocr_document_version,
)
from .links import (
link_document_content, link_document_ocr_download,
link_document_ocr_erros_list, link_document_submit,
@@ -36,17 +41,17 @@ logger = logging.getLogger(__name__)
def document_ocr_submit(self):
from .tasks import task_do_ocr
task_do_ocr.apply_async(args=(self.latest_version.pk,))
self.latest_version.submit_for_ocr()
def document_version_ocr_submit(self):
from .tasks import task_do_ocr
event_ocr_document_version_submit.commit(target=self)
task_do_ocr.apply_async(
eta=now() + timedelta(seconds=settings_db_sync_task_delay.value),
kwargs={'document_version_pk': self.pk},
countdown=settings_db_sync_task_delay.value
)
@@ -155,10 +160,12 @@ class OCRApp(MayanAppConfig):
)
post_save.connect(
initialize_new_ocr_settings,
dispatch_uid='initialize_new_ocr_settings', sender=DocumentType
dispatch_uid='ocr_handler_initialize_new_ocr_settings',
receiver=handler_initialize_new_ocr_settings,
sender=DocumentType
)
post_version_upload.connect(
post_version_upload_ocr, dispatch_uid='post_version_upload_ocr',
dispatch_uid='ocr_handler_ocr_document_version',
receiver=handler_ocr_document_version,
sender=DocumentVersion
)

View File

@@ -1,80 +1,9 @@
from __future__ import unicode_literals
import logging
from django.utils.module_loading import import_string
from converter import converter_class
from documents.runtime import cache_storage_backend
from .exceptions import NoMIMETypeMatch, ParserError
from .models import DocumentPageContent
from .parsers import Parser
from .settings import setting_ocr_backend
logger = logging.getLogger(__name__)
# Extracts the text of document pages: a registered file parser is tried first
# and the configured OCR backend is used as the fallback.
class TextExtractor(object):
@classmethod
def perform_ocr(cls, document_page):
# The OCR backend class is resolved from its dotted path setting on every call.
ocr_backend_class = import_string(setting_ocr_backend.value)
backend = ocr_backend_class()
backend.process_document_page(document_page)
@classmethod
def process_document_page(cls, document_page):
"""
Extract the text of a document version's page. Try parsing the page
first; if there is no parser for the MIME type, the parser fails, or the
parser returns nothing, fall back to doing an OCR of the page.
"""
try:
Parser.parse_document_page(document_page=document_page)
except (NoMIMETypeMatch, ParserError):
cls.perform_ocr(document_page=document_page)
else:
# Parsing succeeded but produced empty content: OCR the page instead.
if not document_page.ocr_content.content:
cls.perform_ocr(document_page=document_page)
@classmethod
def process_document_version(cls, document_version):
# Process every page of the version, one at a time.
for document_page in document_version.pages.all():
cls.process_document_page(document_page=document_page)
class OCRBackendBase(object):
def process_document_version(self, document_version):
logger.info('Starting OCR for document version: %s', document_version)
logger.debug('document version: %d', document_version.pk)
for document_page in document_version.pages.all():
self.process_document_page(document_page=document_page)
def process_document_page(self, document_page):
logger.info(
'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
)
cache_filename = document_page.generate_image()
with cache_storage_backend.open(cache_filename) as file_object:
document_page_content, created = DocumentPageContent.objects.get_or_create(
document_page=document_page
)
document_page_content.content = self.execute(
file_object=file_object,
language=document_page.document.language
)
document_page_content.save()
logger.info(
'Finished processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
)
def execute(self, file_object, language=None, transformations=None):
self.language = language

14
mayan/apps/ocr/events.py Normal file
View File

@@ -0,0 +1,14 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from events.classes import Event
# Event recorded when a document version is submitted for OCR.
event_ocr_document_version_submit = Event(
    label=_('Document version submitted for OCR'),
    name='ocr_document_version_submit'
)

# Event recorded when the OCR of a document version has finished.
event_ocr_document_version_finish = Event(
    label=_('Document version OCR finished'),
    name='ocr_document_version_finish'
)

View File

@@ -6,17 +6,3 @@ class OCRError(Exception):
Raised by the OCR backend
"""
pass
class ParserError(Exception):
"""
Base exception for file parsers
"""
pass
class NoMIMETypeMatch(ParserError):
"""
There is no parser registered for the specified MIME type
"""
pass

View File

@@ -9,17 +9,17 @@ from django.utils.translation import ugettext_lazy as _, ugettext
from common.widgets import TextAreaDiv
from documents.models import DocumentType
from .models import DocumentPageContent
from .models import DocumentPageOCRContent
class DocumentContentForm(forms.Form):
class DocumentOCRContentForm(forms.Form):
"""
Form that concatenates all of a document pages' text content into a
single textarea widget
"""
def __init__(self, *args, **kwargs):
self.document = kwargs.pop('instance', None)
super(DocumentContentForm, self).__init__(*args, **kwargs)
super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
content = []
self.fields['contents'].initial = ''
try:
@@ -30,7 +30,7 @@ class DocumentContentForm(forms.Form):
for page in document_pages:
try:
page_content = page.ocr_content.content
except DocumentPageContent.DoesNotExist:
except DocumentPageOCRContent.DoesNotExist:
pass
else:
content.append(conditional_escape(force_text(page_content)))

View File

@@ -9,14 +9,7 @@ from .settings import setting_auto_ocr
logger = logging.getLogger(__name__)
def post_version_upload_ocr(sender, instance, **kwargs):
logger.debug('received post_version_upload')
logger.debug('instance pk: %s', instance.pk)
if instance.document.document_type.ocr_settings.auto_ocr:
instance.submit_for_ocr()
def initialize_new_ocr_settings(sender, instance, **kwargs):
def handler_initialize_new_ocr_settings(sender, instance, **kwargs):
DocumentTypeSettings = apps.get_model(
app_label='ocr', model_name='DocumentTypeSettings'
)
@@ -25,3 +18,10 @@ def initialize_new_ocr_settings(sender, instance, **kwargs):
DocumentTypeSettings.objects.create(
document_type=instance, auto_ocr=setting_auto_ocr.value
)
def handler_ocr_document_version(sender, instance, **kwargs):
    """
    Signal handler that submits the received document version for OCR when
    the OCR settings of its document type have automatic OCR enabled.
    """
    logger.debug('received post_version_upload')
    logger.debug('instance pk: %s', instance.pk)

    ocr_settings = instance.document.document_type.ocr_settings
    if not ocr_settings.auto_ocr:
        # Automatic OCR is disabled for this document type; nothing to do.
        return

    instance.submit_for_ocr()

View File

@@ -0,0 +1,82 @@
from __future__ import unicode_literals
import logging
import sys
import traceback
from django.apps import apps
from django.conf import settings
from django.db import models
from documents.runtime import cache_storage_backend
from .events import event_ocr_document_version_finish
from .runtime import ocr_backend
from .signals import post_document_version_ocr
logger = logging.getLogger(__name__)
class DocumentPageOCRContentManager(models.Manager):
    """
    Manager that runs the OCR backend over document versions and pages and
    stores the extracted text in DocumentPageOCRContent instances.
    """

    def process_document_version(self, document_version):
        """
        OCR every page of the given document version.

        If processing any page raises, an entry is added to the document
        version's ``ocr_errors`` (including the traceback when
        ``settings.DEBUG`` is enabled) and the exception is not re-raised.
        On success the existing ``ocr_errors`` entries are deleted, the OCR
        finish event is committed and the ``post_document_version_ocr``
        signal is sent.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        try:
            for document_page in document_version.pages.all():
                self.process_document_page(document_page=document_page)
        except Exception as exception:
            # NOTE(review): every exception is swallowed here, including
            # database errors, so a caller that retries on a specific
            # exception type would never see it -- confirm this is intended.

            # Log the primary key: '%d' cannot format a model instance and
            # the failed formatting would discard the error message.
            logger.error(
                'OCR error for document version: %d; %s',
                document_version.pk, exception
            )

            if settings.DEBUG:
                # Store the exception summary followed by the traceback.
                exc_type, exc_value, exc_traceback = sys.exc_info()
                result = ['%s: %s' % (exc_type.__name__, exc_value)]
                result.extend(traceback.format_tb(exc_traceback))

                document_version.ocr_errors.create(
                    result='\n'.join(result)
                )
            else:
                document_version.ocr_errors.create(result=exception)
        else:
            logger.info(
                'OCR complete for document version: %s', document_version
            )
            # A successful run supersedes any previously recorded errors.
            document_version.ocr_errors.all().delete()

            event_ocr_document_version_finish.commit(target=document_version)

            post_document_version_ocr.send(
                sender=document_version.__class__, instance=document_version
            )

    def process_document_page(self, document_page):
        """
        Generate the image of the document page, run the OCR backend on it
        using the document's language and save the resulting text as the
        page's OCR content, creating the content row if it does not exist.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        # The model is looked up through the app registry instead of being
        # imported at module level.
        DocumentPageOCRContent = apps.get_model(
            app_label='ocr', model_name='DocumentPageOCRContent'
        )

        # TODO: Call task and wait
        cache_filename = document_page.generate_image()

        with cache_storage_backend.open(cache_filename) as file_object:
            document_page_content, created = DocumentPageOCRContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = ocr_backend.execute(
                file_object=file_object,
                language=document_page.document.language
            )
            document_page_content.save()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-08-23 05:53
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
# Creates the DocumentPageOCRContent model, removes the DocumentPageContent
# model and updates the options and the datetime_submitted field of
# DocumentVersionOCRError.
class Migration(migrations.Migration):
dependencies = [
('documents', '0040_auto_20170725_1111'),
('ocr', '0005_auto_20170630_1846'),
]
operations = [
# NOTE(review): no operation here copies rows from DocumentPageContent into
# the new model before the old one is deleted below, so existing content
# appears to be discarded -- confirm this is intended.
migrations.CreateModel(
name='DocumentPageOCRContent',
fields=[
(
'id', models.AutoField(
auto_created=True, primary_key=True, serialize=False,
verbose_name='ID'
)
),
(
'content', models.TextField(
blank=True, verbose_name='Content'
)
),
(
'document_page', models.OneToOneField(
on_delete=django.db.models.deletion.CASCADE,
# NOTE(review): the DocumentPageOCRContent model declares
# related_name='ocr_content' while this migration uses 'ocr' -- verify
# that a follow-up migration reconciles the two.
related_name='ocr', to='documents.DocumentPage',
verbose_name='Document page'
)
),
],
options={
'verbose_name': 'Document page OCR content',
'verbose_name_plural': 'Document pages OCR contents',
},
),
# The field of the old model is removed before the model itself is deleted.
migrations.RemoveField(
model_name='documentpagecontent',
name='document_page',
),
migrations.AlterModelOptions(
name='documentversionocrerror',
options={
'ordering': ('datetime_submitted',),
'verbose_name': 'Document version OCR error',
'verbose_name_plural': 'Document version OCR errors'
},
),
# datetime_submitted is now set only on creation (auto_now_add).
migrations.AlterField(
model_name='documentversionocrerror',
name='datetime_submitted',
field=models.DateTimeField(
auto_now_add=True, db_index=True,
verbose_name='Date time submitted'
),
),
migrations.DeleteModel(
name='DocumentPageContent',
),
]

View File

@@ -6,6 +6,8 @@ from django.utils.translation import ugettext_lazy as _
from documents.models import DocumentPage, DocumentType, DocumentVersion
from .managers import DocumentPageOCRContentManager
class DocumentTypeSettings(models.Model):
"""
@@ -25,6 +27,24 @@ class DocumentTypeSettings(models.Model):
verbose_name_plural = _('Document types settings')
# Model that stores the OCR text content of a single document page.
@python_2_unicode_compatible
class DocumentPageOCRContent(models.Model):
# One content row per page, deleted together with the page (CASCADE) and
# reachable from the page through the ``ocr_content`` attribute.
document_page = models.OneToOneField(
DocumentPage, on_delete=models.CASCADE, related_name='ocr_content',
verbose_name=_('Document page')
)
# The text of the page; may be blank.
content = models.TextField(blank=True, verbose_name=_('Content'))
# Custom manager that provides the OCR processing entry points.
objects = DocumentPageOCRContentManager()
def __str__(self):
# Represented by the text form of the related document page.
return force_text(self.document_page)
class Meta:
verbose_name = _('Document page OCR content')
verbose_name_plural = _('Document pages OCR contents')
@python_2_unicode_compatible
class DocumentVersionOCRError(models.Model):
document_version = models.ForeignKey(
@@ -32,7 +52,7 @@ class DocumentVersionOCRError(models.Model):
verbose_name=_('Document version')
)
datetime_submitted = models.DateTimeField(
auto_now=True, db_index=True, verbose_name=_('Date time submitted')
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
)
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
@@ -41,24 +61,5 @@ class DocumentVersionOCRError(models.Model):
class Meta:
ordering = ('datetime_submitted',)
verbose_name = _('Document Version OCR Error')
verbose_name_plural = _('Document Version OCR Errors')
@python_2_unicode_compatible
class DocumentPageContent(models.Model):
"""
Model that describes a document page content
"""
document_page = models.OneToOneField(
DocumentPage, on_delete=models.CASCADE, related_name='ocr_content',
verbose_name=_('Document page')
)
content = models.TextField(blank=True, verbose_name=_('Content'))
def __str__(self):
return force_text(self.document_page)
class Meta:
verbose_name = _('Document page content')
verbose_name_plural = _('Document pages contents')
verbose_name = _('Document version OCR error')
verbose_name_plural = _('Document version OCR errors')

View File

@@ -1,202 +0,0 @@
from __future__ import unicode_literals
from io import BytesIO
import logging
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess
from django.utils.translation import ugettext_lazy as _
from common.utils import copyfile, fs_cleanup, mkstemp
from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)
class Parser(object):
    """
    Parser base class.

    Keeps a registry that maps each MIME type to an ordered list of parser
    classes. Subclasses implement execute() to return the text of one page.
    """
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append every class in parser_classes to the parser list of each MIME
        type in mimetypes.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    @classmethod
    def _get_parser_classes(cls, mimetype):
        """
        Return the parser classes registered for mimetype or raise
        NoMIMETypeMatch when none are registered.
        """
        # Only the registry lookup is guarded so that a KeyError raised by
        # parser code is not mistaken for a missing MIME type.
        try:
            return cls._registry[mimetype]
        except KeyError:
            raise NoMIMETypeMatch

    @classmethod
    def _try_parsers(cls, mimetype, method_name, instance):
        """
        Call method_name(instance) on an instance of each parser class
        registered for mimetype until one finishes without a ParserError.
        Raise NoMIMETypeMatch if all of them fail.
        """
        for parser_class in cls._get_parser_classes(mimetype):
            try:
                parser = parser_class()
                getattr(parser, method_name)(instance)
            except ParserError:
                # If parser raises error, try next parser in the list
                continue
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse all the pages of a document version with the first registered
        parser that succeeds. Raises NoMIMETypeMatch when no parser is
        registered for the MIME type or every registered parser fails.
        """
        cls._try_parsers(
            document_version.mimetype, 'process_document_version',
            document_version
        )

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Parse a single document page with the first registered parser that
        succeeds. Raises NoMIMETypeMatch when no parser is registered for
        the MIME type or every registered parser fails.
        """
        cls._try_parsers(
            document_page.document_version.mimetype, 'process_document_page',
            document_page
        )

    def process_document_version(self, document_version):
        """
        Parse every page of the document version with this parser.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Parse one page and store the text as the page's content. Any
        exception raised while parsing or saving is logged and re-raised as
        ParserError; the intermediate file is always closed.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page page_number of file_object. Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Read the pdftotext path from the settings and raise ParserError if
        no file exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Return the text of page page_number of the PDF in file_object as
        produced by pdftotext. Returns an empty string when pdftotext emits
        only a form feed and raises ParserError when it exits with a
        non-zero status.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()

        # The temporary copy is removed on every exit path, including
        # failures while copying or launching the process.
        try:
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path,
                '-f', str(page_number),
                '-l', str(page_number),
                temp_filepath,
                '-',
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting; wait() alone
            # can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the trailing blank line and form feed pdftotext appends.
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """
    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page page_number (1-based) of
        the PDF in file_object.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            # Close the converter even when page processing fails.
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # pagenos is zero-based while page_number starts at 1.
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # next() works on Python 2 and 3; the .next() method exists
                # only on Python 2 iterators.
                interpreter.process_page(next(pages))
            finally:
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            return string_buffer.getvalue()
Parser.register(
mimetypes=('application/pdf',),
parser_classes=(PopplerParser, PDFMinerParser)
)

View File

@@ -0,0 +1,5 @@
from django.utils.module_loading import import_string
from .settings import setting_ocr_backend
ocr_backend = import_string(setting_ocr_backend.value)()

View File

@@ -2,10 +2,10 @@ from __future__ import unicode_literals
from rest_framework import serializers
from .models import DocumentPageContent
from .models import DocumentPageOCRContent
class DocumentPageContentSerializer(serializers.ModelSerializer):
class DocumentPageOCRContentSerializer(serializers.ModelSerializer):
class Meta:
fields = ('content',)
model = DocumentPageContent
model = DocumentPageOCRContent

View File

@@ -1,84 +1,53 @@
from __future__ import unicode_literals
import logging
import sys
import traceback
from django.conf import settings
from django.apps import apps
from django.db import OperationalError
from documents.models import DocumentVersion
from lock_manager import LockError
from lock_manager.runtime import locking_backend
from mayan.celery import app
from .classes import TextExtractor
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
from .models import DocumentVersionOCRError
from .signals import post_document_version_ocr
logger = logging.getLogger(__name__)
@app.task(bind=True, default_retry_delay=DO_OCR_RETRY_DELAY, ignore_result=True)
def task_do_ocr(self, document_version_pk):
DocumentVersion = apps.get_model(
app_label='documents', model_name='DocumentVersion'
)
DocumentPageOCRContent = apps.get_model(
app_label='ocr', model_name='DocumentPageOCRContent'
)
lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
try:
logger.debug('trying to acquire lock: %s', lock_id)
# Acquire lock to avoid doing OCR on the same document version more than
# once concurrently
# Acquire lock to avoid doing OCR on the same document version more
# than once concurrently
lock = locking_backend.acquire_lock(lock_id, LOCK_EXPIRE)
logger.debug('acquired lock: %s', lock_id)
document_version = None
try:
document_version = DocumentVersion.objects.get(pk=document_version_pk)
document_version = DocumentVersion.objects.get(
pk=document_version_pk
)
logger.info(
'Starting document OCR for document version: %s',
document_version
)
TextExtractor.process_document_version(document_version)
DocumentPageOCRContent.objects.process_document_version(
document_version=document_version
)
except OperationalError as exception:
logger.warning(
'OCR error for document version: %d; %s. Retrying.',
document_version_pk, exception
)
raise self.retry(exc=exception)
except Exception as exception:
logger.error(
'OCR error for document version: %d; %s', document_version_pk,
exception
)
if document_version:
entry, created = DocumentVersionOCRError.objects.get_or_create(
document_version=document_version
)
if settings.DEBUG:
result = []
type, value, tb = sys.exc_info()
result.append('%s: %s' % (type.__name__, value))
result.extend(traceback.format_tb(tb))
entry.result = '\n'.join(result)
else:
entry.result = exception
entry.save()
else:
logger.info(
'OCR complete for document version: %s', document_version
)
try:
entry = DocumentVersionOCRError.objects.get(
document_version=document_version
)
except DocumentVersionOCRError.DoesNotExist:
pass
else:
entry.delete()
post_document_version_ocr.send(
sender=self, instance=document_version
)
finally:
lock.release()
except LockError:

View File

@@ -0,0 +1,35 @@
from __future__ import unicode_literals
from actstream.models import Action
from documents.tests.test_models import GenericDocumentTestCase
from ..events import (
event_ocr_document_version_submit, event_ocr_document_version_finish
)
class OCREventsTestCase(GenericDocumentTestCase):
    """
    Checks the events recorded when a document is submitted for OCR.
    """

    def test_document_version_submit_event(self):
        # Start from an empty action stream so only OCR events are present.
        Action.objects.all().delete()

        self.document.submit_for_ocr()

        latest_version = self.document.latest_version
        self.assertEqual(Action.objects.last().target, latest_version)

        expected_verb = event_ocr_document_version_submit.name
        self.assertEqual(Action.objects.last().verb, expected_verb)

    def test_document_version_finish_event(self):
        # Start from an empty action stream so only OCR events are present.
        Action.objects.all().delete()

        self.document.submit_for_ocr()

        latest_version = self.document.latest_version
        self.assertEqual(Action.objects.first().target, latest_version)

        expected_verb = event_ocr_document_version_finish.name
        self.assertEqual(Action.objects.first().verb, expected_verb)

View File

@@ -1,83 +0,0 @@
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.tests import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
)
from ..classes import TextExtractor
from ..parsers import PDFMinerParser, PopplerParser
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """
    Runs each PDF parser over the test document and checks the text stored
    for its first page. Automatic OCR is disabled for these tests.
    """

    def setUp(self):
        super(ParserTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The document is binary data: open it in binary mode so the bytes
        # are not decoded or newline-translated.
        with open(TEST_DOCUMENT_PATH, mode='rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn reports both operands when the check fails.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """
    Runs the text extractor over the hybrid test document and checks the
    content stored for its first and last pages. Automatic OCR is disabled
    for these tests.
    """

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The document is binary data: open it in binary mode so the bytes
        # are not decoded or newline-translated.
        with open(TEST_HYBRID_DOCUMENT_PATH, mode='rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )

View File

@@ -3,7 +3,8 @@ from __future__ import unicode_literals
from django.conf.urls import url
from .api_views import (
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
APIDocumentOCRView, APIDocumentPageOCRContentView,
APIDocumentVersionOCRView
)
from .views import (
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
@@ -59,7 +60,8 @@ api_urls = [
name='document-version-ocr-submit-view'
),
url(
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
r'^page/(?P<pk>\d+)/content/$',
APIDocumentPageOCRContentView.as_view(),
name='document-page-content-view'
),
]

View File

@@ -3,14 +3,14 @@ from __future__ import unicode_literals
from django.utils.encoding import force_text
from django.utils.html import conditional_escape
from .models import DocumentPageContent
from .models import DocumentPageOCRContent
def get_document_ocr_content(document):
for page in document.pages.all():
try:
page_content = page.ocr_content.content
except DocumentPageContent.DoesNotExist:
except DocumentPageOCRContent.DoesNotExist:
pass
else:
yield conditional_escape(force_text(page_content))

View File

@@ -14,7 +14,7 @@ from common.generics import (
from common.mixins import MultipleInstanceActionMixin
from documents.models import Document, DocumentType
from .forms import DocumentContentForm, DocumentTypeSelectForm
from .forms import DocumentOCRContentForm, DocumentTypeSelectForm
from .models import DocumentVersionOCRError
from .permissions import (
permission_ocr_content_view, permission_ocr_document,
@@ -40,6 +40,27 @@ class DocumentAllSubmitView(ConfirmView):
)
class DocumentOCRContent(SingleObjectDetailView):
    """
    Detail view that shows the OCR content of a document using
    DocumentOCRContentForm. Requires the OCR content view permission on the
    document.
    """
    form_class = DocumentOCRContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        """
        Dispatch the request and afterwards add the document to the user's
        recent documents.
        """
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        """
        Return the template context: the document (also under ``object``),
        the title and the flag that hides the form labels.
        """
        # Fetch the document once instead of once per context entry.
        document = self.get_object()

        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
class DocumentSubmitView(ConfirmView):
def get_extra_context(self):
return {
@@ -128,27 +149,6 @@ class DocumentTypeSettingsEditView(SingleObjectEditView):
}
class DocumentOCRContent(SingleObjectDetailView):
form_class = DocumentContentForm
model = Document
object_permission = permission_ocr_content_view
def dispatch(self, request, *args, **kwargs):
result = super(DocumentOCRContent, self).dispatch(
request, *args, **kwargs
)
self.get_object().add_as_recent_document_for_user(request.user)
return result
def get_extra_context(self):
return {
'document': self.get_object(),
'hide_labels': True,
'object': self.get_object(),
'title': _('OCR result for document: %s') % self.get_object(),
}
class EntryListView(SingleObjectListView):
extra_context = {
'hide_object': True,