Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions

View File

@@ -0,0 +1,104 @@
from __future__ import unicode_literals
from django import forms
from django.utils.encoding import force_text
from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe
from django.utils.translation import ugettext_lazy as _, ugettext
from common.widgets import TextAreaDiv
from documents.models import DocumentType
from .models import DocumentPageContent, DocumentPageOCRContent
class DocumentContentForm(forms.Form):
"""
Form that concatenates all of a document pages' text content into a
single textarea widget
"""
def __init__(self, *args, **kwargs):
self.document = kwargs.pop('instance', None)
super(DocumentContentForm, self).__init__(*args, **kwargs)
content = []
self.fields['contents'].initial = ''
try:
document_pages = self.document.pages.all()
except AttributeError:
document_pages = []
for page in document_pages:
try:
page_content = page.ocr_content.content
except DocumentPageContent.DoesNotExist:
pass
else:
content.append(conditional_escape(force_text(page_content)))
content.append(
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
ugettext(
'Page %(page_number)d'
) % {'page_number': page.page_number}
)
)
self.fields['contents'].initial = mark_safe(''.join(content))
contents = forms.CharField(
label=_('Contents'),
widget=TextAreaDiv(
attrs={
'class': 'text_area_div full-height',
'data-height-difference': 360
}
)
)
class DocumentOCRContentForm(forms.Form):
"""
Form that concatenates all of a document pages' text content into a
single textarea widget
"""
def __init__(self, *args, **kwargs):
self.document = kwargs.pop('instance', None)
super(DocumentContentForm, self).__init__(*args, **kwargs)
content = []
self.fields['contents'].initial = ''
try:
document_pages = self.document.pages.all()
except AttributeError:
document_pages = []
for page in document_pages:
try:
page_content = page.ocr_content.content
except DocumentPageOCRContent.DoesNotExist:
pass
else:
content.append(conditional_escape(force_text(page_content)))
content.append(
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
ugettext(
'Page %(page_number)d'
) % {'page_number': page.page_number}
)
)
self.fields['contents'].initial = mark_safe(''.join(content))
contents = forms.CharField(
label=_('Contents'),
widget=TextAreaDiv(
attrs={
'class': 'text_area_div full-height',
'data-height-difference': 360
}
)
)
class DocumentTypeSelectForm(forms.Form):
document_type = forms.ModelChoiceField(
queryset=DocumentType.objects.all(), label=('Document type')
)