74df76c8-4768-48d5-bb7f-5ee50aa05217

Committed 06 Nov 2023 01:38PM UTC coverage: 67.523% (-0.3%) from 67.801%

Build # 74df76c8-4768-48d5-bb7f-5ee50aa05217

Build Type

Pull #2177

circleci

Committed by

VirginiaDooley

Commit Message

Create TextractResults model

Pull Request Pull Request #2177: Spike: AWS Textract

Run Details

1640 of 2760 branches covered (0.0%)

Branch coverage included in aggregate %.

12 of 62 new or added lines in 3 files covered. (19.35%)

110 existing lines in 10 files now uncovered.

6662 of 9535 relevant lines covered (69.87%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

66.07

/ynr/apps/sopn_parsing/helpers/extract_tables.py

import json
import os

import boto3
import trp.trp2 as t2
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text
from sopn_parsing.models import ParsedSOPN
from textractcaller.t_call import Textract_Features, call_textract
from textractprettyprinter.t_pretty_print import (
    Textract_Pretty_Print,
    get_string,
)
from trp.t_pipeline import pipeline_merge_tables
from trp.t_tables import HeaderFooterType, MergeOptions


def extract_ballot_table(ballot, parse_flavor="lattice"):
    """
    Given a Ballot, read the tables from the relevant pages of its SOPN
    document and update or create a ParsedSOPN model with the contents of the
    (joined) table as a JSON string.

    :type ballot: candidates.models.Ballot
    :param parse_flavor: passed straight through to ``camelot.read_pdf`` as
        its ``flavor`` argument.
    :return: the ParsedSOPN that was created or updated, or None when no
        tables were found or the joined table is empty.
    :raises ValueError: if the document has no ``relevant_pages`` set.
    :raises NoTextInDocumentError: if camelot raises NotImplementedError or
        AttributeError while reading the PDF.

    """
    import camelot  # import here to avoid import error running tests without pdf deps installed

    # Imported locally because this module does not import pandas at the top
    # level; the ``df`` attribute of each camelot table is used below as a
    # pandas DataFrame.
    import pandas as pd

    document = ballot.sopn
    if not document.relevant_pages:
        raise ValueError(
            "Pages for table not known for document, extract page numbers first"
        )

    try:
        tables = camelot.read_pdf(
            document.uploaded_file.path,
            pages=document.relevant_pages,
            flavor=parse_flavor,
        )
    except (NotImplementedError, AttributeError) as err:
        # * NotImplementedError is thrown if the PDF is an image or generally
        #   unreadable.
        # * AttributeError is thrown on some PDFs saying they need a password.
        #   Assume this is a bug in camelot, and ignore these PDFs
        # Chain the original error so the underlying cause is still visible
        # in tracebacks.
        raise NoTextInDocumentError() from err

    # Tables can span pages, camelot assumes they're different tables, so we
    # need to join them back together
    table_list = sorted(tables, key=lambda t: (t.page, t.order))

    if not table_list:
        return None

    frames = [table_list[0].df]
    for table in table_list[1:]:
        # A table with no rows has no first row to inspect and contributes
        # no rows; skip it rather than fail on ``iloc[0]``.
        if table.df.empty:
            continue
        # It's possible to have the "situation of poll" document on the SOPN
        # Ignore any table that contains "polling station" (SOPNs tables don't)
        first_row = table.df.iloc[0].to_string()
        if "polling station" in clean_text(first_row):
            break
        frames.append(table.df)

    if len(frames) > 1:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent. ignore_index is needed so that e.g. table 2
        # row 1 doesn't replace table 1 row 1.
        table_data = pd.concat(frames, ignore_index=True)
    else:
        table_data = frames[0]

    if not table_data.empty:
        parsed, _ = ParsedSOPN.objects.update_or_create(
            sopn=document,
            defaults={"raw_data": json.dumps(table_data.to_dict())},
        )
        return parsed
    return None


def textract_extract_tables(self, s3_uri_of_documents):
    """
    Run AWS Textract table analysis on a document, print the pretty-printed
    result, and return the parsed document after the table-merge pipeline.

    :param self: not used by this function; kept so the existing call
        signature is unchanged.
    :param s3_uri_of_documents: passed to ``call_textract`` as
        ``input_document``.
    :return: the ``t2.TDocument`` returned by ``pipeline_merge_tables``.
    """
    session = boto3.session.Session(
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
        # AWS_SESSION_TOKEN is the current variable name; fall back to the
        # legacy AWS_SECURITY_TOKEN that this code originally read.
        aws_session_token=os.environ.get("AWS_SESSION_TOKEN")
        or os.environ.get("AWS_SECURITY_TOKEN"),
    )
    # Build the client from the session so the credentials configured above
    # are the ones actually used for the Textract call.
    textract_client = session.client("textract", region_name="us-west-1")
    textract_json = call_textract(
        input_document=s3_uri_of_documents,
        features=[Textract_Features.TABLES],
        boto3_textract_client=textract_client,
    )
    # NOTE(review): output_type is given the enum class itself rather than a
    # list of members; left as-is to keep the printed output unchanged.
    print(
        get_string(
            textract_json=textract_json, output_type=Textract_Pretty_Print
        )
    )

    t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
    # The boto3 session is deliberately not passed here: the fifth positional
    # parameter of pipeline_merge_tables is understood to be
    # accuracy_percentage, not a session - confirm against the installed
    # amazon-textract-response-parser version.
    return pipeline_merge_tables(
        t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE
    )

1	import json	1✔
2	import os	1✔
3
4	import boto3	1✔
5	import trp.trp2 as t2	1✔
6	from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text	1✔
7	from sopn_parsing.models import ParsedSOPN	1✔
8	from textractcaller.t_call import Textract_Features, call_textract	1✔
9	from textractprettyprinter.t_pretty_print import (	1✔
10	Textract_Pretty_Print,
11	get_string,
12	)
13	from trp.t_pipeline import pipeline_merge_tables	1✔
14	from trp.t_tables import HeaderFooterType, MergeOptions	1✔
15
16
17	def extract_ballot_table(ballot, parse_flavor="lattice"):	1✔
18	"""
19	Given a OfficialDocument model, update or create a ParsedSOPN model with the
20	contents of the table as a JSON string.
21
22	:type ballot: candidates.models.Ballot
23
24	"""
25	import camelot # import here to avoid import error running tests without pdf deps installed	1✔
26
27	document = ballot.sopn	1✔
28	if not document.relevant_pages:	1!
UNCOV 29	raise ValueError(	×
30	"Pages for table not known for document, extract page numbers first"
31	)
32
33	try:	1✔
34	tables = camelot.read_pdf(	1✔
35	document.uploaded_file.path,
36	pages=document.relevant_pages,
37	flavor=parse_flavor,
38	)
UNCOV 39	except (NotImplementedError, AttributeError):	×
40	# * NotImplementedError is thrown if the PDF is an image or generally
41	# unreadable.
42	# * AttributeError is thrown on some PDFs saying they need a password.
43	# Assume this is a bug in camelot, and ignore these PDFs
UNCOV 44	raise NoTextInDocumentError()	×
45
46	# Tables can span pages, camelot assumes they're different tables, so we
47	# need to join them back together
48	table_list = []	1✔
49	for table in tables:	1✔
50	table_list.append(table)	1✔
51	table_list.sort(key=lambda t: (t.page, t.order))	1✔
52
53	if not table_list:	1✔
54	return None	1✔
55
56	table_data = table_list.pop(0).df	1✔
57	for table in table_list:	1!
58	# It's possible to have the "situation of poll" document on the SOPN
59	# Ignore any table that contains "polling station" (SOPNs tables don't)
60	first_row = table.df.iloc[0].to_string()	×
UNCOV 61	if "polling station" in clean_text(first_row):	×
UNCOV 62	break	×
63	# Append the continuation table to the first one in the document.
64	# ignore_index is needed so the e.g table 2 row 1 doesn't replace
65	# table 1 row 1
UNCOV 66	table_data = table_data.append(table.df, ignore_index=True)	×
67
68	if not table_data.empty:	1!
69	parsed, _ = ParsedSOPN.objects.update_or_create(	1✔
70	sopn=document,
71	defaults={"raw_data": json.dumps(table_data.to_dict())},
72	)
73	return parsed	1✔
UNCOV 74	return None	×
75
76
77	def textract_extract_tables(self, s3_uri_of_documents):	1✔
NEW UNCOV 78	session = boto3.session.Session(	×
79	aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
80	aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
81	aws_session_token=os.environ.get("AWS_SECURITY_TOKEN"),
82	)
NEW UNCOV 83	textract_client = boto3.client("textract", region_name="us-west-1")	×
NEW UNCOV 84	textract_json = call_textract(	×
85	input_document=s3_uri_of_documents,
86	features=[Textract_Features.TABLES],
87	boto3_textract_client=textract_client,
88	)
NEW UNCOV 89	print(	×
90	get_string(
91	textract_json=textract_json, output_type=Textract_Pretty_Print
92	)
93	)
94
NEW UNCOV 95	t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)	×
NEW UNCOV 96	t_document = pipeline_merge_tables(	×
97	t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE, session
98	)

DemocracyClub / yournextrepresentative / 74df76c8-4768-48d5-bb7f-5ee50aa05217

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous