971511b7-cd93-49f9-a38d-d0d450f0943f

Committed 16 Dec 2023 09:13AM UTC coverage: 67.767% (-0.2%) from 67.965%

Build # 971511b7-cd93-49f9-a38d-d0d450f0943f

Build Type

Pull #2177

circleci

Committed by

VirginiaDooley

Commit Message

Bump moto[s3]==4.2.11

Pull Request #2177: AWS Textract

Run Details

1689 of 2838 branches covered (59.51%)

Branch coverage included in aggregate %.

155 of 241 new or added lines in 11 files covered. (64.32%)

1 existing line in 1 file now uncovered.

6834 of 9739 relevant lines covered (70.17%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

68.18

/ynr/apps/sopn_parsing/helpers/extract_tables.py

import json

import pandas as pd
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text
from sopn_parsing.models import CamelotParsedSOPN


def extract_ballot_table(ballot, parse_flavor="lattice"):
    """
    Given a Ballot, read the tables from its SOPN document with camelot and
    update or create a CamelotParsedSOPN model holding the combined table as
    a JSON string.

    :type ballot: candidates.models.Ballot
    :param ballot: ballot whose ``sopn`` attribute is the document to parse;
        the document must already have ``relevant_pages`` set
    :param parse_flavor: passed to ``camelot.read_pdf`` as ``flavor``
    :return: the CamelotParsedSOPN that was updated or created, or None when
        camelot finds no tables or the combined table is empty
    :raises ValueError: if the document has no ``relevant_pages``
    :raises NoTextInDocumentError: if camelot raises NotImplementedError or
        AttributeError while reading the PDF

    """
    import camelot  # import here to avoid import error running tests without pdf deps installed

    document = ballot.sopn
    if not document.relevant_pages:
        raise ValueError(
            "Pages for table not known for document, extract page numbers first"
        )

    try:
        tables = camelot.read_pdf(
            document.uploaded_file.path,
            pages=document.relevant_pages,
            flavor=parse_flavor,
        )
    except (NotImplementedError, AttributeError) as err:
        # * NotImplementedError is thrown if the PDF is an image or generally
        #   unreadable.
        # * AttributeError is thrown on some PDFs saying they need a password.
        #   Assume this is a bug in camelot, and ignore these PDFs
        # Chain the original error so the underlying cause stays visible in
        # tracebacks.
        raise NoTextInDocumentError() from err

    # Tables can span pages, camelot assumes they're different tables, so we
    # need to join them back together in document order.
    ordered_tables = sorted(tables, key=lambda t: (t.page, t.order))
    if not ordered_tables:
        return None

    first_table, *continuations = ordered_tables
    table_data = first_table.df

    for continuation in continuations:
        frame = continuation.df
        # A table with no rows has no first row to inspect (``iloc[0]`` would
        # raise IndexError) and nothing to contribute, so skip it.
        if frame.shape[0] == 0:
            continue

        # It's possible to have the "situation of poll" document on the SOPN.
        # Stop at the first table whose first row mentions "polling station"
        # (SOPN tables don't); it and any later tables are not appended.
        first_row = frame.iloc[0].to_string()
        if "polling station" in clean_text(first_row):
            break

        # Append the continuation table to the first one in the document.
        # ignore_index is needed so that e.g. table 2 row 1 doesn't replace
        # table 1 row 1
        table_data = pd.concat([table_data, frame], ignore_index=True)

    if table_data.empty:
        return None

    parsed, _ = CamelotParsedSOPN.objects.update_or_create(
        sopn=document,
        defaults={"raw_data": json.dumps(table_data.to_dict())},
    )
    return parsed

1	import json	1✔
2
3	import pandas as pd	1✔
4	from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text	1✔
5	from sopn_parsing.models import CamelotParsedSOPN	1✔
6
7
8	def extract_ballot_table(ballot, parse_flavor="lattice"):	1✔
9	"""
10	Given a OfficialDocument model, update or create a CamelotParsedSOPN model with the
11	contents of the table as a JSON string.
12
13	:type ballot: candidates.models.Ballot
14
15	"""
16	import camelot # import here to avoid import error running tests without pdf deps installed	1✔
17
18	document = ballot.sopn	1✔
19	if not document.relevant_pages:	1!
20	raise ValueError(	×
21	"Pages for table not known for document, extract page numbers first"
22	)
23
24	try:	1✔
25	tables = camelot.read_pdf(	1✔
26	document.uploaded_file.path,
27	pages=document.relevant_pages,
28	flavor=parse_flavor,
29	)
30	except (NotImplementedError, AttributeError):	×
31	# * NotImplementedError is thrown if the PDF is an image or generally
32	# unreadable.
33	# * AttributeError is thrown on some PDFs saying they need a password.
34	# Assume this is a bug in camelot, and ignore these PDFs
35	raise NoTextInDocumentError()	×
36
37	# Tables can span pages, camelot assumes they're different tables, so we
38	# need to join them back together
39	table_list = []	1✔
40	for table in tables:	1✔
41	table_list.append(table)	1✔
42	table_list.sort(key=lambda t: (t.page, t.order))	1✔
43
44	if not table_list:	1✔
45	return None	1✔
46
47	table_data = table_list.pop(0).df	1✔
48
49	for table in table_list:	1!
50	# It's possible to have the "situation of poll" document on the SOPN
51	# Ignore any table that contains "polling station" (SOPNs tables don't)
NEW 52	table = table.df	×
NEW 53	first_row = table.iloc[0].to_string()	×
54
55	if "polling station" in clean_text(first_row):	×
56	break	×
57	# Append the continuation table to the first one in the document.
58	# ignore_index is needed so the e.g table 2 row 1 doesn't replace
59	# table 1 row 1
NEW 60	table_data = pd.concat([table_data, table], ignore_index=True)	×
61
62	if not table_data.empty:	1!
63	parsed, _ = CamelotParsedSOPN.objects.update_or_create(	1✔
64	sopn=document,
65	defaults={"raw_data": json.dumps(table_data.to_dict())},
66	)
67	return parsed	1✔
68	return None	×

DemocracyClub / yournextrepresentative / 971511b7-cd93-49f9-a38d-d0d450f0943f

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous