• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / yournextrepresentative / 74df76c8-4768-48d5-bb7f-5ee50aa05217

06 Nov 2023 01:38PM UTC coverage: 67.523% (-0.3%) from 67.801%
74df76c8-4768-48d5-bb7f-5ee50aa05217

Pull #2177

circleci

VirginiaDooley
Create TextractResults model
Pull Request #2177: Spike: AWS Textract

1640 of 2760 branches covered (0.0%)

Branch coverage included in aggregate %.

12 of 62 new or added lines in 3 files covered. (19.35%)

110 existing lines in 10 files now uncovered.

6662 of 9535 relevant lines covered (69.87%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

66.07
/ynr/apps/sopn_parsing/helpers/extract_tables.py
1
import json
1✔
2
import os
1✔
3

4
import boto3
1✔
5
import trp.trp2 as t2
1✔
6
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError, clean_text
1✔
7
from sopn_parsing.models import ParsedSOPN
1✔
8
from textractcaller.t_call import Textract_Features, call_textract
1✔
9
from textractprettyprinter.t_pretty_print import (
1✔
10
    Textract_Pretty_Print,
11
    get_string,
12
)
13
from trp.t_pipeline import pipeline_merge_tables
1✔
14
from trp.t_tables import HeaderFooterType, MergeOptions
1✔
15

16

17
def extract_ballot_table(ballot, parse_flavor="lattice"):
    """
    Given a Ballot, update or create a ParsedSOPN model for the ballot's SOPN
    document with the contents of the table as a JSON string.

    :type ballot: candidates.models.Ballot
    :param parse_flavor: value passed to ``camelot.read_pdf`` as ``flavor``
    :return: the created/updated ParsedSOPN, or None when camelot finds no
        tables or the combined table is empty
    :raises ValueError: if the document has no ``relevant_pages`` set
    :raises NoTextInDocumentError: if camelot raises NotImplementedError or
        AttributeError while reading the PDF
    """
    import camelot  # import here to avoid import error running tests without pdf deps installed
    import pandas as pd  # imported lazily for the same reason as camelot

    document = ballot.sopn
    if not document.relevant_pages:
        raise ValueError(
            "Pages for table not known for document, extract page numbers first"
        )

    try:
        tables = camelot.read_pdf(
            document.uploaded_file.path,
            pages=document.relevant_pages,
            flavor=parse_flavor,
        )
    except (NotImplementedError, AttributeError) as err:
        # * NotImplementedError is thrown if the PDF is an image or generally
        #   unreadable.
        # * AttributeError is thrown on some PDFs saying they need a password.
        #   Assume this is a bug in camelot, and ignore these PDFs
        raise NoTextInDocumentError() from err

    # Tables can span pages, camelot assumes they're different tables, so we
    # need to join them back together in document order
    table_list = sorted(tables, key=lambda t: (t.page, t.order))

    if not table_list:
        return None

    frames = [table_list[0].df]
    for table in table_list[1:]:
        # It's possible to have the "situation of poll" document on the SOPN
        # Ignore any table that contains "polling station" (SOPNs tables don't)
        first_row = table.df.iloc[0].to_string()
        if "polling station" in clean_text(first_row):
            break
        frames.append(table.df)

    if len(frames) > 1:
        # DataFrame.append was removed in pandas 2.0; concat is the supported
        # equivalent. ignore_index is needed so the e.g table 2 row 1 doesn't
        # replace table 1 row 1
        table_data = pd.concat(frames, ignore_index=True)
    else:
        table_data = frames[0]

    if not table_data.empty:
        parsed, _ = ParsedSOPN.objects.update_or_create(
            sopn=document,
            defaults={"raw_data": json.dumps(table_data.to_dict())},
        )
        return parsed
    return None
×
75

76

77
def textract_extract_tables(
    self, s3_uri_of_documents, region_name="us-west-1"
):
    """
    Run AWS Textract table analysis on a document and merge its tables.

    Calls Textract with the TABLES feature, prints a pretty-printed rendering
    of the response, then loads the response into a ``TDocument`` and runs the
    table-merging pipeline over it.

    :param self: unused; kept so existing callers keep working
    :param s3_uri_of_documents: passed to ``call_textract`` as
        ``input_document`` (presumably an ``s3://`` URI - confirm with callers)
    :param region_name: AWS region used for the Textract client
    :return: the ``TDocument`` returned by ``pipeline_merge_tables``
    """
    session = boto3.session.Session(
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
        # AWS_SECURITY_TOKEN is the legacy name; fall back to the standard
        # AWS_SESSION_TOKEN so temporary credentials are not left incomplete
        aws_session_token=os.environ.get("AWS_SECURITY_TOKEN")
        or os.environ.get("AWS_SESSION_TOKEN"),
    )
    # Build the client from the session so the credentials above are the ones
    # actually used for the Textract call
    textract_client = session.client("textract", region_name=region_name)
    textract_json = call_textract(
        input_document=s3_uri_of_documents,
        features=[Textract_Features.TABLES],
        boto3_textract_client=textract_client,
    )
    print(
        get_string(
            textract_json=textract_json, output_type=Textract_Pretty_Print
        )
    )

    t_document: t2.TDocument = t2.TDocumentSchema().load(textract_json)
    # NOTE(review): the fifth positional parameter of pipeline_merge_tables is
    # the accuracy percentage, not a boto3 session, so the session is no
    # longer passed and the library default is used - confirm against the
    # installed amazon-textract-response-parser version
    t_document = pipeline_merge_tables(
        t_document, MergeOptions.MERGE, None, HeaderFooterType.NONE
    )
    return t_document
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc