• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / yournextrepresentative / a11757ca-33f7-44a2-a500-a7b58b9d99e5

28 Mar 2024 02:52PM UTC coverage: 67.169% (-0.1%) from 67.282%
a11757ca-33f7-44a2-a500-a7b58b9d99e5

Pull #2269

circleci

symroe
Refactor review-required page to use BallotSOPN
Pull Request #2269: Redesign SOPN models

1712 of 2908 branches covered (58.87%)

Branch coverage included in aggregate %.

190 of 206 new or added lines in 16 files covered. (92.23%)

27 existing lines in 5 files now uncovered.

6895 of 9906 relevant lines covered (69.6%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

64.04
/ynr/apps/sopn_parsing/helpers/extract_pages.py
1
import json
1✔
2
from typing import Optional
1✔
3

4
import boto3
1✔
5
from botocore.client import Config
1✔
6
from django.conf import settings
1✔
7
from django.db import IntegrityError
1✔
8
from official_documents.models import BallotSOPN, ElectionSOPN
1✔
9
from pdfminer.pdftypes import PDFException
1✔
10
from PIL import Image
1✔
11
from sopn_parsing.helpers.pdf_helpers import ElectionSOPNDocument
1✔
12
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
1✔
13
from sopn_parsing.models import (
1✔
14
    AWSTextractParsedSOPN,
15
    AWSTextractParsedSOPNImage,
16
)
17
from textractor import Textractor
1✔
18
from textractor.data.constants import TextractAPI, TextractFeatures
1✔
19
from textractor.entities.lazy_document import LazyDocument
1✔
20

21

22
def extract_pages_for_election_sopn(election_sopn: ElectionSOPN):
    """
    Try to extract the page numbers for an ElectionSOPN.

    Wraps the ElectionSOPN in an ElectionSOPNDocument and asks it to match
    all of its pages.

    Raises:
        NotImplementedError: the document has exactly one page, or its
            ``matched_page_numbers`` is ``"all"``. That case is not
            handled yet (see the TODO in the message).
        NoTextInDocumentError: re-raised with the uploaded file's path in
            the message, chained to the original error.
        PDFException: re-raised with the uploaded file's path in the
            message, chained to the original error.
    """
    try:
        election_sopn_document = ElectionSOPNDocument(election_sopn)

        election_sopn_document.match_all_pages()
        if (
            len(election_sopn_document.pages) == 1
            or election_sopn_document.matched_page_numbers == "all"
        ):
            raise NotImplementedError(
                "TODO: Convert this to a BallotSOPN model, not an ElectionSOPN model"
            )

    except NoTextInDocumentError as err:
        # TODO: Flag that this ElectionSOPN needs manual matching, on the model
        # Chain explicitly so the original traceback is kept as the cause.
        raise NoTextInDocumentError(
            f"Failed to extract pages for {election_sopn.uploaded_file.path} as a NoTextInDocumentError was raised"
        ) from err
    except PDFException as err:
        print(
            f"{election_sopn.election.slug} failed to parse as a PDFSyntaxError was raised"
        )
        # Chain explicitly so the original traceback is kept as the cause.
        raise PDFException(
            f"Failed to extract pages for {election_sopn.uploaded_file.path} as a PDFSyntaxError was raised"
        ) from err
51

52

53
# Module-level Textract client: botocore is told to make up to 5 attempts
# per request, and the region is read from the Django settings.
config = Config(retries=dict(max_attempts=5))

textract_client = boto3.client(
    "textract",
    config=config,
    region_name=settings.TEXTRACT_S3_BUCKET_REGION,
)
58

59

60
class NotUsingAWSException(ValueError):
    """
    Used to indicate that we're in an environment that's not
    using AWS S3 storages.

    Raised by TextractSOPNHelper when neither a bucket name nor an
    upload path is available.
    """
65

66

67
# TODO: move this code to a better place
68
class TextractSOPNHelper:
    """Get the AWS Textract results for a given SOPN.

    Typical use, as far as this class shows: call ``start_detection()`` to
    start a Textract analysis job and record its job ID, then call
    ``update_job_status()`` to fetch the job's status and, once available,
    store the raw response and per-page images.
    """

    def __init__(
        self,
        ballot_sopn: BallotSOPN,
        bucket_name: Optional[str] = None,
        upload_path: Optional[str] = None,
    ):
        """
        Args:
            ballot_sopn: the BallotSOPN whose uploaded file is analysed.
            bucket_name: S3 bucket holding the uploaded file. Falls back to
                ``settings.AWS_STORAGE_BUCKET_NAME`` when not given.
            upload_path: passed to Textractor as ``s3_upload_path`` when
                starting the analysis.

        Raises:
            NotUsingAWSException: neither a bucket name nor an upload path
                is available.
        """
        self.official_document = ballot_sopn
        self.bucket_name = bucket_name or getattr(
            settings, "AWS_STORAGE_BUCKET_NAME", None
        )
        self.upload_path = upload_path
        if not any((self.bucket_name, self.upload_path)):
            raise NotUsingAWSException()
        self.extractor = Textractor(region_name="eu-west-2")

    def _s3_document_uri(self) -> str:
        """Return the ``s3://`` URI of this SOPN's uploaded file."""
        return f"s3://{self.bucket_name}{settings.MEDIA_URL}{self.official_document.uploaded_file.name}"

    def start_detection(self, replace=False) -> Optional[AWSTextractParsedSOPN]:
        """Start a Textract analysis job and record it against the SOPN.

        Args:
            replace: when False (the default) and the SOPN already has an
                ``awstextractparsedsopn``, do nothing.

        Returns:
            The created or updated AWSTextractParsedSOPN, or None when an
            existing result was left untouched.

        Raises:
            IntegrityError: saving the result failed; re-raised with the
                ballot paper ID in the message, chained to the original.
        """
        parsed_sopn = getattr(
            self.official_document, "awstextractparsedsopn", None
        )
        if parsed_sopn and not replace:
            return None
        print("Starting analysis")
        document = self.textract_start_document_analysis()
        print("Saving results")
        try:
            textract_result, _ = AWSTextractParsedSOPN.objects.update_or_create(
                sopn=self.official_document,
                defaults={"raw_data": "", "job_id": document.job_id},
            )
            textract_result.save()
            textract_result.refresh_from_db()
            # Delete any old images that might exist for this SOPN
            textract_result.images.all().delete()

            return textract_result
        except IntegrityError as e:
            raise IntegrityError(
                f"Failed to create AWSTextractParsedSOPN for {self.official_document.ballot.ballot_paper_id}: error {e}"
            ) from e

    def textract_start_document_analysis(self) -> LazyDocument:
        """Ask Textractor to start a TABLES analysis of the uploaded file.

        Raw responses are written under ``raw_textract_responses`` in the
        ``settings.TEXTRACT_S3_BUCKET_NAME`` bucket.
        """
        document: LazyDocument = self.extractor.start_document_analysis(
            file_source=self._s3_document_uri(),
            features=[TextractFeatures.TABLES],
            s3_output_path=f"s3://{settings.TEXTRACT_S3_BUCKET_NAME}/raw_textract_responses",
            s3_upload_path=self.upload_path,
        )
        return document

    def update_job_status(self, blocking=False, reparse=False):
        """Refresh the stored Textract job status and, when ready, its data.

        Args:
            blocking: when False, check the job status first and return
                early if the job has not reached a completed state.
            reparse: when True, re-fetch results even if the stored status
                is already a completed state.

        Returns:
            The SOPN's AWSTextractParsedSOPN, saved with whatever was
            updated.
        """
        COMPLETED_STATES = ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS")
        textract_result = self.official_document.awstextractparsedsopn
        if textract_result.status in COMPLETED_STATES and not reparse:
            return textract_result

        if not blocking:
            # If we're not blocking, simply check the status and save it
            # In the case that it's not finished, just save the status and return
            response = self.extractor.textract_client.get_document_analysis(
                JobId=textract_result.job_id
            )
            textract_result.status = response["JobStatus"]
            if response["JobStatus"] not in COMPLETED_STATES:
                textract_result.save()
                return textract_result

        # extractor.get_result is blocking by default (e.g, it will poll
        # for the job finishing see
        # https://github.com/aws-samples/amazon-textract-textractor/issues/326)
        # because the above check for `if not blocking` should have returned
        # by now if we didn't want to block (or the job is finished)
        # it's safe to call this and have it 'block' on nothing.
        textract_document = self.extractor.get_result(
            textract_result.job_id, TextractAPI.ANALYZE
        )

        print("Saving images")
        textract_result.images.all().delete()
        images = self.extractor._get_document_images_from_path(
            self._s3_document_uri()
        )
        for i, image in enumerate(images):
            image_model = AWSTextractParsedSOPNImage.objects.create(
                parsed_sopn=textract_result,
            )
            image_model.image = AWSTextractParsedSOPNImage.pil_to_content_image(
                image, f"page_{i}.png"
            )
            image_model.save()
        print(
            f"Finished saving images for {self.official_document.ballot.ballot_paper_id}"
        )

        # Add the images back in manually
        # NOTE(review): the first loop goes through the private `_pages`
        # attribute and the second through `pages` — presumably deliberate;
        # confirm against the textractor Document API.
        images = list(textract_result.images.all())
        for i, page in enumerate(textract_document._pages):
            page.image = Image.open(images[i].image)
        for i, page in enumerate(textract_document.pages):
            # Replace each stored page image with its annotated rendering.
            images[i].image = AWSTextractParsedSOPNImage.pil_to_content_image(
                page.visualize(), f"page_{i}_annotated.png"
            )
            images[i].save()

        textract_result.status = textract_document.response["JobStatus"]
        textract_result.raw_data = json.dumps(textract_document.response)
        textract_result.save()
        return textract_result
1✔
178

179

180
class TextractSOPNParsingHelper:
    """Parse the stored AWS Textract data for a given SOPN.

    Holds the BallotSOPN and its related ``awstextractparsedsopn`` record,
    and asks that record to parse its raw data. The original note describes
    the output as a dataframe — that is not visible from this class; see
    ``parse_raw_data`` on the model. Not to be confused with the SOPN
    parsing functionality that matches fields including candidates to
    parties.
    """

    def __init__(self, ballot_sopn: BallotSOPN):
        self.official_document = ballot_sopn
        # Looked up eagerly, so a SOPN without a Textract record fails here.
        self.parsed_sopn = ballot_sopn.awstextractparsedsopn

    def parse(self):
        """Parse the record's raw data, save the record and return it."""
        record = self.parsed_sopn
        record.parse_raw_data()
        record.save()
        return record
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc