• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / yournextrepresentative / 06fffc96-31fa-407b-8443-1731e4db6cf1

04 Apr 2024 08:24AM UTC coverage: 68.368% (+1.1%) from 67.282%
06fffc96-31fa-407b-8443-1731e4db6cf1

Pull #2269

circleci

symroe
Clean up some testing code

Misc fixes to the test code
Pull Request #2269: Redesign SOPN models

1749 of 2922 branches covered (59.86%)

Branch coverage included in aggregate %.

415 of 488 new or added lines in 25 files covered. (85.04%)

18 existing lines in 4 files now uncovered.

7093 of 10011 relevant lines covered (70.85%)

0.71 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

68.42
/ynr/apps/sopn_parsing/helpers/textract_helpers.py
1
import json
1✔
2
from typing import Optional
1✔
3

4
import boto3
1✔
5
from botocore.config import Config
1✔
6
from django.conf import settings
1✔
7
from django.db import IntegrityError
1✔
8
from official_documents.models import BallotSOPN
1✔
9
from PIL import Image
1✔
10
from sopn_parsing.models import (
1✔
11
    AWSTextractParsedSOPN,
12
    AWSTextractParsedSOPNImage,
13
)
14
from textractor import Textractor
1✔
15
from textractor.data.constants import TextractAPI, TextractFeatures
1✔
16
from textractor.entities.lazy_document import LazyDocument
1✔
17

18
# botocore client configuration carrying the retry settings used by the
# module-level Textract client below.
config = Config(retries={"max_attempts": 5})

# Textract client built once at import time; the region is read from
# settings.TEXTRACT_S3_BUCKET_REGION.
textract_client = boto3.client(
    "textract",
    config=config,
    region_name=settings.TEXTRACT_S3_BUCKET_REGION,
)
22

23

24
class NotUsingAWSException(ValueError):
    """
    Raised when the current environment is not using AWS S3 storage.

    TextractSOPNHelper raises this when it has neither an S3 bucket name
    nor an upload path to work with.
    """
29

30

31
class TextractSOPNHelper:
    """Get the AWS Textract results for a given SOPN.

    Typical use is two steps: ``start_detection`` submits the SOPN document
    to Textract and stores the job id, then ``update_job_status`` fetches the
    job status/result and stores the response and page images.
    """

    # Job statuses that ``update_job_status`` treats as finished.
    COMPLETED_STATES = ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS")

    def __init__(
        self,
        ballot_sopn: BallotSOPN,
        bucket_name: Optional[str] = None,
        upload_path: Optional[str] = None,
    ):
        """
        :param ballot_sopn: the SOPN whose uploaded file should be analysed.
        :param bucket_name: S3 bucket holding the uploaded file; falls back
            to ``settings.AWS_STORAGE_BUCKET_NAME`` when not given.
        :param upload_path: passed to Textractor as ``s3_upload_path``.
        :raises NotUsingAWSException: if neither a bucket name nor an upload
            path is available.
        """
        self.ballot_sopn = ballot_sopn
        self.bucket_name = bucket_name or getattr(
            settings, "AWS_STORAGE_BUCKET_NAME", None
        )
        self.upload_path = upload_path
        if not (self.bucket_name or self.upload_path):
            raise NotUsingAWSException()
        # NOTE(review): the region is hard-coded here, while the module-level
        # client reads settings.TEXTRACT_S3_BUCKET_REGION - confirm whether
        # these are meant to agree.
        self.extractor = Textractor(region_name="eu-west-2")

    def _s3_source_path(self) -> str:
        """Return the ``s3://`` URI built from the bucket, MEDIA_URL and file name."""
        return f"s3://{self.bucket_name}{settings.MEDIA_URL}{self.ballot_sopn.uploaded_file.name}"

    def start_detection(self, replace=False) -> Optional[AWSTextractParsedSOPN]:
        """Start a Textract analysis job and record its job id.

        :param replace: when False (the default) and the SOPN already has an
            ``awstextractparsedsopn``, nothing is started and None is returned.
        :return: the created/updated AWSTextractParsedSOPN, or None if skipped.
        :raises IntegrityError: if saving the record fails; the original
            error is chained as the cause.
        """
        parsed_sopn = getattr(self.ballot_sopn, "awstextractparsedsopn", None)
        if parsed_sopn and not replace:
            return None
        print("Starting analysis")
        document = self.textract_start_document_analysis()
        print("Saving results")
        try:
            # NOTE(review): only raw_data and job_id are reset here. If an
            # existing record keeps a completed ``status``, update_job_status
            # will return early for the new job unless reparse=True - confirm
            # whether status should also be reset.
            textract_result, _ = AWSTextractParsedSOPN.objects.update_or_create(
                sopn=self.ballot_sopn,
                defaults={"raw_data": "", "job_id": document.job_id},
            )
            textract_result.save()
            textract_result.refresh_from_db()
            # Delete any old images that might exist for this SOPN
            textract_result.images.all().delete()

            return textract_result
        except IntegrityError as e:
            # Chain the original error so its traceback is not lost.
            raise IntegrityError(
                f"Failed to create AWSTextractParsedSOPN for {self.ballot_sopn.ballot.ballot_paper_id}: error {e}"
            ) from e

    def textract_start_document_analysis(self) -> LazyDocument:
        """Submit the SOPN file to Textract (TABLES feature) and return the lazy document."""
        document: LazyDocument = self.extractor.start_document_analysis(
            file_source=self._s3_source_path(),
            features=[TextractFeatures.TABLES],
            s3_output_path=f"s3://{settings.TEXTRACT_S3_BUCKET_NAME}/raw_textract_responses",
            s3_upload_path=self.upload_path,
        )
        return document

    def update_job_status(self, blocking=False, reparse=False):
        """Refresh the stored Textract job status and, once available, its results.

        :param blocking: when False, the job status is checked first and, if
            the job has not finished, only the status is saved. When True the
            result is requested directly (``extractor.get_result`` polls
            until the job finishes).
        :param reparse: when True, results are fetched again even if the
            stored status is already a completed one.
        :return: the AWSTextractParsedSOPN for this SOPN.
        """
        textract_result = self.ballot_sopn.awstextractparsedsopn
        if textract_result.status in self.COMPLETED_STATES and not reparse:
            return textract_result

        if not blocking:
            # If we're not blocking, simply check the status and save it
            # In the case that it's not finished, just save the status and return
            response = self.extractor.textract_client.get_document_analysis(
                JobId=textract_result.job_id
            )
            textract_result.status = response["JobStatus"]
            if response["JobStatus"] not in self.COMPLETED_STATES:
                textract_result.save()
                return textract_result

        # extractor.get_result is blocking by default (e.g, it will poll
        # for the job finishing see
        # https://github.com/aws-samples/amazon-textract-textractor/issues/326)
        # because the above check for `if not blocking` should have returned
        # by now if we didn't want to block (or the job is finished)
        # it's safe to call this and have it 'block' on nothing.
        textract_document = self.extractor.get_result(
            textract_result.job_id, TextractAPI.ANALYZE
        )

        print("Saving images")
        textract_result.images.all().delete()
        images = self.extractor._get_document_images_from_path(
            self._s3_source_path()
        )
        # Store one image record per page image of the source document.
        for i, image in enumerate(images):
            image_model = AWSTextractParsedSOPNImage.objects.create(
                parsed_sopn=textract_result,
            )
            image_model.image = AWSTextractParsedSOPNImage.pil_to_content_image(
                image, f"page_{i}.png"
            )
            image_model.save()
        print(
            f"Finished saving images for {self.ballot_sopn.ballot.ballot_paper_id}"
        )

        # Add the images back in manually
        images = list(textract_result.images.all())
        for i, page in enumerate(textract_document._pages):
            page.image = Image.open(images[i].image)
        # Replace each stored page image with the annotated visualisation.
        for i, page in enumerate(textract_document.pages):
            images[i].image = AWSTextractParsedSOPNImage.pil_to_content_image(
                page.visualize(), f"page_{i}_annotated.png"
            )
            images[i].save()

        textract_result.status = textract_document.response["JobStatus"]
        textract_result.raw_data = json.dumps(textract_document.response)
        textract_result.save()
        return textract_result
1✔
139

140

141
class TextractSOPNParsingHelper:
    """Run raw-data parsing on the AWS Textract result stored for a SOPN.

    This only processes the Textract output attached to the SOPN; it is
    separate from the SOPN parsing that matches fields such as candidates
    to parties.
    """

    def __init__(self, ballot_sopn: BallotSOPN):
        """Keep the SOPN and look up its related ``awstextractparsedsopn``."""
        self.ballot_sopn = ballot_sopn
        self.parsed_sopn = ballot_sopn.awstextractparsedsopn

    def parse(self):
        """Parse the stored raw data, save the record and return it."""
        record = self.parsed_sopn
        record.parse_raw_data()
        record.save()
        return record
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc