a11757ca-33f7-44a2-a500-a7b58b9d99e5

Committed 28 Mar 2024 02:52PM UTC coverage: 67.169% (-0.1%) from 67.282%

Build # a11757ca-33f7-44a2-a500-a7b58b9d99e5

Build Type

Pull #2269

circleci

Committed by

symroe

Commit Message

Refactor review-required page to use BallotSOPN

Pull Request #2269: Redesign SOPN models

Run Details

1712 of 2908 branches covered (58.87%)

Branch coverage included in aggregate %.

190 of 206 new or added lines in 16 files covered. (92.23%)

27 existing lines in 5 files now uncovered.

6895 of 9906 relevant lines covered (69.6%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

64.04

/ynr/apps/sopn_parsing/helpers/extract_pages.py

import json
from typing import Optional

import boto3
from botocore.client import Config
from django.conf import settings
from django.db import IntegrityError
from official_documents.models import BallotSOPN, ElectionSOPN
from pdfminer.pdftypes import PDFException
from PIL import Image
from sopn_parsing.helpers.pdf_helpers import ElectionSOPNDocument
from sopn_parsing.helpers.text_helpers import NoTextInDocumentError
from sopn_parsing.models import (
    AWSTextractParsedSOPN,
    AWSTextractParsedSOPNImage,
)
from textractor import Textractor
from textractor.data.constants import TextractAPI, TextractFeatures
from textractor.entities.lazy_document import LazyDocument


def extract_pages_for_election_sopn(election_sopn: ElectionSOPN):
    """
    Try to extract the page numbers for an ElectionSOPN

    Wraps ``election_sopn`` in an ElectionSOPNDocument and asks it to match
    all of its pages. Returns None when matching completes without hitting
    one of the cases below.

    Raises:
        NotImplementedError: the document has exactly one page, or its
            ``matched_page_numbers`` equals "all". Handling for this case
            has not been written yet (see the TODO in the message).
        NoTextInDocumentError: re-raised with the uploaded file path in the
            message; the original error is attached as the cause.
        PDFException: re-raised with the uploaded file path in the message
            (after printing the election slug); the original error is
            attached as the cause.
    """
    try:
        election_sopn_document = ElectionSOPNDocument(election_sopn)

        election_sopn_document.match_all_pages()
        if (
            len(election_sopn_document.pages) == 1
            or election_sopn_document.matched_page_numbers == "all"
        ):
            raise NotImplementedError(
                "TODO: Convert this to a BallotSOPN model, not an ElectionSOPN model"
            )

    except NoTextInDocumentError as err:
        # TODO: Flag that this ElectionSOPN needs manual matching, on the model
        # Chain explicitly so the traceback keeps the original failure as the
        # direct cause of this more descriptive error.
        raise NoTextInDocumentError(
            f"Failed to extract pages for {election_sopn.uploaded_file.path} as a NoTextInDocumentError was raised"
        ) from err
    except PDFException as err:
        print(
            f"{election_sopn.election.slug} failed to parse as a PDFSyntaxError was raised"
        )
        raise PDFException(
            f"Failed to extract pages for {election_sopn.uploaded_file.path} as a PDFSyntaxError was raised"
        ) from err


# botocore client configuration: sets the retry option "max_attempts" to 5
# for any client created with it.
config = Config(retries={"max_attempts": 5})

# Module-level Textract client, created at import time for the region named
# by settings.TEXTRACT_S3_BUCKET_REGION and using the retry config above.
# NOTE(review): nothing in the visible part of this module references this
# client (TextractSOPNHelper builds its own Textractor) — presumably it is
# imported by other modules; verify before removing.
textract_client = boto3.client(
    "textract", region_name=settings.TEXTRACT_S3_BUCKET_REGION, config=config
)


class NotUsingAWSException(ValueError):
    """
    Used to indicate that we're in an environment that's not
    using AWS S3 storages

    Raised by TextractSOPNHelper.__init__ when neither a bucket name nor an
    upload path is available.
    """


# TODO: move this code to a better place
class TextractSOPNHelper:
    """Get the AWS Textract results for a given SOPN.

    Wraps a ``textractor.Textractor`` client to start a document analysis
    job for a BallotSOPN's uploaded file, check on that job, and store the
    job id, status, raw response and page images on the related
    AWSTextractParsedSOPN / AWSTextractParsedSOPNImage objects.
    """

    def __init__(
        self,
        ballot_sopn: BallotSOPN,
        bucket_name: Optional[str] = None,
        upload_path: Optional[str] = None,
    ):
        """
        :param ballot_sopn: the BallotSOPN whose uploaded file is analysed.
        :param bucket_name: S3 bucket holding the uploaded file; falls back
            to ``settings.AWS_STORAGE_BUCKET_NAME`` when not given.
        :param upload_path: passed to Textractor as ``s3_upload_path``.
        :raises NotUsingAWSException: if neither a bucket name nor an
            upload path is available.
        """
        self.official_document = ballot_sopn
        self.bucket_name = bucket_name or getattr(
            settings, "AWS_STORAGE_BUCKET_NAME", None
        )
        self.upload_path = upload_path
        if not any((self.bucket_name, self.upload_path)):
            raise NotUsingAWSException()
        # NOTE(review): the region is hard-coded here, whereas the
        # module-level boto3 client uses settings.TEXTRACT_S3_BUCKET_REGION
        # — confirm this difference is intended.
        self.extractor = Textractor(region_name="eu-west-2")

    @property
    def _s3_source_uri(self) -> str:
        """S3 URI of the SOPN's uploaded file, as handed to Textractor."""
        # NOTE(review): no separator is inserted between the bucket name and
        # MEDIA_URL, so this presumably relies on MEDIA_URL starting with
        # "/" — confirm against settings.
        return f"s3://{self.bucket_name}{settings.MEDIA_URL}{self.official_document.uploaded_file.name}"

    def start_detection(
        self, replace: bool = False
    ) -> Optional[AWSTextractParsedSOPN]:
        """Start a Textract analysis job and record its job id.

        :param replace: when False (the default) and the SOPN already has a
            related ``awstextractparsedsopn``, nothing is started.
        :return: the created/updated AWSTextractParsedSOPN, or None when an
            existing result was left in place.
        :raises IntegrityError: re-raised with the ballot paper id in the
            message; the original error is attached as the cause.
        """
        parsed_sopn = getattr(
            self.official_document, "awstextractparsedsopn", None
        )
        if parsed_sopn and not replace:
            return None
        print("Starting analysis")
        document = self.textract_start_document_analysis()
        print("Saving results")
        try:
            # raw_data is reset to "" so a replaced job does not keep the
            # previous job's response alongside the new job id.
            textract_result, _ = AWSTextractParsedSOPN.objects.update_or_create(
                sopn=self.official_document,
                defaults={"raw_data": "", "job_id": document.job_id},
            )
            textract_result.save()
            textract_result.refresh_from_db()
            # Delete any old images that might exist for this SOPN
            textract_result.images.all().delete()

            return textract_result
        except IntegrityError as e:
            raise IntegrityError(
                f"Failed to create AWSTextractParsedSOPN for {self.official_document.ballot.ballot_paper_id}: error {e}"
            ) from e

    def textract_start_document_analysis(self) -> LazyDocument:
        """Ask Textractor to start a TABLES analysis of the uploaded file.

        The raw responses are written under ``raw_textract_responses`` in
        the bucket named by ``settings.TEXTRACT_S3_BUCKET_NAME``.

        :return: the LazyDocument returned by Textractor (callers read its
            ``job_id``).
        """
        document: LazyDocument = self.extractor.start_document_analysis(
            file_source=self._s3_source_uri,
            features=[TextractFeatures.TABLES],
            s3_output_path=f"s3://{settings.TEXTRACT_S3_BUCKET_NAME}/raw_textract_responses",
            s3_upload_path=self.upload_path,
        )
        return document

    def update_job_status(
        self, blocking: bool = False, reparse: bool = False
    ) -> AWSTextractParsedSOPN:
        """Refresh the stored Textract job status and, once done, its data.

        :param blocking: when False, the job status is checked first and, if
            the job has not finished, only the status is saved. When True
            that check is skipped and the result is fetched directly.
        :param reparse: when True, results are fetched and images rebuilt
            even if the stored status is already a completed one.
        :return: the SOPN's AWSTextractParsedSOPN.
        """
        COMPLETED_STATES = ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS")
        textract_result = self.official_document.awstextractparsedsopn
        if textract_result.status in COMPLETED_STATES and not reparse:
            return textract_result

        if not blocking:
            # If we're not blocking, simply check the status and save it
            # In the case that it's not finished, just save the status and return
            response = self.extractor.textract_client.get_document_analysis(
                JobId=textract_result.job_id
            )
            textract_result.status = response["JobStatus"]
            if response["JobStatus"] not in COMPLETED_STATES:
                textract_result.save()
                return textract_result

        # extractor.get_result is blocking by default (e.g, it will poll
        # for the job finishing see
        # https://github.com/aws-samples/amazon-textract-textractor/issues/326)
        # because the above check for `if not blocking` should have returned
        # by now if we didn't want to block (or the job is finished)
        # it's safe to call this and have it 'block' on nothing.
        textract_document = self.extractor.get_result(
            textract_result.job_id, TextractAPI.ANALYZE
        )

        print("Saving images")
        # Start from a clean slate so a reparse does not accumulate images.
        textract_result.images.all().delete()
        images = self.extractor._get_document_images_from_path(
            self._s3_source_uri
        )
        for i, image in enumerate(images):
            image_model = AWSTextractParsedSOPNImage.objects.create(
                parsed_sopn=textract_result,
            )
            image_model.image = AWSTextractParsedSOPNImage.pil_to_content_image(
                image, f"page_{i}.png"
            )
            image_model.save()
        print(
            f"Finished saving images for {self.official_document.ballot.ballot_paper_id}"
        )

        # Add the images back in manually
        # NOTE(review): pages and stored images are paired by position; this
        # assumes images.all() returns them in page order and that there is
        # one image per page — confirm against the model's ordering.
        images = list(textract_result.images.all())
        for i, page in enumerate(textract_document._pages):
            page.image = Image.open(images[i].image)
        for i, page in enumerate(textract_document.pages):
            # Overwrite each stored page image with its annotated version.
            images[i].image = AWSTextractParsedSOPNImage.pil_to_content_image(
                page.visualize(), f"page_{i}_annotated.png"
            )
            images[i].save()

        textract_result.status = textract_document.response["JobStatus"]
        textract_result.raw_data = json.dumps(textract_document.response)
        textract_result.save()
        return textract_result


class TextractSOPNParsingHelper:
    """Thin wrapper around the AWSTextractParsedSOPN attached to a SOPN.

    It asks that object to parse its raw AWS Textract data and then saves
    it (the original note describes the parsed result as a dataframe; the
    parsing itself lives on the model, not here). This is a different step
    from the SOPN parsing functionality that matches fields such as
    candidates to parties.
    """

    def __init__(self, ballot_sopn: BallotSOPN):
        # Keep both the SOPN and its related Textract result to hand.
        self.official_document = ballot_sopn
        self.parsed_sopn = ballot_sopn.awstextractparsedsopn

    def parse(self):
        """Parse the stored raw data, save the result and return it."""
        parsed = self.parsed_sopn
        parsed.parse_raw_data()
        parsed.save()
        return parsed

1	import json	1✔
2	from typing import Optional	1✔
3
4	import boto3	1✔
5	from botocore.client import Config	1✔
6	from django.conf import settings	1✔
7	from django.db import IntegrityError	1✔
8	from official_documents.models import BallotSOPN, ElectionSOPN	1✔
9	from pdfminer.pdftypes import PDFException	1✔
10	from PIL import Image	1✔
11	from sopn_parsing.helpers.pdf_helpers import ElectionSOPNDocument	1✔
12	from sopn_parsing.helpers.text_helpers import NoTextInDocumentError	1✔
13	from sopn_parsing.models import (	1✔
14	AWSTextractParsedSOPN,
15	AWSTextractParsedSOPNImage,
16	)
17	from textractor import Textractor	1✔
18	from textractor.data.constants import TextractAPI, TextractFeatures	1✔
19	from textractor.entities.lazy_document import LazyDocument	1✔
20
21
22	def extract_pages_for_election_sopn(election_sopn: ElectionSOPN):	1✔
23	"""
24	Try to extract the page numbers for an ElectionSOPN
25
26	"""
27	try:	1✔
28	election_sopn_document = ElectionSOPNDocument(election_sopn)	1✔
29
30	election_sopn_document.match_all_pages()	1✔
31	if (	1!
32	len(election_sopn_document.pages) == 1
33	or election_sopn_document.matched_page_numbers == "all"
34	):
NEW 35	raise NotImplementedError(	×
36	"TODO: Convert this to a BallotSOPN model, not an ElectionSOPN model"
37	)
38
39	except NoTextInDocumentError:	×
40	# TODO: Flag that this ElectionSOPN needs manual matching, on the model
UNCOV 41	raise NoTextInDocumentError(	×
42	f"Failed to extract pages for {election_sopn.uploaded_file.path} as a NoTextInDocumentError was raised"
43	)
44	except PDFException:	×
45	print(	×
46	f"{election_sopn.election.slug} failed to parse as a PDFSyntaxError was raised"
47	)
48	raise PDFException(	×
49	f"Failed to extract pages for {election_sopn.uploaded_file.path} as a PDFSyntaxError was raised"
50	)
51
52
53	config = Config(retries={"max_attempts": 5})	1✔
54
55	textract_client = boto3.client(	1✔
56	"textract", region_name=settings.TEXTRACT_S3_BUCKET_REGION, config=config
57	)
58
59
60	class NotUsingAWSException(ValueError):	1✔
61	"""
62	Used to indicate that we're not in an environment that's not
63	using AWS S3 storages
64	"""
65
66
67	# TODO: move this code to a better place
68	class TextractSOPNHelper:	1✔
69	"""Get the AWS Textract results for a given SOPN."""
70
71	def __init__(	1✔
72	self,
73	ballot_sopn: BallotSOPN,
74	bucket_name: str = None,
75	upload_path: str = None,
76	):
77	self.official_document = ballot_sopn	1✔
78	self.bucket_name = bucket_name or getattr(	1✔
79	settings, "AWS_STORAGE_BUCKET_NAME", None
80	)
81	self.upload_path = upload_path	1✔
82	if not any((self.bucket_name, self.upload_path)):	1!
83	raise NotUsingAWSException()	×
84	self.extractor = Textractor(region_name="eu-west-2")	1✔
85
86	def start_detection(self, replace=False) -> Optional[AWSTextractParsedSOPN]:	1✔
87	parsed_sopn = getattr(	1✔
88	self.official_document, "awstextractparsedsopn", None
89	)
90	if parsed_sopn and not replace:	1!
91	return None	×
92	print("Starting analysis")	1✔
93	document = self.textract_start_document_analysis()	1✔
94	print("Saving results")	1✔
95	try:	1✔
96	textract_result, _ = AWSTextractParsedSOPN.objects.update_or_create(	1✔
97	sopn=self.official_document,
98	defaults={"raw_data": "", "job_id": document.job_id},
99	)
100	textract_result.save()	1✔
101	textract_result.refresh_from_db()	1✔
102	# Delete any old images that might exist for this SOPN
103	textract_result.images.all().delete()	1✔
104
105	return textract_result	1✔
106	except IntegrityError as e:	×
107	raise IntegrityError(	×
108	f"Failed to create AWSTextractParsedSOPN for {self.official_document.ballot.ballot_paper_id}: error {e}"
109	)
110
111	def textract_start_document_analysis(self) -> LazyDocument:	1✔
112	document: LazyDocument = self.extractor.start_document_analysis(	×
113	file_source=f"s3://{self.bucket_name}{settings.MEDIA_URL}{self.official_document.uploaded_file.name}",
114	features=[TextractFeatures.TABLES],
115	s3_output_path=f"s3://{settings.TEXTRACT_S3_BUCKET_NAME}/raw_textract_responses",
116	s3_upload_path=self.upload_path,
117	)
118	return document	×
119
120	def update_job_status(self, blocking=False, reparse=False):	1✔
121	COMPLETED_STATES = ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS")	1✔
122	textract_result = self.official_document.awstextractparsedsopn	1✔
123	if textract_result.status in COMPLETED_STATES and not reparse:	1!
124	return textract_result	×
125
126	if not blocking:	1!
127	# If we're not blocking, simply check the status and save it
128	# In the case that it's not finished, just save the status and return
129	response = self.extractor.textract_client.get_document_analysis(	×
130	JobId=textract_result.job_id
131	)
132	textract_result.status = response["JobStatus"]	×
133	if response["JobStatus"] not in COMPLETED_STATES:	×
134	textract_result.save()	×
135	return textract_result	×
136
137	# extractor.get_result is blocking by default (e.g, it will poll
138	# for the job finishing see
139	# https://github.com/aws-samples/amazon-textract-textractor/issues/326)
140	# because the above check for `if not blocking` should have returned
141	# by now if we didn't want to block (or the job is finished)
142	# it's safe to call this and have it 'block' on noting.
143	textract_document = self.extractor.get_result(	1✔
144	textract_result.job_id, TextractAPI.ANALYZE
145	)
146
147	print("Saving images")	1✔
148	textract_result.images.all().delete()	1✔
149	images = self.extractor._get_document_images_from_path(	1✔
150	f"s3://{self.bucket_name}{settings.MEDIA_URL}{self.official_document.uploaded_file.name}"
151	)
152	for i, image in enumerate(images):	1!
153	image_model = AWSTextractParsedSOPNImage.objects.create(	×
154	parsed_sopn=textract_result,
155	)
156	image_model.image = AWSTextractParsedSOPNImage.pil_to_content_image(	×
157	image, f"page_{i}.png"
158	)
159	image_model.save()	×
160	print(	1✔
161	f"Finished saving images for {self.official_document.ballot.ballot_paper_id}"
162	)
163
164	# Add the images back in manually
165	images = list(textract_result.images.all())	1✔
166	for i, page in enumerate(textract_document._pages):	1!
167	page.image = Image.open(images[i].image)	×
168	for i, page in enumerate(textract_document.pages):	1!
169	images[i].image = AWSTextractParsedSOPNImage.pil_to_content_image(	×
170	page.visualize(), f"page_{i}_annotated.png"
171	)
172	images[i].save()	×
173
174	textract_result.status = textract_document.response["JobStatus"]	1✔
175	textract_result.raw_data = json.dumps(textract_document.response)	1✔
176	textract_result.save()	1✔
177	return textract_result	1✔
178
179
180	class TextractSOPNParsingHelper:	1✔
181	"""Helper class to extract the AWS Textract blocks for a given SOPN
182	and return the results as a dataframe. This is not to be confused with
183	the SOPN parsing functionality that matches fields including
184	candidates to parties."""
185
186	def __init__(self, ballot_sopn: BallotSOPN):	1✔
NEW 187	self.official_document = ballot_sopn	×
UNCOV 188	self.parsed_sopn = self.official_document.awstextractparsedsopn	×
189
190	def parse(self):	1✔
191	self.parsed_sopn.parse_raw_data()	×
192	self.parsed_sopn.save()	×
193	return self.parsed_sopn	×

DemocracyClub / yournextrepresentative / a11757ca-33f7-44a2-a500-a7b58b9d99e5

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous