• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / yournextrepresentative / 74df76c8-4768-48d5-bb7f-5ee50aa05217

06 Nov 2023 01:38PM UTC coverage: 67.523% (-0.3%) from 67.801%
74df76c8-4768-48d5-bb7f-5ee50aa05217

Pull #2177

circleci

VirginiaDooley
Create TextractResults model
Pull Request #2177: Spike: AWS Textract

1640 of 2760 branches covered (0.0%)

Branch coverage included in aggregate %.

12 of 62 new or added lines in 3 files covered. (19.35%)

110 existing lines in 10 files now uncovered.

6662 of 9535 relevant lines covered (69.87%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py
NEW
1
import os
×
NEW
2
from asyncio.log import logger
×
NEW
3
from time import sleep
×
4

NEW
5
import boto3
×
NEW
6
from botocore.exceptions import ClientError
×
NEW
7
from django.core.management.base import BaseCommand
×
8

9
# url_test_sopn = (
10
#     "https://www.bury.gov.uk/council-and-democracy/elections-and-voting/statement-of-persons-nominated/"
11
# )
12

13
# this is an html saved as a pdf
NEW
14
# Sample document used while this command is a spike. The file is an HTML page
# that was saved as a PDF.
test_sopn = "BurySOPN.pdf"

# File extensions this command is meant to accept. Nothing in this module
# checks against the list yet.
# NOTE(review): presumably mirrors the formats Textract supports -- confirm.
accepted_file_types = [
    ".pdf",
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
]

# Region of the bucket the command uploads to. Textract must be called in the
# same region as the bucket that holds the document, so every client is pinned
# to it instead of depending on whatever default region the environment has.
AWS_REGION = "eu-west-2"

s3 = boto3.client("s3", region_name=AWS_REGION)
textract_client = boto3.client("textract", region_name=AWS_REGION)

# Explicit session built from environment credentials. The clients above are
# created from boto3's default session, not from this object.
session = boto3.session.Session(
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    # AWS_SESSION_TOKEN is the standard variable name; AWS_SECURITY_TOKEN is
    # the legacy spelling and is kept as a fallback for existing setups.
    aws_session_token=(
        os.environ.get("AWS_SESSION_TOKEN")
        or os.environ.get("AWS_SECURITY_TOKEN")
    ),
)
32

33

NEW
34
class Command(BaseCommand):
    """Run the test SOPN through AWS Textract and print the tables it finds.

    Work in progress: the document, bucket and object key are hard-coded, and
    nothing is saved to the database yet.
    """

    # Every status after which Textract will not change the job any further.
    # PARTIAL_SUCCESS must be included, otherwise the polling loop never ends
    # for a job that finishes in that state.
    terminal_statuses = ("SUCCEEDED", "FAILED", "PARTIAL_SUCCESS")

    # Seconds to wait between two status checks.
    poll_interval_seconds = 5

    def handle(self, *args, **options):
        """Entry point of the command: analyse the module-level test document."""
        self.start_detection(test_sopn)

    def start_detection(self, test_sopn):
        """This is a WIP of Step 1-2 of the SOPN parsing process using AWS Textract.

        Uploads the file to S3, starts an asynchronous Textract analysis for
        tables and forms, waits for the job to finish and writes every TABLE
        block to stdout.

        :param test_sopn: Path of the local document to analyse.
        :return: None. A failed job is reported on stderr.
        """
        region = "eu-west-2"
        bucket_name = "public-sopns"
        object_key = "test/test_sopn.pdf"
        s3_client = boto3.client("s3", region_name=region)

        with open(test_sopn, "rb") as file:
            file_bytes = file.read()

        s3_client.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=file_bytes,
        )
        self.stdout.write(f"Uploaded bytes to s3://{bucket_name}/{object_key}")

        started = textract_client.start_document_analysis(
            DocumentLocation={
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_key,
                }
            },
            FeatureTypes=["TABLES", "FORMS"],
            OutputConfig={
                # Same bucket as the input, so reuse the variable rather than
                # repeating the literal.
                "S3Bucket": bucket_name,
                "S3Prefix": "test",
            },
        )
        job_id = started["JobId"]

        # Poll through get_analysis_job so every check is logged and client
        # errors are reported in one place.
        response = self.get_analysis_job(job_id)
        while response["JobStatus"] not in self.terminal_statuses:
            sleep(self.poll_interval_seconds)
            response = self.get_analysis_job(job_id)
            self.stdout.write(f"This is the Job ID: {job_id}")
            self.stdout.write(str(response))

        if response["JobStatus"] == "FAILED":
            # A failed job has no usable blocks; report why and stop instead
            # of failing on a missing key further down.
            reason = response.get("StatusMessage", "no status message returned")
            self.stderr.write(f"Textract job {job_id} failed: {reason}")
            return

        # find the blocks of type "TABLE" in the response and print them
        for block in self._iter_blocks(job_id, response):
            if block["BlockType"] == "TABLE":
                self.stdout.write(str(block))
        # at this stage, save the response to the database as described below
        # ///

        # Take the job id, save it on the model
        # move on to the next file
        # ///
        # For any file that is missing a JobId, get the job status
        # if the job status is SUCCEEDED, save the JSON response against the TextractResults model
        # if the job status is FAILED, return status

        # Helper class OfficialDocuments to pass to TextractResults

    def _iter_blocks(self, job_id, first_page):
        """Yield every block of a finished job, following result pagination.

        :param job_id: The ID of the job the pages belong to.
        :param first_page: A response already fetched for the job.
        """
        page = first_page
        while True:
            yield from page.get("Blocks", [])
            next_token = page.get("NextToken")
            if not next_token:
                return
            page = textract_client.get_document_analysis(
                JobId=job_id, NextToken=next_token
            )

    def get_analysis_job(self, job_id, max_tries=10):
        """
        Gets data for a previously started detection job that includes additional
        elements.

        Performs a single request; it does not wait for the job to finish.

        :param job_id: The ID of the job to retrieve.
        :param max_tries: Currently unused; kept so existing callers that pass
                          it keep working.
        :return: The job data, including a list of blocks that describe elements
                 detected in the image.
        :raises ClientError: Re-raised after logging when the request fails.
        """
        # NOTE(review): `logger` is imported from asyncio.log at the top of the
        # file, so these records are emitted under the "asyncio" logger name.
        try:
            response = textract_client.get_document_analysis(JobId=job_id)
            job_status = response["JobStatus"]
            logger.info("Job %s status is %s.", job_id, job_status)
        except ClientError:
            logger.exception("Couldn't get data for job %s.", job_id)
            raise
        return response
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc