74df76c8-4768-48d5-bb7f-5ee50aa05217

Committed 06 Nov 2023 01:38PM UTC coverage: 67.523% (-0.3%) from 67.801%

Build # 74df76c8-4768-48d5-bb7f-5ee50aa05217

Build Type

Pull #2177

circleci

Committed by

VirginiaDooley

Commit Message

Create TextractResults model

Pull Request #2177: Spike: AWS Textract

Run Details

1640 of 2760 branches covered (0.0%)

Branch coverage included in aggregate %.

12 of 62 new or added lines in 3 files covered. (19.35%)

110 existing lines in 10 files now uncovered.

6662 of 9535 relevant lines covered (69.87%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/ynr/apps/sopn_parsing/management/commands/sopn_parsing_aws_textract.py

NEW 1	import os	×
NEW 2	from asyncio.log import logger	×
NEW 3	from time import sleep	×
4
NEW 5	import boto3	×
NEW 6	from botocore.exceptions import ClientError	×
NEW 7	from django.core.management.base import BaseCommand	×
8
9	# url_test_sopn = (
10	# "https://www.bury.gov.uk/council-and-democracy/elections-and-voting/statement-of-persons-nominated/"
11	# )
12
13	# this is an html saved as a pdf
# Local file that the command uploads and analyses.
test_sopn = "BurySOPN.pdf"

# File extensions listed as acceptable input for analysis.
# NOTE(review): nothing in the visible code checks against this list yet.
accepted_file_types = [".pdf", ".jpg", ".jpeg", ".png", ".tif", ".tiff"]

# Module-level AWS clients, created once at import time.
s3 = boto3.client("s3")
textract_client = boto3.client("textract")

# Explicit session built from credentials found in the environment; any
# variable that is not set is passed through as None.
_aws_credentials = {
    "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"),
    "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
    "aws_session_token": os.environ.get("AWS_SECURITY_TOKEN"),
}
session = boto3.session.Session(**_aws_credentials)
32
33
class Command(BaseCommand):
    """Spike command: upload a test SOPN to S3 and analyse it with AWS Textract."""

    def handle(self, *args, **options):
        """Entry point called by Django; runs the analysis on ``test_sopn``.

        Django invokes ``handle(*args, **options)`` with the parsed options as
        keyword arguments, so the signature must accept ``**options``.
        """
        self.start_detection(test_sopn)

    def start_detection(self, test_sopn):
        """This is a WIP of Step 1-2 of the SOPN parsing process using AWS Textract.

        Uploads the local file to S3, starts an asynchronous Textract document
        analysis (tables and forms), waits for the job to leave the
        ``IN_PROGRESS`` state and writes every ``TABLE`` block to stdout.

        :param test_sopn: Path of the local document to upload and analyse.
        :return: None.
        :raises botocore.exceptions.ClientError: If a Textract status request
            fails (re-raised by ``get_analysis_job``).
        """
        region = "eu-west-2"
        bucket_name = "public-sopns"
        object_key = "test/test_sopn.pdf"

        # Only the read needs the file handle; close it before any network I/O.
        with open(test_sopn, "rb") as file:
            file_bytes = file.read()

        s3_client = boto3.client("s3", region_name=region)
        s3_client.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=file_bytes,
        )
        self.stdout.write(f"Uploaded bytes to s3://{bucket_name}/{object_key}")

        response = textract_client.start_document_analysis(
            DocumentLocation={
                "S3Object": {
                    "Bucket": bucket_name,
                    "Name": object_key,
                }
            },
            FeatureTypes=["TABLES", "FORMS"],
            OutputConfig={
                "S3Bucket": bucket_name,
                "S3Prefix": "test",
            },
        )
        job_id = response["JobId"]
        self.stdout.write(f"This is the Job ID: {job_id}")

        # Poll while the job is still running. Testing for IN_PROGRESS (rather
        # than for a list of finished states) means any terminal status,
        # including PARTIAL_SUCCESS, ends the loop.
        response = self.get_analysis_job(job_id)
        while response["JobStatus"] == "IN_PROGRESS":
            sleep(5)
            response = self.get_analysis_job(job_id)
        self.stdout.write(str(response))

        if response["JobStatus"] == "FAILED":
            # A failed job has no blocks to inspect; report why and stop.
            logger.error(
                "Job %s failed: %s", job_id, response.get("StatusMessage")
            )
            return

        # find the blocks related to "Tables" in the response and print
        for block in self._iter_blocks(job_id, response):
            if block["BlockType"] == "TABLE":
                self.stdout.write(str(block))
        # at this stage, save the response to the database as described below
        # ///

        # Take the job id, save it on the model
        # move on to the next file
        # ///
        # For any file that is missing a JobId, get the job status
        # if the job status is SUCCEEDED, save the JSON response against the TextractResults model
        # if the job status is FAILED, return status

        # Helper class OfficialDocuments to pass to TextractResults

    def _iter_blocks(self, job_id, response):
        """Yield every block of a job, following ``NextToken`` result pages.

        :param job_id: The ID of the job the response belongs to.
        :param response: The first page returned for that job.
        """
        while True:
            yield from response.get("Blocks", [])
            next_token = response.get("NextToken")
            if not next_token:
                return
            response = self.get_analysis_job(job_id, next_token=next_token)

    def get_analysis_job(self, job_id, max_tries=10, next_token=None):
        """
        Gets data for a previously started detection job that includes additional
        elements.

        Makes a single ``get_document_analysis`` request and logs the job status.

        :param job_id: The ID of the job to retrieve.
        :param max_tries: Kept for backward compatibility; not used, because
                          this method makes exactly one request per call.
        :param next_token: Optional pagination token from a previous response;
                           when given, the next page of results is requested.
        :return: The job data, including a list of blocks that describe elements
                 detected in the image.
        :raises botocore.exceptions.ClientError: If the request fails; the
                 error is logged and re-raised.
        """
        request = {"JobId": job_id}
        if next_token is not None:
            request["NextToken"] = next_token

        # NOTE(review): `logger` is the asyncio package logger imported at the
        # top of the file; a logging.getLogger(__name__) logger would be more
        # conventional.
        try:
            response = textract_client.get_document_analysis(**request)
        except ClientError:
            logger.exception("Couldn't get data for job %s.", job_id)
            raise
        logger.info("Job %s status is %s.", job_id, response["JobStatus"])
        return response

DemocracyClub / yournextrepresentative / 74df76c8-4768-48d5-bb7f-5ee50aa05217

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous