74df76c8-4768-48d5-bb7f-5ee50aa05217

Committed 06 Nov 2023 01:38PM UTC coverage: 67.523% (-0.3%) from 67.801%

Build # 74df76c8-4768-48d5-bb7f-5ee50aa05217

Build Type

Pull #2177

circleci

Committed by

VirginiaDooley

Commit Message

Create TextractResults model

Pull Request Pull Request #2177: Spike: AWS Textract

Run Details

1640 of 2760 branches covered (0.0%)

Branch coverage included in aggregate %.

12 of 62 new or added lines in 3 files covered. (19.35%)

110 existing lines in 10 files now uncovered.

6662 of 9535 relevant lines covered (69.87%)

0.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/ynr/apps/sopn_parsing/management/commands/sopn_tooling_compare_raw_people.py

import json
import os
from collections import Counter

from bulk_adding.models import RawPeople
from candidates.models import Ballot
from django.core.management import call_command
from official_documents.models import OfficialDocument
from popolo.models import Membership
from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand
from sopn_parsing.models import ParsedSOPN


class Command(BaseSOPNParsingCommand):
    """
    Re-run the SOPN parsing commands and compare the result with a baseline.

    The command wipes the previously extracted page numbers, ParsedSOPN and
    RawPeople objects, re-runs the three parsing commands, reports how the
    freshly parsed data differs from the JSON baseline file and finally
    writes a new baseline.
    """

    # Keys of ``self.candidates_results``: one bucket of ballot paper IDs per
    # outcome of ``parties_correct``.
    CORRECT_EXACTLY = "correct_exactly"
    NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing"
    NUM_INCORRECT = "num_incorrect"
    ZERO_CANDIDATES = "zero_candidates"

    def add_arguments(self, parser):
        """Add the ``--loud`` flag on top of the parent command's arguments."""
        super().add_arguments(parser)
        parser.add_argument(
            "--loud",
            action="store_true",
            default=False,
            help="Also report, per ballot, whether the number of parsed "
            "people matches the number of candidates we hold",
        )

    def handle(self, *args, **options):
        """
        - Check we have a baseline file to compare with, creating one if not
        - Reset the OfficialDocument relevant pages, ParsedSOPN and RawPeople
        - Re-parse the documents
        - Loop through the ballots that have a document, comparing the new
        relevant pages and RawPeople data to our baseline
        - Print overall totals
        - Write a new baseline file

        NB nothing in here aborts the run: regressions are only reported, and
        the baseline is rewritten regardless of what the comparison found.
        """

        # "loud" is our own option, so remove it before the remaining options
        # are forwarded to the parsing commands below
        self.loud = options.pop("loud")

        # keyed by the class constants so that parties_correct() and this
        # dict cannot drift apart; insertion order is the print order below
        self.candidates_results = {
            self.CORRECT_EXACTLY: [],
            self.NUM_CORRECT_MISSING_PARTIES: [],
            self.NUM_INCORRECT: [],
            self.ZERO_CANDIDATES: [],
        }

        raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"
        if not os.path.isfile(raw_people_file):
            call_command("sopn_tooling_write_baseline")
            self.stdout.write("Baseline file didn't exist so one was created")

        options.update({"testing": True})

        # clear the output of each parsing step before re-running it
        OfficialDocument.objects.update(relevant_pages="")
        call_command("sopn_parsing_extract_page_numbers", *args, **options)
        ParsedSOPN.objects.all().delete()
        call_command("sopn_parsing_extract_tables", *args, **options)
        RawPeople.objects.all().delete()
        call_command("sopn_parsing_parse_tables", *args, **options)

        with open(raw_people_file) as file:
            old_raw_people = json.load(file)

        self.new_raw_people = {}
        for ballot in Ballot.objects.exclude(officialdocument__isnull=True):
            # a ballot that is not in the baseline is compared to "nothing"
            ballot_data = old_raw_people.get(ballot.ballot_paper_id, {})

            self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data)

            self.compare_raw_people(ballot=ballot, ballot_data=ballot_data)

        # display some overall totals
        old_total = self.count_people_parsed(old_raw_people)
        new_total = self.count_people_parsed(self.new_raw_people)
        self.stdout.write(
            f"Old total 'people' parsed WAS {old_total}\n"
            f"New total 'people' parsed IS {new_total}"
        )

        # only baseline entries that actually contain people are comparable
        # to the number of RawPeople objects
        old_raw_people_obj_count = sum(
            1 for data in old_raw_people.values() if data.get("raw_people")
        )
        new_raw_people_obj_count = RawPeople.objects.count()
        style = self.style.SUCCESS
        if new_raw_people_obj_count < old_raw_people_obj_count:
            style = self.style.ERROR
        self.stdout.write(
            style(
                f"Old RawPeople count: {old_raw_people_obj_count}\n"
                f"New total RawPeople count: {new_raw_people_obj_count}"
            )
        )

        for result, ballots in self.candidates_results.items():
            total = len(ballots)
            self.stdout.write(f"{total} ballots parsed {result}")

        # Write a new baseline
        call_command("sopn_tooling_write_baseline")

    def compare_relevant_pages(self, ballot, ballot_data):
        """
        Warn on stdout when the relevant pages for the ballot's SOPN differ
        from the ones stored in the baseline entry (missing means "").
        """
        old_relevant_pages = ballot_data.get("relevant_pages", "")
        new_relevant_pages = ballot.sopn.relevant_pages

        if old_relevant_pages != new_relevant_pages:
            self.stdout.write(
                self.style.WARNING(
                    f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}"
                )
            )

    def compare_raw_people(self, ballot, ballot_data):
        """
        Report how the newly parsed people for the ballot differ from the
        baseline entry, store the new data in ``self.new_raw_people`` and
        then check the parsed parties via ``parties_correct``.
        """
        try:
            raw_people = ballot.rawpeople.data
        except RawPeople.DoesNotExist:
            # nothing was parsed for this ballot
            raw_people = []

        old_raw_people_for_ballot = ballot_data.get("raw_people", [])
        old_count = len(old_raw_people_for_ballot)
        new_count = len(raw_people)
        if new_count < old_count:
            # the run carries on after this, so the message must not claim
            # that we are stopping
            self.stderr.write(
                f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}."
            )

        if new_count > old_count:
            self.stdout.write(
                f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n"
                f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}."
            )
            for person in raw_people:
                if person not in old_raw_people_for_ballot:
                    # person is a dict; when colour is disabled the style
                    # function returns its argument unchanged and
                    # stdout.write() needs a string, so convert it first
                    self.stdout.write(self.style.SUCCESS(str(person)))

        # when people parsed have changed e.g. different name/different party print it for further checking
        changed_people = [
            person
            for person in old_raw_people_for_ballot
            if person not in raw_people
        ]
        if changed_people:
            self.stdout.write(
                self.style.WARNING(
                    f"Parsed data changed for {ballot.ballot_paper_id}\n"
                    f"New raw people data:\n"
                    f"{raw_people}\n"
                    "Missing people:"
                )
            )
            for person in changed_people:
                self.stderr.write(str(person))

        self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people}

        self.parties_correct(ballot, raw_people)

    def count_people_parsed(self, raw_people_data):
        """
        Returns the total number of "people" that were parsed.
        NB that just because something was parsed, it doesnt mean that it was
        accurately parsed. Therefore this total is best used to look for large
        changes that should then be checked in detail.

        ``raw_people_data`` maps ballot paper IDs to dicts holding a
        "raw_people" list; an entry without that key counts as zero people.
        """
        return sum(
            len(data.get("raw_people", []))
            for data in raw_people_data.values()
        )

    def parties_correct(self, ballot, raw_people_for_ballot):
        """
        Compare the party IDs parsed for the ballot with the party EC IDs of
        the candidates (Memberships) we hold for it, and file the ballot paper
        ID under the matching key of ``self.candidates_results``.

        Always returns None.
        """
        ballot_paper_id = ballot.ballot_paper_id
        candidates = Membership.objects.filter(ballot=ballot)
        if not candidates.exists():
            self.stdout.write(
                self.style.WARNING(
                    f"We dont have candidates for {ballot_paper_id}. Try updating with the live site first?"
                )
            )

        if not raw_people_for_ballot:
            self.candidates_results[self.ZERO_CANDIDATES].append(
                ballot_paper_id
            )
            return None

        num_candidates_correct = candidates.count() == len(
            raw_people_for_ballot
        )

        if self.loud:
            if num_candidates_correct:
                self.stdout.write(
                    self.style.SUCCESS(
                        f"Correct number of people parsed as expected for {ballot_paper_id}"
                    )
                )
            else:
                self.stdout.write(
                    self.style.ERROR(
                        f"Incorrect number of people parsed for {ballot_paper_id}"
                    )
                )

        # Compare as multisets rather than as sorted lists: the database may
        # order the IDs differently from Python's sorted(), which would make
        # two identical sets of parties look different.
        # count number of each party ID as there could be more than one
        # candidate for a party e.g. 1 Green, 2 independents
        parsed = Counter(person["party_id"] for person in raw_people_for_ballot)
        expected = Counter(candidates.values_list("party__ec_id", flat=True))

        if parsed == expected:
            self.candidates_results[self.CORRECT_EXACTLY].append(
                ballot_paper_id
            )
            return None

        missing = expected - parsed
        if missing:
            total = sum(missing.values())
            self.stderr.write(
                f"{total} MISSING parties for {ballot_paper_id} (party_id:num_missing)\n{missing}"
            )
        else:
            # sometimes we incorrectly parse extra people - often independents
            # due to an empty row
            extras = parsed - expected
            total = sum(extras.values())
            self.stderr.write(
                f"{total} EXTRA parties for {ballot_paper_id}\n{extras}"
            )

        if num_candidates_correct:
            result = self.NUM_CORRECT_MISSING_PARTIES
        else:
            result = self.NUM_INCORRECT
        self.candidates_results[result].append(ballot_paper_id)
        return None

1	import json	×
2	import os	×
3	from collections import Counter	×
4
5	from bulk_adding.models import RawPeople	×
6	from candidates.models import Ballot	×
7	from django.core.management import call_command	×
8	from official_documents.models import OfficialDocument	×
9	from popolo.models import Membership	×
10	from sopn_parsing.helpers.command_helpers import BaseSOPNParsingCommand	×
11	from sopn_parsing.models import ParsedSOPN	×
12
13
14	class Command(BaseSOPNParsingCommand):	×
15	CORRECT_EXACTLY = "correct_exactly"	×
16	NUM_CORRECT_MISSING_PARTIES = "num_correct_some_parties_missing"	×
17	NUM_INCORRECT = "num_incorrect"	×
18	ZERO_CANDIDATES = "zero_candidates"	×
19
20	def add_arguments(self, parser):	×
21	super().add_arguments(parser)	×
22	parser.add_argument("--loud", action="store_true", default=False)	×
23
24	def handle(self, *args, **options):	×
25	"""
26	- Check we have a baseline file to compare with
27	- Prepare some OfficialDocuments
28	- Re-parse the documents
29	- Loop through the created RawPeople objects, comparing to our baseline
30	to make sure that we are parsing at least as many people as before
31	- If no asserts failed, use the data to write a new baseline file
32	"""
33
34	self.loud = options.pop("loud")	×
35
36	self.candidates_results = {	×
37	"correct_exactly": [],
38	"num_correct_some_parties_missing": [],
39	"num_incorrect": [],
40	"zero_candidates": [],
41	}
42
43	raw_people_file = "ynr/apps/sopn_parsing/tests/data/sopn_baseline.json"	×
44	if not os.path.isfile(raw_people_file):	×
45	call_command("sopn_tooling_write_baseline")	×
46	self.stdout.write("Baseline file didn't exist so one was created")	×
47
48	options.update({"testing": True})	×
49
50	OfficialDocument.objects.update(relevant_pages="")	×
51	call_command("sopn_parsing_extract_page_numbers", *args, **options)	×
52	ParsedSOPN.objects.all().delete()	×
53	call_command("sopn_parsing_extract_tables", *args, **options)	×
54	RawPeople.objects.all().delete()	×
55	call_command("sopn_parsing_parse_tables", *args, **options)	×
56
UNCOV 57	with open(raw_people_file) as file:	×
58	old_raw_people = json.loads(file.read())	×
59
UNCOV 60	self.new_raw_people = {}	×
61	for ballot in Ballot.objects.exclude(officialdocument__isnull=True):	×
62	ballot_data = old_raw_people.get(ballot.ballot_paper_id, {})	×
63
UNCOV 64	self.compare_relevant_pages(ballot=ballot, ballot_data=ballot_data)	×
65
UNCOV 66	self.compare_raw_people(ballot=ballot, ballot_data=ballot_data)	×
67
68	# display some overall totals
UNCOV 69	self.stdout.write(	×
70	"Old total 'people' parsed WAS {old}\n"
71	"New total 'people' parsed IS {new}".format(
72	old=self.count_people_parsed(old_raw_people),
73	new=self.count_people_parsed(self.new_raw_people),
74	)
75	)
76
UNCOV 77	old_raw_people_obj_count = len(	×
78	{k: v for k, v in old_raw_people.items() if v["raw_people"]}
79	)
UNCOV 80	new_raw_people_obj_count = RawPeople.objects.count()	×
81	style = self.style.SUCCESS	×
82	if new_raw_people_obj_count < old_raw_people_obj_count:	×
83	style = self.style.ERROR	×
84	self.stdout.write(	×
85	style(
86	f"Old RawPeople count: {old_raw_people_obj_count}\n"
87	f"New total RawPeople count: {new_raw_people_obj_count}"
88	)
89	)
90
UNCOV 91	for result, ballots in self.candidates_results.items():	×
92	total = len(ballots)	×
93	self.stdout.write(f"{total} ballots parsed {result}")	×
94	# Write a new baseline
UNCOV 95	call_command("sopn_tooling_write_baseline")	×
96
UNCOV 97	def compare_relevant_pages(self, ballot, ballot_data):	×
98	old_relevant_pages = ballot_data.get("relevant_pages", "")	×
99	new_relevant_pages = ballot.sopn.relevant_pages	×
100
UNCOV 101	if old_relevant_pages != new_relevant_pages:	×
102	self.stdout.write(	×
103	self.style.WARNING(
104	f"RELEVANT PAGES CHANGED FROM {old_relevant_pages} to {new_relevant_pages} for {ballot.ballot_paper_id}"
105	)
106	)
107
UNCOV 108	def compare_raw_people(self, ballot, ballot_data):	×
109	try:	×
110	raw_people = ballot.rawpeople.data	×
111	except RawPeople.DoesNotExist:	×
112	raw_people = []	×
113
UNCOV 114	old_raw_people_for_ballot = ballot_data.get("raw_people", [])	×
115	old_count = len(old_raw_people_for_ballot)	×
116	new_count = len(raw_people)	×
117	if new_count < old_count:	×
118	self.stderr.write(	×
119	f"Uh oh, parsed people for {ballot.ballot_paper_id} decreased from {old_count} to {new_count}. Stopping."
120	)
121
UNCOV 122	if new_count > old_count:	×
123	self.stdout.write(	×
124	f"{ballot.ballot_paper_id} increased from {old_count} to {new_count} parsed people.\n"
125	f"Check the SOPN at https://candidates.democracyclub.org.uk{ballot.get_sopn_url()}."
126	)
UNCOV 127	for person in raw_people:	×
128	if person not in old_raw_people_for_ballot:	×
129	self.stdout.write(self.style.SUCCESS(person))	×
130
131	# when people parsed have changed e.g. different name/different party print it for further checking
UNCOV 132	changed_people = [	×
133	person
134	for person in old_raw_people_for_ballot
135	if person not in raw_people
136	]
UNCOV 137	if changed_people:	×
138	self.stdout.write(	×
139	self.style.WARNING(
140	f"Parsed data changed for {ballot.ballot_paper_id}\n"
141	f"New raw people data:\n"
142	f"{raw_people}\n"
143	"Missing people:"
144	)
145	)
UNCOV 146	for person in changed_people:	×
147	self.stderr.write(str(person))	×
148
UNCOV 149	self.new_raw_people[ballot.ballot_paper_id] = {"raw_people": raw_people}	×
150
UNCOV 151	self.parties_correct(ballot, raw_people)	×
152
UNCOV 153	def count_people_parsed(self, raw_people_data):	×
154	"""
155	Returns the total number of "people" that were parsed.
156	NB that just because something was parsed, it doesnt mean that it was
157	accurately parsed. Therefore this total is best used to look for large
158	changes that should then be checked in detail.
159	"""
UNCOV 160	return sum(	×
161	[len(data["raw_people"]) for data in raw_people_data.values()]
162	)
163
UNCOV 164	def parties_correct(self, ballot, raw_people_for_ballot):	×
165	candidates = Membership.objects.filter(ballot=ballot)	×
166	if not candidates:	×
167	self.stdout.write(	×
168	self.style.WARNING(
169	f"We dont have candidates for {ballot.ballot_paper_id}. Try updating with the live site first?"
170	)
171	)
172
UNCOV 173	if not raw_people_for_ballot:	×
174	self.candidates_results[self.ZERO_CANDIDATES].append(	×
175	ballot.ballot_paper_id
176	)
UNCOV 177	return None	×
178
UNCOV 179	num_candidates_correct = candidates.count() == len(	×
180	raw_people_for_ballot
181	)
182
UNCOV 183	if self.loud:	×
184	if num_candidates_correct:	×
185	self.stdout.write(	×
186	self.style.SUCCESS(
187	f"Correct number of people parsed as expected for {ballot.ballot_paper_id}"
188	)
189	)
190	else:
UNCOV 191	self.stdout.write(	×
192	self.style.ERROR(
193	f"Incorrect number of people parsed for {ballot.ballot_paper_id}"
194	)
195	)
196
UNCOV 197	parsed = sorted(	×
198	[person["party_id"] for person in raw_people_for_ballot]
199	)
UNCOV 200	expected = list(	×
201	candidates.values_list("party__ec_id", flat=True).order_by(
202	"party__ec_id"
203	)
204	)
205
UNCOV 206	if parsed == expected:	×
207	return self.candidates_results[self.CORRECT_EXACTLY].append(	×
208	ballot.ballot_paper_id
209	)
210
211	# count number of each missing party ID as there could be more than one
212	# missing candidate for a party e.g. 1 missing Green, 2 missing independents
UNCOV 213	parsed = Counter(parsed)	×
214	expected = Counter(expected)	×
215	missing = expected - parsed	×
216	if missing:	×
217	total = sum(missing.values())	×
218	self.stderr.write(	×
219	f"{total} MISSING parties for {ballot.ballot_paper_id} (party_id:num_missing)\n{missing}"
220	)
221	else:
222	# sometimes we incorrectly parse extra people - often independents
223	# due to an empty row
UNCOV 224	extras = parsed - expected	×
225	total = sum(extras.values())	×
226	self.stderr.write(	×
227	f"{total} EXTRA parties for {ballot.ballot_paper_id}\n{extras}"
228	)
229
UNCOV 230	if num_candidates_correct:	×
231	return self.candidates_results[	×
232	self.NUM_CORRECT_MISSING_PARTIES
233	].append(ballot.ballot_paper_id)
234
UNCOV 235	return self.candidates_results[self.NUM_INCORRECT].append(	×
236	ballot.ballot_paper_id
237	)

DemocracyClub / yournextrepresentative / 74df76c8-4768-48d5-bb7f-5ee50aa05217

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous