• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / EveryElection / f1338858-c0fa-44cb-ba00-f6491924f61b

19 Aug 2025 09:05AM UTC coverage: 72.061% (+0.3%) from 71.724%
f1338858-c0fa-44cb-ba00-f6491924f61b

Pull #2498

circleci

awdem
add tippecanoe to CI
Pull Request #2498: Feat/divset pmtiles

98 of 110 new or added lines in 6 files covered. (89.09%)

11 existing lines in 1 file now uncovered.

3598 of 4993 relevant lines covered (72.06%)

0.72 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.5
/every_election/apps/organisations/boundaries/boundary_bot/spider.py
1
import json
1✔
2
import tempfile
1✔
3

4
import scrapy
1✔
5
from organisations.boundaries.boundary_bot.common import (
1✔
6
    REQUEST_HEADERS,
7
    START_PAGE,
8
)
9
from organisations.boundaries.constants import LGBCE_SLUG_TO_ORG_SLUG
1✔
10
from organisations.models.divisions import ReviewStatus
1✔
11
from scrapy.crawler import CrawlerProcess
1✔
12

13

14
def get_link_from_container_label(label, response, link_div_class):
    """Collect hrefs from the link container that matches *label*.

    The LGBCE site renders legislation links as nested containers, e.g.:

    <div class="link-name-and-view-container">
      <div class="link-name-container">
        <div class="link-title">The Mole Valley (Electoral Changes) Order 2023</div>
      </div>
      <div class="link-view-container">
        <a href="https://www.legislation.gov.uk/uksi/2023/49/contents/made">View</a>
      </div>
    </div>

    We locate the div of class *link_div_class* whose text contains *label*
    (case-insensitively, via an XPath 1.0 translate() lowercasing trick),
    walk up to its grandparent and return every anchor href found beneath
    it. The caller is responsible for checking that exactly one link
    came back.
    """
    # XPath 1.0 has no lower-case(); emulate it by translating A-Z to a-z.
    uppercase = '"ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
    lowercase = '"abcdefghijklmnopqrstuvwxyz"'
    query = (
        f'//div[@class="{link_div_class}"]'
        f"[contains(translate(text(),{uppercase},{lowercase}, "
        f'"{label.lower()}")]/../..//a/@href'
    )
    # Oops — assemble carefully so the query is exactly the canonical form.
    query = (
        f'//div[@class="{link_div_class}"]'
        f"[contains(translate(text(),{uppercase},{lowercase}), "
        f'"{label.lower()}")]/../..//a/@href'
    )
    return response.xpath(query).extract()
41

42

43
class LgbceSpider(scrapy.Spider):
    """Spider for LGBCE electoral review pages.

    Starts at START_PAGE, and for every page with a recognised review
    status yields a dict describing the review (slug, latest event,
    boundary shapefile URL, status and legislation details), then
    follows links into further 'all-reviews' pages.
    """

    name = "reviews"
    custom_settings = {
        "CONCURRENT_REQUESTS": 5,  # keep the concurrent requests low
        "DOWNLOAD_DELAY": 0.25,  # throttle the crawl speed a bit
        "COOKIES_ENABLED": False,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
        "FEED_FORMAT": "json",
        "DEFAULT_REQUEST_HEADERS": REQUEST_HEADERS,
        # "HTTPCACHE_ENABLED": True,  # Uncomment for Dev
    }
    allowed_domains = ["lgbce.org.uk"]
    start_urls = [START_PAGE]

    def get_shapefiles(self, response):
        """Return the URL of the final boundaries zipfile, or None when
        it cannot be identified unambiguously."""
        # find any zipfile links in divs that also have a header element containing the text 'Final'
        zipfiles = response.xpath(
            "/html/body//div[h4[contains(text(), 'Final')]]//a[contains(@href,'.zip')]/@href"
        ).extract()

        if len(zipfiles) == 1:
            # if we found exactly one link to a zipfile,
            # assume that's what we're looking for
            return zipfiles[0]

        # zero or several candidates: too ambiguous to pick one
        return None

    def get_latest_event(self, response):
        """Return the heading text of the stage marked as latest on the
        page, or None when no such stage exists.

        NOTE(review): the [0] index assumes the latest-stage div always
        contains an <h3> with a text node — confirm against live pages.
        """
        latest_stage = response.css("div.stage-latest")
        if latest_stage:
            return (
                latest_stage.css("div > div > a > h3")
                .xpath("text()")[0]
                .extract()
                .strip()
            )
        return None

    def get_eco_title_and_link(self, response, latest_event):
        """Return (title, url) for the review's legislation link, or
        (None, None) when no single unambiguous link can be found."""

        def get_link_title(selector):
            # text of the div.link-title inside one link container
            return selector.xpath(
                '*/div[@class="link-title"]//text()'
            ).extract_first()

        def get_link(selector):
            # href of the anchor inside one link container
            return selector.xpath("*/a/@href").extract_first()

        def is_relevant_review(title):
            # titles of the order types we care about
            return (
                "(electoral changes) order" in title.lower()
                or "(structural changes) order" in title.lower()
                or "greater london authority"  # edge case to handle https://www.lgbce.org.uk/all-reviews/greater-london-authority
                in title.lower()
            )

        # every (title, href) pair in the 'latest information' section
        links = [
            (get_link_title(selector), get_link(selector))
            for selector in response.xpath(
                '//div[@class="latest-information"]//div[@class="link-name-and-view-container"]'
            )
        ]

        made_ecos = [
            (title, link) for title, link in links if is_relevant_review(title)
        ]

        if latest_event == "Effective date" and len(made_ecos) == 1:
            # This catches draft links and made links.
            # Sometimes they put a draft link in where the made link should go.
            # So, if the change is 'effective', and we only have a draft link,
            # use it
            return made_ecos[0]

        # otherwise drop links whose URL contains 'ukdsi' (presumably
        # draft statutory instrument URLs on legislation.gov.uk — verify)
        made_ecos = [
            (title, link)
            for title, link in made_ecos
            if is_relevant_review(title) and "ukdsi" not in link
        ]

        if len(made_ecos) == 1:
            return made_ecos[0]

        # zero or several candidates remain: give up rather than guess
        return None, None

    def get_status(self, response):
        """Map the page's status banner text to a ReviewStatus member,
        or None for anything unrecognised."""
        lgbce_status = response.css("div.status::text")
        if lgbce_status:
            lgbce_status = lgbce_status.extract_first().strip()
        # if the selector matched nothing, lgbce_status is still an empty
        # SelectorList here and falls through to the default case below
        match lgbce_status:
            case "Currently in review":
                return ReviewStatus.CURRENT
            case "Completed":
                return ReviewStatus.COMPLETED
            case _:
                return None

    def parse(self, response):
        """Yield a record for this review page (when it has a recognised
        status), then follow links to further all-reviews pages."""
        status = self.get_status(response)
        if status:
            latest_event = self.get_latest_event(response)
            if (
                latest_event == "Initial consultation"
                and status == ReviewStatus.COMPLETED
            ):
                # a review whose latest event is still the initial
                # consultation is treated as current even when the page
                # labels it completed
                status = ReviewStatus.CURRENT
            legislation_title, legislation_url = self.get_eco_title_and_link(
                response, latest_event
            )
            # last URL path segment is LGBCE's identifier for the review
            lgbce_slug = response.url.split("/")[-1]
            try:
                # translate LGBCE's slug to our org slug where they differ
                slug = LGBCE_SLUG_TO_ORG_SLUG[lgbce_slug]
            except KeyError:
                slug = lgbce_slug
            rec = {
                "slug": slug,
                "latest_event": latest_event,
                "boundaries_url": self.get_shapefiles(response),
                "status": status,
                "legislation_url": legislation_url,
                "legislation_made": 0,
                "legislation_title": legislation_title,
            }

            # having found a legislation link implies the order was made
            if rec["legislation_url"]:
                rec["legislation_made"] = 1

            yield rec
        # crawl onwards, but only into links under the all-reviews section
        for next_page in response.css("div.letter_section > div > a"):
            if "all-reviews" in next_page.extract():
                yield response.follow(next_page, self.parse)
173

174

175
class SpiderWrapper:
    """Runs a scrapy spider synchronously and hands back its scraped
    items as a plain Python list."""

    def __init__(self, spider):
        # the spider class that run_spider() will crawl
        self.spider = spider

    def run_spider(self):
        # Scrapy wants to serialise its results to a feed, so point the
        # feed at a temporary file and parse that file once the crawl
        # finishes. (A custom Exporter would be the 'proper' approach,
        # but this is good enough for now.)
        with tempfile.NamedTemporaryFile() as outfile:
            crawler = CrawlerProcess(
                {
                    "FEED_URI": outfile.name,
                }
            )
            crawler.crawl(self.spider)
            crawler.start()

            outfile.seek(0)
            return json.load(outfile)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc