• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

DemocracyClub / EveryElection / f1338858-c0fa-44cb-ba00-f6491924f61b

19 Aug 2025 09:05AM UTC coverage: 72.061% (+0.3%) from 71.724%
f1338858-c0fa-44cb-ba00-f6491924f61b

Pull #2498

circleci

awdem
add tippecanoe to CI
Pull Request #2498: Feat/divset pmtiles

98 of 110 new or added lines in 6 files covered. (89.09%)

11 existing lines in 1 file now uncovered.

3598 of 4993 relevant lines covered (72.06%)

0.72 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

82.5
/every_election/apps/organisations/boundaries/boundary_bot/spider.py
1
import json
1✔
2
import tempfile
1✔
3

4
import scrapy
1✔
5
from organisations.boundaries.boundary_bot.common import (
1✔
6
    REQUEST_HEADERS,
7
    START_PAGE,
8
)
9
from organisations.boundaries.constants import LGBCE_SLUG_TO_ORG_SLUG
1✔
10
from organisations.models.divisions import ReviewStatus
1✔
11
from scrapy.crawler import CrawlerProcess
1✔
12

13

14
def get_link_from_container_label(label, response, link_div_class):
    """Collect hrefs from the link container that matches *label*.

    The LGBCE site renders legislation links as nested containers, e.g.:

    <div class="link-name-and-view-container">
      <div class="link-name-container">
        <div class="link-title">The Mole Valley (Electoral Changes) Order 2023</div>
      </div>
      <div class="link-view-container">
        <a href="https://www.legislation.gov.uk/uksi/2023/49/contents/made">View</a>
      </div>
    </div>

    We locate the div of class *link_div_class* whose text contains *label*
    (case-insensitively, via an XPath 1.0 translate() lowercasing trick),
    walk up to its grandparent and return every anchor href found beneath
    it. The caller is responsible for checking that exactly one link
    came back.
    """
    # XPath 1.0 has no lower-case(); emulate it by translating A-Z to a-z.
    uppercase = '"ABCDEFGHIJKLMNOPQRSTUVWXYZ"'
    lowercase = '"abcdefghijklmnopqrstuvwxyz"'
    query = (
        f'//div[@class="{link_div_class}"]'
        f"[contains(translate(text(),{uppercase},{lowercase}, "
        f'"{label.lower()}")]/../..//a/@href'
    )
    # Oops — assemble carefully so the query is exactly the canonical form.
    query = (
        f'//div[@class="{link_div_class}"]'
        f"[contains(translate(text(),{uppercase},{lowercase}), "
        f'"{label.lower()}")]/../..//a/@href'
    )
    return response.xpath(query).extract()
41

42

43
class LgbceSpider(scrapy.Spider):
    """Spider for LGBCE electoral review pages.

    Starts at START_PAGE, and for every page with a recognised review
    status yields a dict describing the review (slug, latest event,
    boundary shapefile URL, status and legislation details), then
    follows links into further 'all-reviews' pages.
    """

    name = "reviews"
    custom_settings = {
        "CONCURRENT_REQUESTS": 5,  # keep the concurrent requests low
        "DOWNLOAD_DELAY": 0.25,  # throttle the crawl speed a bit
        "COOKIES_ENABLED": False,
        "USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0",
        "FEED_FORMAT": "json",
        "DEFAULT_REQUEST_HEADERS": REQUEST_HEADERS,
        # "HTTPCACHE_ENABLED": True,  # Uncomment for Dev
    }
    allowed_domains = ["lgbce.org.uk"]
    start_urls = [START_PAGE]

    def get_shapefiles(self, response):
        """Return the URL of the final boundaries zipfile, or None when
        it cannot be identified unambiguously."""
        # find any zipfile links in divs that also have a header element containing the text 'Final'
        zipfiles = response.xpath(
            "/html/body//div[h4[contains(text(), 'Final')]]//a[contains(@href,'.zip')]/@href"
        ).extract()

        if len(zipfiles) == 1:
            # if we found exactly one link to a zipfile,
            # assume that's what we're looking for
            return zipfiles[0]

        # zero or several candidates: too ambiguous to pick one
        return None

    def get_latest_event(self, response):
        """Return the heading text of the stage marked as latest on the
        page, or None when no such stage exists.

        NOTE(review): the [0] index assumes the latest-stage div always
        contains an <h3> with a text node — confirm against live pages.
        """
        latest_stage = response.css("div.stage-latest")
        if latest_stage:
            return (
                latest_stage.css("div > div > a > h3")
                .xpath("text()")[0]
                .extract()
                .strip()
            )
        return None

    def get_eco_title_and_link(self, response, latest_event):
        """Return (title, url) for the review's legislation link, or
        (None, None) when no single unambiguous link can be found."""

        def get_link_title(selector):
            # text of the div.link-title inside one link container
            return selector.xpath(
                '*/div[@class="link-title"]//text()'
            ).extract_first()

        def get_link(selector):
            # href of the anchor inside one link container
            return selector.xpath("*/a/@href").extract_first()

        def is_relevant_review(title):
            # titles of the order types we care about
            return (
                "(electoral changes) order" in title.lower()
                or "(structural changes) order" in title.lower()
                or "greater london authority"  # edge case to handle https://www.lgbce.org.uk/all-reviews/greater-london-authority
                in title.lower()
            )

        # every (title, href) pair in the 'latest information' section
        links = [
            (get_link_title(selector), get_link(selector))
            for selector in response.xpath(
                '//div[@class="latest-information"]//div[@class="link-name-and-view-container"]'
            )
        ]

        made_ecos = [
            (title, link) for title, link in links if is_relevant_review(title)
        ]

        if latest_event == "Effective date" and len(made_ecos) == 1:
            # This catches draft links and made links.
            # Sometimes they put a draft link in where the made link should go.
            # So, if the change is 'effective', and we only have a draft link,
            # use it
            return made_ecos[0]

        # otherwise drop links whose URL contains 'ukdsi' (presumably
        # draft statutory instrument URLs on legislation.gov.uk — verify)
        made_ecos = [
            (title, link)
            for title, link in made_ecos
            if is_relevant_review(title) and "ukdsi" not in link
        ]

        if len(made_ecos) == 1:
            return made_ecos[0]

        # zero or several candidates remain: give up rather than guess
        return None, None

    def get_status(self, response):
        """Map the page's status banner text to a ReviewStatus member,
        or None for anything unrecognised."""
        lgbce_status = response.css("div.status::text")
        if lgbce_status:
            lgbce_status = lgbce_status.extract_first().strip()
        # if the selector matched nothing, lgbce_status is still an empty
        # SelectorList here and falls through to the default case below
        match lgbce_status:
            case "Currently in review":
                return ReviewStatus.CURRENT
            case "Completed":
                return ReviewStatus.COMPLETED
            case _:
                return None

    def parse(self, response):
        """Yield a record for this review page (when it has a recognised
        status), then follow links to further all-reviews pages."""
        status = self.get_status(response)
        if status:
            latest_event = self.get_latest_event(response)
            if (
                latest_event == "Initial consultation"
                and status == ReviewStatus.COMPLETED
            ):
                # a review whose latest event is still the initial
                # consultation is treated as current even when the page
                # labels it completed
                status = ReviewStatus.CURRENT
            legislation_title, legislation_url = self.get_eco_title_and_link(
                response, latest_event
            )
            # last URL path segment is LGBCE's identifier for the review
            lgbce_slug = response.url.split("/")[-1]
            try:
                # translate LGBCE's slug to our org slug where they differ
                slug = LGBCE_SLUG_TO_ORG_SLUG[lgbce_slug]
            except KeyError:
                slug = lgbce_slug
            rec = {
                "slug": slug,
                "latest_event": latest_event,
                "boundaries_url": self.get_shapefiles(response),
                "status": status,
                "legislation_url": legislation_url,
                "legislation_made": 0,
                "legislation_title": legislation_title,
            }

            # having found a legislation link implies the order was made
            if rec["legislation_url"]:
                rec["legislation_made"] = 1

            yield rec
        # crawl onwards, but only into links under the all-reviews section
        for next_page in response.css("div.letter_section > div > a"):
            if "all-reviews" in next_page.extract():
                yield response.follow(next_page, self.parse)
173

174

175
class SpiderWrapper:
    """Runs a scrapy spider synchronously and hands back its scraped
    items as a plain Python list."""

    def __init__(self, spider):
        # the spider class that run_spider() will crawl
        self.spider = spider

    def run_spider(self):
        # Scrapy wants to serialise its results to a feed, so point the
        # feed at a temporary file and parse that file once the crawl
        # finishes. (A custom Exporter would be the 'proper' approach,
        # but this is good enough for now.)
        with tempfile.NamedTemporaryFile() as outfile:
            crawler = CrawlerProcess(
                {
                    "FEED_URI": outfile.name,
                }
            )
            crawler.crawl(self.spider)
            crawler.start()

            outfile.seek(0)
            return json.load(outfile)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc