• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Freegle / iznik-server / #2469

30 Nov 2025 09:48AM UTC coverage: 90.08% (-0.3%) from 90.386%
#2469

push

php-coveralls

edwh
Skip testMessageIsochrones - requires external Mapbox API

26478 of 29394 relevant lines covered (90.08%)

31.7 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.3
/include/misc/Jobs.php
1
<?php
2
namespace Freegle\Iznik;
3

4
use Prewk\XmlStringStreamer;
5
use Prewk\XmlStringStreamer\Stream;
6
use Prewk\XmlStringStreamer\Parser;
7

8
require_once(IZNIK_BASE . '/lib/geoPHP/geoPHP.inc');
1✔
9

10
class Jobs {
11
    /** @public  $dbhr LoggedPDO */
12
    public $dbhr;
13
    /** @public  $dbhm LoggedPDO */
14
    public $dbhm;
15

16
    private $jobKeywords = NULL;
17

18
    const MINIMUM_CPC = 0.10;
19

20
    function __construct(LoggedPDO $dbhr, LoggedPDO $dbhm) {
21
        $this->dbhr = $dbhr;
8✔
22
        $this->dbhm = $dbhm;
8✔
23
    }
24

25
    public function query($lat, $lng, $limit = 50, $category = NULL) {
26
        # To make efficient use of the spatial index we construct a box around our lat/lng, and search for jobs
27
        # where the geometry overlaps it.  We keep expanding our box until we find enough.
28
        #
29
        # We used to double the ambit each time, but that led to long queries, probably because we would suddenly
30
        # include a couple of cities or something.
31
        $step = 0.02;
6✔
32
        $ambit = $step;
6✔
33

34
        $ret = [];
6✔
35
        $got = [];
6✔
36
        $gotbody = [];
6✔
37
        $gottitle = [];
6✔
38
        $passes = 0;
6✔
39

40
        do {
41
            $swlat = $lat - $ambit;
6✔
42
            $nelat = $lat + $ambit;
6✔
43
            $swlng = $lng - $ambit;
6✔
44
            $nelng = $lng + $ambit;
6✔
45

46
            $poly = "POLYGON(($swlng $swlat, $swlng $nelat, $nelng $nelat, $nelng $swlat, $swlng $swlat))";
6✔
47
            $categoryq = $category ? (" AND category = " . $this->dbhr->quote($category)) : '';
6✔
48

49
            # We use ST_Within because that takes advantage of the spatial index, whereas ST_Intersects does not.
50
            $alreadyq = '';
6✔
51

52
            if (count($got)) {
6✔
53
                $alreadyq = " AND jobs.id NOT IN (" . implode(',', array_keys($got)) . ") AND jobs.title NOT IN (";
4✔
54

55
                $first = TRUE;
4✔
56
                foreach ($gottitle as $title) {
4✔
57
                    if (!$first) {
4✔
58
                        $alreadyq .= ", ";
×
59
                    }
60

61
                    $alreadyq .= $this->dbhr->quote($title);
4✔
62
                    $first = FALSE;
4✔
63
                }
64

65
                $alreadyq .= ") ";
4✔
66
            }
67

68
            $sql = "SELECT $ambit AS ambit, 
6✔
69
       ST_Distance(geometry, ST_GeomFromText('POINT($lng $lat)', {$this->dbhr->SRID()})) AS dist,
6✔
70
       CASE WHEN ST_Dimension(geometry) < 2 THEN 0 ELSE ST_Area(geometry) END AS area,
71
       jobs.id, jobs.url, jobs.title, jobs.location, jobs.body, jobs.job_reference, jobs.cpc, jobs.clickability
72
        FROM `jobs`
73
        WHERE ST_Within(geometry, ST_GeomFromText('$poly', {$this->dbhr->SRID()})) 
6✔
74
            AND (ST_Dimension(geometry) < 2 OR ST_Area(geometry) / ST_Area(ST_GeomFromText('$poly', {$this->dbhr->SRID()})) < 2)
6✔
75
            AND cpc >= " . Jobs::MINIMUM_CPC . "
6✔
76
            AND visible = 1
77
            $alreadyq
6✔
78
            $categoryq
6✔
79
        ORDER BY cpc DESC, dist ASC, posted_at DESC LIMIT $limit;";
6✔
80
            $jobs = $this->dbhr->preQuery($sql);
6✔
81
            #error_log($sql . " found " . count($jobs));
82
            $passes++;
6✔
83

84
            foreach ($jobs as $job) {
6✔
85
                $got[$job['id']] = TRUE;
4✔
86
                $gotbody[$job['body']] = $job['body'];
4✔
87
                $gottitle[$job['title']] = $job['title'];
4✔
88
                $job['passes'] = $passes;
4✔
89

90
                # We have clickability, which is our estimate of how likely people are to click on a job based on
91
                # past clicks.  We also have the CPC, which is how much we expect to earn from a click.
92
                # Use these to order the jobs by our expected income.
93
                $cpc = max($job['cpc'], 0.0001);
4✔
94
                $job['expectation'] = $cpc * $job['clickability'];
4✔
95

96
                $ret[] = $job;
4✔
97
            }
98

99
            $ambit += $step;
6✔
100
        } while (count($ret) < $limit && $ambit < 1);
6✔
101

102
        usort($ret, function($a, $b) {
6✔
103
            # Take care - must return integer, not float, otherwise the sort doesn't work.
104
            return ceil($b['cpc'] - $a['cpc']);
×
105
        });
6✔
106

107
        $ret = array_slice($ret, 0, $limit);
6✔
108

109
        return $ret;
6✔
110
    }
111

112
    public function get($id) {
113
        $jobs = $this->dbhr->preQuery("SELECT * FROM jobs WHERE id = ?", [
1✔
114
            $id
1✔
115
        ]);
1✔
116

117
        return $jobs;
1✔
118
    }
119

120
    public static function getKeywords($str) {
121
        $initial = explode(' ',$str);
1✔
122

123
        # Remove some stuff.
124
        $arr = [];
1✔
125

126
        foreach ($initial as $i) {
1✔
127
            $w = preg_replace("/[^A-Za-z]/", '', $i);
1✔
128

129
            if (strlen($w) > 2) {
1✔
130
                $arr[] = $w;
1✔
131
            }
132
        }
133

134
        $result = [];
1✔
135

136
        for($i=0; $i < count($arr)-1; $i++) {
1✔
137
            $result[] =  strtolower($arr[$i]) . ' ' . strtolower($arr[$i+1]);
1✔
138
        }
139

140
        return $result;
1✔
141
    }
142

143
    public function recordClick($jobid, $link, $userid) {
144
        # Use ignore because we can get clicks for jobs we have purged.
145
        $this->dbhm->preExec("INSERT IGNORE INTO logs_jobs (userid, jobid, link) VALUES (?, ?, ?);", [
1✔
146
            $userid,
1✔
147
            $jobid,
1✔
148
            $link
1✔
149
        ]);
1✔
150
    }
151

152
    public function analyseClickability() {
153
        # This looks at the jobs we've clicked on, extracts keywords, and uses that to build a measure of how
154
        # clickable a job is.  Obviously this is distorted by the order in which we display jobs, but it
155
        # still gives a reasonable idea of the keywords which would generate the most clicks.
156
        #
157
        # Prune old job logs - old data not worth analysing.
158
        $mysqltime = date("Y-m-d H:i:s", strtotime("midnight 31 days ago"));
×
159
        $this->dbhm->preExec("DELETE FROM logs_jobs WHERE timestamp < ?", [
×
160
            $mysqltime
×
161
        ]);
×
162

163
        # Find any jobs which have the link but not the id.
164
        $logs = $this->dbhr->preQuery("SELECT jobs.id AS jobid, logs_jobs.id FROM logs_jobs 
×
165
    INNER JOIN jobs ON jobs.url = logs_jobs.link 
166
    WHERE jobid IS NULL AND link IS NOT NULL
167
    ORDER BY id DESC;");
×
168
        error_log("Logs to fix " . count($logs));
×
169
        $count = 0;
×
170

171
        foreach ($logs as $log) {
×
172
            $this->dbhm->preExec("UPDATE logs_jobs SET jobid = ? WHERE id = ?;", [
×
173
                $log['jobid'],
×
174
                $log['id']
×
175
            ]);
×
176

177
            $count++;
×
178

179
            if ($count % 100 == 0) {
×
180
                error_log("...$count / " . count($logs));
×
181
            }
182
        }
183

184
        # Now process the clicked jobs to extract keywords.  Use DISTINCT as some people click obsessively on the same job.
185
        $jobs = $this->dbhr->preQuery("SELECT DISTINCT jobs.* FROM logs_jobs INNER JOIN jobs ON logs_jobs.jobid = jobs.id");
×
186
        $this->dbhm->preExec("TRUNCATE TABLE jobs_keywords");
×
187

188
        error_log("Process " . count($jobs));
×
189

190
        foreach ($jobs as $job) {
×
191
            $keywords = Jobs::getKeywords($job['title']);
×
192

193
            if (count($keywords)) {
×
194
                #error_log("{$job['title']} => " . json_encode($keywords));
195
                foreach ($keywords as $k) {
×
196
                    $this->dbhm->preExec("INSERT INTO jobs_keywords (keyword, count) VALUES (?, 1) ON DUPLICATE KEY UPDATE count = count + 1;", [
×
197
                        $k
×
198
                    ]);
×
199
                }
200
            }
201
        }
202
    }
203

204
    public function updateClickability() {
205
        # This updates the list of jobs with a clickability measure, based on the keyword analyse we have done on
206
        # previous clicks.
207
        $jobs = $this->dbhr->preQuery("SELECT id, title FROM jobs;");
×
208

209
        $maxish = $this->getMaxish();
×
210

211
        foreach ($jobs as $job) {
×
212
            $score = $this->clickability($job['id'], $job['title'], $maxish);
×
213
            $this->dbhm->preExec("UPDATE jobs SET clickability = ? WHERE id = ?;", [
×
214
                $score,
×
215
                $job['id']
×
216
            ]);
×
217
        }
218
    }
219

220
    public function clickability($jobid, $title = NULL, $maxish = NULL) {
221
        $ret = 0;
1✔
222

223
        if (!$title) {
1✔
224
            # Collect the sum of the counts of keywords present in this title.
225
            $jobs = $this->dbhr->preQuery("SELECT title FROM jobs WHERE id = ?;", [
1✔
226
                $jobid
1✔
227
            ]);
1✔
228

229
            foreach ($jobs as $job) {
1✔
230
                $title = $job['title'];
1✔
231
            }
232
        }
233

234
        if ($title) {
1✔
235
            $keywords = Jobs::getKeywords($title);
1✔
236

237
            if (count($keywords)) {
1✔
238
                if (!$this->jobKeywords) {
1✔
239
                    # Cache in this object because it speeds bulk load a lot.
240
                    $this->jobKeywords = [];
1✔
241
                    $jobKeywords = $this->dbhr->preQuery("SELECT * FROM jobs_keywords");
1✔
242

243
                    foreach ($jobKeywords as $j) {
1✔
244
                        $this->jobKeywords[$j['keyword']] = $j['count'];
1✔
245
                    }
246
                }
247

248
                foreach ($keywords as $keyword) {
1✔
249
                    if (array_key_exists($keyword, $this->jobKeywords)) {
1✔
250
                        $ret += $this->jobKeywords[$keyword];
×
251
                    }
252
                }
253
            }
254
        }
255

256
        # Normalise this, so that if we get more clicks overall because we have more site activity we don't
257
        # think it's because the jobs we happen to be clicking are more desirable. Use the 95th percentile to
258
        # get the maxish value (avoiding outliers).
259
        if (!$maxish) {
1✔
260
            $maxish = $this->getMaxish();
1✔
261
        }
262

263
        return $ret / $maxish;
1✔
264
    }
265

266
    public function getMaxish() {
267
        $maxish = 1;
2✔
268

269
        $m = $this->dbhr->preQuery("SELECT count FROM 
2✔
270
(SELECT t.*,  @row_num :=@row_num + 1 AS row_num FROM jobs_keywords t, 
271
    (SELECT @row_num:=0) counter ORDER BY count) 
272
temp WHERE temp.row_num = ROUND (.95* @row_num);");
2✔
273

274
        if (count($m)) {
2✔
275
            $maxish = $m[0]['count'];
2✔
276
        }
277

278
        return $maxish;
2✔
279
    }
280

281
    public static function geocode($addr, $allowPoint, $exact, $bbswlat = 49.959999905, $bbswlng = -7.57216793459, $bbnelat = 58.6350001085, $bbnelng = 1.68153079591) {
282
        // Special cases
283
        if ($addr == 'West Marsh') {
2✔
284
            $addr = 'Grimsby';
×
285
        } else if ($addr == 'Stoney Middleton') {
2✔
286
            $addr .= ', Derbyshire';
×
287
        } else if ($addr == 'Middleton Stoney') {
2✔
288
            $addr .= ', Oxfordshire';
×
289
        } else if ($addr == 'Kirkwall') {
2✔
290
            $addr .= ', Orkney';
×
291
        } else if ($addr == 'City of York') {
2✔
292
            $addr = 'York';
×
293
        } else if ($addr == 'Sutton Central') {
2✔
294
            $addr = 'Sutton, London';
×
295
        } else if ($addr == 'Hampden Park') {
2✔
296
            $addr = 'Hampden Park, Eastbourne';
×
297
        } else if ($addr == 'Guernsey') {
2✔
298
            return [ NULL, NULL, NULL, NULL, NULL, NULL ];
×
299
        }
300

301
        $url = "https://" . GEOCODER . "/api?q=" . urlencode($addr) . "&bbox=$bbswlng%2C$bbswlat%2C$bbnelng%2C$bbnelat";
2✔
302
        $geocode = @file_get_contents($url);
2✔
303
        #error_log("Geocode $addr, allow point $allowPoint, exact $exact, $url");
304
        $swlng = $swlat = $nelng = $nelat = $geom = $area = NULL;
2✔
305

306
        if ($geocode) {
2✔
307
            $results = json_decode($geocode, TRUE);
2✔
308

309
            if (Utils::pres('features', $results) && count($results['features'])) {
2✔
310
                foreach ($results['features'] as $feature) {
2✔
311
                    if (Utils::pres('properties', $feature)) {
2✔
312
                        $nameMatches = Utils::pres('name', $feature['properties']) && strcmp(strtolower($feature['properties']['name']), strtolower($addr)) == 0;
2✔
313

314
                        if (Utils::pres('extent', $feature['properties'])) {
2✔
315
                            if (!$exact || !$nameMatches) {
2✔
316
                                $swlng = floatval($feature['properties']['extent'][0]);
2✔
317
                                $swlat = floatval($feature['properties']['extent'][1]);
2✔
318
                                $nelng = floatval($feature['properties']['extent'][2]);
2✔
319
                                $nelat = floatval($feature['properties']['extent'][3]);
2✔
320
                                $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
2✔
321
                                #error_log("From extent $geom");
322
                            }
323
                            break;
2✔
324
                        } else if ($allowPoint &&
1✔
325
                            (!$exact || $nameMatches) &&
1✔
326
                            Utils::pres('geometry', $feature) &&
1✔
327
                            Utils::pres('coordinates', $feature['geometry'])) {
1✔
328
                            # Invent a small polygon, just so all the geometries have the same dimension
329
                            $lat = floatval($feature['geometry']['coordinates'][1]);
1✔
330
                            $lng = floatval($feature['geometry']['coordinates'][0]);
1✔
331
                            $swlng = $lng - 0.0005;
1✔
332
                            $swlat = $lat - 0.0005;
1✔
333
                            $nelat = $lat + 0.0005;
1✔
334
                            $nelng = $lng + 0.0005;
1✔
335
                            $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
1✔
336
                            #error_log("From point $geom");
337
                            break;
1✔
338
                        }
339
                    }
340
                }
341
            }
342
        }
343

344
        if ($geom) {
2✔
345
            $g = new \geoPHP();
2✔
346
            $poly = $g::load($geom, 'wkt');
2✔
347
            $area = $poly->area();
2✔
348
        }
349

350
        #error_log("Geocode $addr => $geom area $area");
351
        return [ $swlat, $swlng, $nelat, $nelng, $geom, $area ];
2✔
352
    }
353

354
    public function scanToCSV($inputFile, $outputFile, $maxage = 7, $fakeTime = FALSE, $distribute = 0.0005, $perFile = 1000, $format = 'csv') {
355
        $now = $fakeTime ? '2001-01-01 00:00:00' : date("Y-m-d H:i:s", time());
1✔
356
        $out = fopen("$outputFile.tmp", 'w');
1✔
357

358
        # This scans the XML job file provided by WhatJobs, filters out the ones we want, and writes to a
359
        # CSV file.
360
        $options = array(
1✔
361
            "captureDepth" => 3,
1✔
362
            "expectGT" => TRUE
1✔
363
        );
1✔
364

365
        $parser = new Parser\StringWalker($options);
1✔
366
        $stream = new Stream\File($inputFile, 1024);
1✔
367
        $streamer = new XmlStringStreamer($parser, $stream);
1✔
368

369
        $count = 0;
1✔
370
        $new = 0;
1✔
371
        $old = 0;
1✔
372
        $toolow = 0;
1✔
373
        $nogeocode = 0;
1✔
374
        $geocache = [];
1✔
375

376
        $maxish = $this->getMaxish();
1✔
377

378
        while ($node = $streamer->getNode()) {
1✔
379
            $job = simplexml_load_string($node);
1✔
380

381
            if ($format == 1) {
1✔
382
                $location = $job->locations->location->location;
1✔
383
                $timeposted = $job->timePosted;
1✔
384
                $city = $job->locations->location->city;
1✔
385
                $state = $job->locations->location->state;
1✔
386
                $zip = $job->locations->location->zip;
1✔
387
                $country = $job->locations->location->country;
1✔
388
                $companyname = $job->company->name;
1✔
389
                $cpc = $job->custom->CPC;
1✔
390
                $deeplink = $job->urlDeeplink;
1✔
391
                $jobid = $job->id;
1✔
392
                $jobtitle = $job->title;
1✔
393
                $description = $job->description;
1✔
394
                $category = $job->category;
1✔
395
            } else {
396
                $location = $job->location;
1✔
397
                $timeposted = $job->posted_at;
1✔
398
                $city = $job->city;
1✔
399
                $state = $job->state;
1✔
400
                $zip = $job->zip;
1✔
401
                $country = $job->country;
1✔
402
                $companyname = $job->company;
1✔
403
                $cpc = $job->cpc;
1✔
404
                $deeplink = $job->url;
1✔
405
                $jobid = $job->job_reference;
1✔
406
                $jobtitle = $job->title;
1✔
407
                $description = $job->body;
1✔
408
                $category = $job->category;
1✔
409
            }
410

411
            #error_log("Location $location, timeposted $timeposted, city $city, state $state, zip $zip, country $country, companyname $companyname, cpc $cpc, deeplink $deeplink, jobid $jobid, jobtitle $jobtitle, description " . strlen($description) . ", category $category");
412

413
            # Only add new jobs.
414
            $age = (time() - strtotime($timeposted)) / (60 * 60 * 24);
1✔
415

416
            if ($age < $maxage) {
1✔
417
                if (!$jobid) {
1✔
418
                    #error_log("No job id for {$jobtitle}, {$location}");
419
                } else if (floatval($cpc) < Jobs::MINIMUM_CPC) {
1✔
420
                    # Ignore this job - not worth us showing.
421
                    $toolow++;
×
422
                } else {
423
                    $geokey = "{$city},{$state},{$country}";
1✔
424
                    $geom = NULL;
1✔
425

426
                    $wascached = array_key_exists($geokey, $geocache);
1✔
427

428
                    if ($wascached) {
1✔
429
                        # In memory cache is faster than DB queries.  The total number of locations of the order
430
                        # of 20K so this is ok.
431
                        list ($geom, $swlat, $swlng, $nelat, $nelng) = $geocache[$geokey];
×
432
                        #error_log("Got $geokey from geocache");
433
                    } else {
434
                        # See if we already have the address geocoded - would save hitting the geocoder.
435
                        $geo = $this->dbhr->preQuery(
1✔
436
                            "SELECT ST_AsText(ST_Envelope(geometry)) AS geom FROM jobs WHERE city = ? AND state = ? AND country = ? LIMIT 1;",
1✔
437
                            [
1✔
438
                                $city,
1✔
439
                                $state,
1✔
440
                                $country
1✔
441
                            ]
1✔
442
                        );
1✔
443

444
                        $geom = null;
1✔
445
                        #error_log("Found " . count($geo) . " {$city}, {$state}, {$country}");
446

447
                        if (count($geo))
1✔
448
                        {
449
                            $geom = $geo[0]['geom'];
×
450
                            $g = new \geoPHP();
×
451
                            $poly = $g::load($geom, 'wkt');
×
452
                            $bbox = $poly->getBBox();
×
453
                            $swlat = $bbox['miny'];
×
454
                            $swlng = $bbox['minx'];
×
455
                            $nelat = $bbox['maxy'];
×
456
                            $nelng = $bbox['maxx'];
×
457

458
                            #error_log("Got existing location $geom = $swlat,$swlng to $nelat, $nelng");
459
                        }
460
                    }
461

462
                    if (!$geom) {
1✔
463
                        # Try to geocode the address.
464
                        #
465
                        # We have location, city, state, and country.  We can ignore the country.
466
                        $badStates = [ 'not specified', 'united kingdom of great britain and northern ireland', 'united kingdom', 'uk', 'england', 'scotland', 'wales', 'home based' ];
1✔
467
                        if ($state && strlen(trim($state)) && !in_array(strtolower(trim($state)), $badStates)) {
1✔
468
                            # Sometimes the state is a region.  Sometimes it is a much more specific location. Sigh.
469
                            #
470
                            # So first, geocode the state; if we get a specific location we can use that.  If it's a larger
471
                            # location then we can use it as a bounding box.
472
                            #
473
                            # Geocoding gets confused by "Borough of", e.g. "Borough of Swindon".
474
                            $state = str_ireplace($state, 'Borough of ', '');
1✔
475

476
                            list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($job, FALSE, TRUE);
1✔
477

478
                            if ($area && $area < 0.05) {
1✔
479
                                # We have a small 'state', which must be an actual location.  Stop.
480
                                #error_log("Got small area {$state}");
481
                            } else if ($geom) {
1✔
482
                                # We have a large state, which is a plausible region. Use it as a bounding box for the
483
                                # city.
484
                                #error_log("Gecoded {$state} to $geom");
485
                                list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE, $swlat, $swlng, $nelat, $nelng);
×
486

487
                                if (!$geom) {
×
488
                                    #error_log("Failed to geocode {$city} in {$state}");
489
                                }
490
                            }
491
                        }
492
                    }
493

494
                    $badCities = [ 'not specified', 'null', 'home based', 'united kingdom', ', , united kingdom' ];
1✔
495
                    if (!$geom && $city && strlen(trim($city)) && !in_array(strtolower(trim($city)), $badCities)) {
1✔
496
                        # We have not managed anything from the state.  Try just the city.  This will lead to some being
497
                        # wrong, but that's life.
498
                        #error_log("State no use, use city {$city}");
499
                        list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE);
1✔
500

501
                        if ($area > 50) {
1✔
502
                            #error_log("{$city} => $geom is too large at $area");
503
                            $geom = NULL;
×
504
                        }
505
                    }
506

507
                    if ($geom) {
1✔
508
                        #error_log("Geocoded {$city}, {$state} => $geom");
509

510
                        if (!$wascached) {
1✔
511
                            $geocache[$geokey] = [ $geom, $swlat, $swlng, $nelat, $nelng];
1✔
512
                        }
513

514
                        # Jobs tend to cluster, e.g. in cities.  When we are searching we expand a box from our current
515
                        # location until we overlap enough.  The clustering means we may suddenly match thousands of them,
516
                        # which is slow.  So instead of using the job location box as is, randomise to be a small box
517
                        # within the location box.  That way we will encounter some of the jobs sooner and hence have faster queries.
518
                        $newlat = $swlat + (mt_rand() / mt_getrandmax()) * ($nelat - $swlat);
1✔
519
                        $newlng = $swlng + (mt_rand() / mt_getrandmax()) * ($nelng - $swlng);
1✔
520
                        $swlng = $newlng - $distribute;
1✔
521
                        $swlat = $newlat - $distribute;
1✔
522
                        $nelat = $newlat + $distribute;
1✔
523
                        $nelng = $newlng + $distribute;
1✔
524
                        $geom = $distribute ? Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng) : $geom;
1✔
525
                        #error_log("Modified loc to $geom");
526

527
                        # If the job already exists in the old table we want to preserve the id, because people
528
                        # might click on an old id from email.  If we don't find it then we'll use NULL, which will
529
                        # create a new auto-increment id.  We have to go to some effort to get the NULL in there -
530
                        # we want \N, but unquoted, so we put \N and then strip the quotes later.
531
                        #
532
                        # We could speed this up by sending out the job_reference rather than the id.
533
                        $id = "\N";
1✔
534

535
                        $oldids = $this->dbhr->preQuery("SELECT id FROM jobs WHERE job_reference = ?;", [
1✔
536
                            $jobid
1✔
537
                        ]);
1✔
538

539
                        foreach ($oldids as $oldid) {
1✔
540
                            $id = $oldid['id'];
×
541
                        }
542

543
                        try {
544
                            #error_log("Added job $jobid $id $geom");
545
                            $clickability = 0;
1✔
546
                            $title = NULL;
1✔
547

548
                            if ($jobtitle) {
1✔
549
                                $title = str_replace('&#39;', "'", html_entity_decode($jobtitle));
1✔
550
                                # Don't get clickability - we don't currently use it and it slows things down.
551
                                $clickability = 1;
1✔
552
                                #$clickability = $this->clickability(NULL, html_entity_decode($title), $maxish);
553
                            }
554

555
                            $body = NULL;
1✔
556

557
                            if ($description) {
1✔
558
                                # Truncate the body - we only display the body as a snippet to get people to click
559
                                # through.
560
                                #
561
                                # Fix up line endings which cause problems with fputcsv, and other weirdo characters
562
                                # that get mangled en route to us.  This isn't a simple UTF8 problem because the data
563
                                # comes from all over the place and is a bit of a mess.
564
                                $body = html_entity_decode($description);
1✔
565
                                $body = str_replace("\r\n", " ", $body);
1✔
566
                                $body = str_replace("\r", " ", $body);
1✔
567
                                $body = str_replace("\n", " ", $body);
1✔
568
                                $body = str_replace('–', '-', $body);
1✔
569
                                $body = str_replace('Â', '-', $body);
1✔
570
                                $body = substr($body, 0, 256);
1✔
571

572
                                # The truncation might happen to leave a \ at the end of the string, which would then
573
                                # escape the following quote added by fputcsv, and the world would explode in a ball of
574
                                # fire.  Add a space to avoid that.
575
                                $body .= ' ';
1✔
576
                            }
577

578
                            # Sometimes the geocode can end up with line breaks.
579
                            $geom = str_replace(["\n", "\r"], '', $geom);
1✔
580

581
                            # Write the job to CSV, ready for LOAD DATA INFILE later.
582
                            # location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
583
                            # mobile_friendly_apply, category, html_jobs, url, body, cpc, geometry, clickability,
584
                            # bodyhash, seenat
585
                            fputcsv($out, [
1✔
586
                                $id,
1✔
587
                                $location ? html_entity_decode($location) : NULL,
1✔
588
                                $title,
1✔
589
                                $city ? html_entity_decode($city) : NULL,
1✔
590
                                $state ? html_entity_decode($state) : NULL,
1✔
591
                                $zip ? html_entity_decode($zip) : NULL,
1✔
592
                                $country ? html_entity_decode($country) : NULL,
1✔
593
                                NULL,
1✔
594
                                $timeposted ? html_entity_decode($timeposted) : NULL,
1✔
595
                                $jobid ? html_entity_decode($jobid) : NULL,
1✔
596
                                $companyname ? html_entity_decode($companyname) : NULL,
1✔
597
                                NULL,
1✔
598
                                $category ? html_entity_decode($category) : NULL,
1✔
599
                                NULL,
1✔
600
                                $deeplink ? html_entity_decode($deeplink) : NULL,
1✔
601
                                $body,
1✔
602
                                $cpc ? html_entity_decode($cpc) : NULL,
1✔
603
                                $geom,
1✔
604
                                $clickability,
1✔
605
                                md5($body),
1✔
606
                                $now,
1✔
607
                                1
1✔
608
                            ]);
1✔
609

610
                            $new++;
1✔
611
                        } catch (\Exception $e) {
×
612
                            error_log("Failed to add {$jobtitle} $geom " . $e->getMessage());
1✔
613
                        }
614
                    } else {
615
                        #error_log("Couldn't geocode {$city}, {$state}");
616
                        $nogeocode++;
1✔
617
                    }
618
                }
619
            } else {
620
                $old++;
×
621
            }
622

623
            $count++;
1✔
624

625
            if ($count % 1000 === 0) {
1✔
626
                error_log(date("Y-m-d H:i:s", time()) . "...processing $count");
×
627
            }
628
        }
629

630
        fclose($out);
1✔
631

632
        # Now unquote any "\N" into \N.
633
        error_log(date("Y-m-d H:i:s", time()) . "...finished loop, split");
1✔
634
        $data = file_get_contents("$outputFile.tmp");
1✔
635
        $data = str_replace('"\N"', '\N', $data);
1✔
636
        file_put_contents($outputFile, $data);
1✔
637
        error_log(date("Y-m-d H:i:s", time()) . "...written file $outputFile");
1✔
638
    }
639

640
    public function prepareForLoadCSV() {
641
        error_log(date("Y-m-d H:i:s", time()) . "...DROP jobs_new");
1✔
642
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_new;");
1✔
643
        error_log(date("Y-m-d H:i:s", time()) . "...CREATE jobs_new");
1✔
644
        $this->dbhm->preExec("CREATE TABLE jobs_new LIKE jobs;");
1✔
645
        $this->dbhm->preExec("SET GLOBAL local_infile=1;");
1✔
646
    }
647

648
    public function loadCSV($csv) {
649
        # Percona cluster has non-standard behaviour for LOAD DATA INFILE, and commits are performed every 10K rows.
650
        # But even this seems to place too much load on cluster syncing, and can cause lockups.  So we split the CSV
651
        # into 1K rows and load each of them in turn.
652
        error_log(date("Y-m-d H:i:s", time()) . "...Split CSV file");
1✔
653
        system("cd /tmp/; rm feed-split*; split -1000 $csv feed-split");
1✔
654
        error_log(date("Y-m-d H:i:s", time()) . "...load files");
1✔
655
        foreach (glob('/tmp/feed-split*') as $fn) {
1✔
656
            set_time_limit(600);
1✔
657
            $this->dbhm->preExec("LOAD DATA LOCAL INFILE '$fn' INTO TABLE jobs_new
1✔
658
            CHARACTER SET latin1
659
            FIELDS TERMINATED BY ',' 
660
            OPTIONALLY ENCLOSED BY '\"' 
661
            LINES TERMINATED BY '\n'
662
            (id, location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
663
             mobile_friendly_apply, category, html_jobs, url, body, cpc, @GEOM, clickability,
664
             bodyhash, seenat, visible) SET geometry = ST_GeomFromText(@GEOM, " . $this->dbhm->SRID() . ");");
1✔
665
            error_log(date("Y-m-d H:i:s", time()) . "...loaded file $fn");
1✔
666
        }
667
        error_log(date("Y-m-d H:i:s", time()) . "...finished file load");
1✔
668
    }
669

670
    public function deleteSpammyJobs($table) {
671
        # There are some "spammy" jobs which are posted with identical descriptions across the UK.  They feel scuzzy, so
672
        # remove them.
673
        $spamcount = 0;
1✔
674
        error_log(date("Y-m-d H:i:s", time()) . "Count spammy jobs");
1✔
675
        $spams = $this->dbhr->preQuery("SELECT COUNT(*) as count, title, bodyhash FROM $table GROUP BY bodyhash HAVING count > 50 AND bodyhash IS NOT NULL;");
1✔
676

677
        error_log(date("Y-m-d H:i:s", time()) . "Delete spammy jobs");
1✔
678

679
        foreach ($spams as $spam) {
1✔
680
            error_log("Delete spammy job {$spam['title']} * {$spam['count']}");
×
681
            set_time_limit(600);
×
682
            $spamcount += $spam['count'];
×
683

684
            do {
685
                $this->dbhm->preExec("DELETE FROM $table WHERE bodyhash = ? LIMIT 1;", [
×
686
                    $spam['bodyhash']
×
687
                ]);
×
688
            } while ($this->dbhm->rowsAffected());
×
689
        }
690
    }
691

692
    public function swapTables() {
693
        # We want to swap the jobs_new table with the jobs table, atomically.
694
        error_log(date("Y-m-d H:i:s", time()) . " Swap tables...");
1✔
695
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
696
        $this->dbhm->preExec("RENAME TABLE jobs TO jobs_old, jobs_new TO jobs;");
1✔
697
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
698
        error_log(date("Y-m-d H:i:s", time()) . "...tables swapped...");
1✔
699
    }
700
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc