• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Freegle / iznik-server / #2585

02 Feb 2026 12:27PM UTC coverage: 85.462% (-0.1%) from 85.583%
#2585

push

edwh
Merge remote-tracking branch 'origin/master' into feature/incoming-email-migration

25583 of 29935 relevant lines covered (85.46%)

30.5 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.38
/include/misc/Jobs.php
1
<?php
2
namespace Freegle\Iznik;
3

4
use Prewk\XmlStringStreamer;
5
use Prewk\XmlStringStreamer\Stream;
6
use Prewk\XmlStringStreamer\Parser;
7

8
require_once(IZNIK_BASE . '/lib/geoPHP/geoPHP.inc');
4✔
9

10
class Jobs {
11
    /** @public  $dbhr LoggedPDO */
12
    public $dbhr;
13
    /** @public  $dbhm LoggedPDO */
14
    public $dbhm;
15

16
    private $jobKeywords = NULL;
17

18
    const MINIMUM_CPC = 0.10;
19

20
    function __construct(LoggedPDO $dbhr, LoggedPDO $dbhm) {
21
        $this->dbhr = $dbhr;
6✔
22
        $this->dbhm = $dbhm;
6✔
23
    }
24

25
    public function query($lat, $lng, $limit = 50, $category = NULL) {
26
        # To make efficient use of the spatial index we construct a box around our lat/lng, and search for jobs
27
        # where the geometry overlaps it.  We keep expanding our box until we find enough.
28
        #
29
        # We used to double the ambit each time, but that led to long queries, probably because we would suddenly
30
        # include a couple of cities or something.
31
        $step = 0.02;
4✔
32
        $ambit = $step;
4✔
33

34
        $ret = [];
4✔
35
        $got = [];
4✔
36
        $gotbody = [];
4✔
37
        $gottitle = [];
4✔
38
        $passes = 0;
4✔
39

40
        do {
41
            $swlat = $lat - $ambit;
4✔
42
            $nelat = $lat + $ambit;
4✔
43
            $swlng = $lng - $ambit;
4✔
44
            $nelng = $lng + $ambit;
4✔
45

46
            $poly = "POLYGON(($swlng $swlat, $swlng $nelat, $nelng $nelat, $nelng $swlat, $swlng $swlat))";
4✔
47
            $categoryq = $category ? (" AND category = " . $this->dbhr->quote($category)) : '';
4✔
48

49
            # We use ST_Within because that takes advantage of the spatial index, whereas ST_Intersects does not.
50
            $alreadyq = '';
4✔
51

52
            if (count($got)) {
4✔
53
                $alreadyq = " AND jobs.id NOT IN (" . implode(',', array_keys($got)) . ") AND jobs.title NOT IN (";
2✔
54

55
                $first = TRUE;
2✔
56
                foreach ($gottitle as $title) {
2✔
57
                    if (!$first) {
2✔
58
                        $alreadyq .= ", ";
×
59
                    }
60

61
                    $alreadyq .= $this->dbhr->quote($title);
2✔
62
                    $first = FALSE;
2✔
63
                }
64

65
                $alreadyq .= ") ";
2✔
66
            }
67

68
            $sql = "SELECT $ambit AS ambit, 
4✔
69
       ST_Distance(geometry, ST_GeomFromText('POINT($lng $lat)', {$this->dbhr->SRID()})) AS dist,
4✔
70
       CASE WHEN ST_Dimension(geometry) < 2 THEN 0 ELSE ST_Area(geometry) END AS area,
71
       jobs.id, jobs.url, jobs.title, jobs.location, jobs.body, jobs.job_reference, jobs.cpc, jobs.clickability, jobs.canonical_title
72
        FROM `jobs`
73
        WHERE ST_Within(geometry, ST_GeomFromText('$poly', {$this->dbhr->SRID()})) 
4✔
74
            AND (ST_Dimension(geometry) < 2 OR ST_Area(geometry) / ST_Area(ST_GeomFromText('$poly', {$this->dbhr->SRID()})) < 2)
4✔
75
            AND cpc >= " . Jobs::MINIMUM_CPC . "
4✔
76
            AND visible = 1
77
            $alreadyq
4✔
78
            $categoryq
4✔
79
        ORDER BY cpc DESC, dist ASC, posted_at DESC LIMIT $limit;";
4✔
80
            $jobs = $this->dbhr->preQuery($sql);
4✔
81
            #error_log($sql . " found " . count($jobs));
82
            $passes++;
4✔
83

84
            foreach ($jobs as $job) {
4✔
85
                $got[$job['id']] = TRUE;
2✔
86
                $gotbody[$job['body']] = $job['body'];
2✔
87
                $gottitle[$job['title']] = $job['title'];
2✔
88
                $job['passes'] = $passes;
2✔
89

90
                # We have clickability, which is our estimate of how likely people are to click on a job based on
91
                # past clicks.  We also have the CPC, which is how much we expect to earn from a click.
92
                # Use these to order the jobs by our expected income.
93
                $cpc = max($job['cpc'], 0.0001);
2✔
94
                $job['expectation'] = $cpc * $job['clickability'];
2✔
95

96
                $ret[] = $job;
2✔
97
            }
98

99
            $ambit += $step;
4✔
100
        } while (count($ret) < $limit && $ambit < 1);
4✔
101

102
        usort($ret, function($a, $b) {
4✔
103
            # Take care - must return integer, not float, otherwise the sort doesn't work.
104
            return ceil($b['cpc'] - $a['cpc']);
×
105
        });
4✔
106

107
        $ret = array_slice($ret, 0, $limit);
4✔
108

109
        # Look up cached AI images using the pre-computed canonical_title column.
110
        $canonicalTitles = array_unique(array_filter(array_column($ret, 'canonical_title')));
4✔
111

112
        $imageCache = [];
4✔
113

114
        if (count($canonicalTitles)) {
4✔
115
            $placeholders = implode(',', array_fill(0, count($canonicalTitles), '?'));
×
116
            $images = $this->dbhr->preQuery(
×
117
                "SELECT name, externaluid FROM ai_images WHERE name IN ($placeholders)",
×
118
                array_values($canonicalTitles)
×
119
            );
×
120

121
            foreach ($images as $img) {
×
122
                $imageCache[$img['name']] = $img['externaluid'];
×
123
            }
124
        }
125

126
        # Add image URL to each job
127
        foreach ($ret as &$job) {
4✔
128
            $canonical = $job['canonical_title'] ?? NULL;
2✔
129
            if ($canonical && isset($imageCache[$canonical])) {
2✔
130
                $uid = $imageCache[$canonical];
×
131
                $p = strrpos($uid, 'freegletusd-');
×
132
                if ($p !== FALSE) {
×
133
                    $job['image'] = IMAGE_DELIVERY . "?url=" . TUS_UPLOADER . "/" . substr($uid, $p + strlen('freegletusd-')) . "/";
×
134
                }
135
            }
136
        }
137

138
        return $ret;
4✔
139
    }
140

141
    public function get($id) {
142
        $jobs = $this->dbhr->preQuery("SELECT * FROM jobs WHERE id = ?", [
1✔
143
            $id
1✔
144
        ]);
1✔
145

146
        return $jobs;
1✔
147
    }
148

149
    public static function getKeywords($str) {
150
        $initial = explode(' ',$str);
11✔
151

152
        # Remove some stuff.
153
        $arr = [];
11✔
154

155
        foreach ($initial as $i) {
11✔
156
            $w = preg_replace("/[^A-Za-z]/", '', $i);
11✔
157

158
            if (strlen($w) > 2) {
11✔
159
                $arr[] = $w;
10✔
160
            }
161
        }
162

163
        $result = [];
11✔
164

165
        for($i=0; $i < count($arr)-1; $i++) {
11✔
166
            $result[] =  strtolower($arr[$i]) . ' ' . strtolower($arr[$i+1]);
8✔
167
        }
168

169
        return $result;
11✔
170
    }
171

172
    public function recordClick($jobid, $link, $userid) {
173
        # Use ignore because we can get clicks for jobs we have purged.
174
        $this->dbhm->preExec("INSERT IGNORE INTO logs_jobs (userid, jobid, link) VALUES (?, ?, ?);", [
1✔
175
            $userid,
1✔
176
            $jobid,
1✔
177
            $link
1✔
178
        ]);
1✔
179
    }
180

181
    public function analyseClickability() {
182
        # This looks at the jobs we've clicked on, extracts keywords, and uses that to build a measure of how
183
        # clickable a job is.  Obviously this is distorted by the order in which we display jobs, but it
184
        # still gives a reasonable idea of the keywords which would generate the most clicks.
185
        #
186
        # Prune old job logs - old data not worth analysing.
187
        $mysqltime = date("Y-m-d H:i:s", strtotime("midnight 31 days ago"));
×
188
        $this->dbhm->preExec("DELETE FROM logs_jobs WHERE timestamp < ?", [
×
189
            $mysqltime
×
190
        ]);
×
191

192
        # Find any jobs which have the link but not the id.
193
        $logs = $this->dbhr->preQuery("SELECT jobs.id AS jobid, logs_jobs.id FROM logs_jobs 
×
194
    INNER JOIN jobs ON jobs.url = logs_jobs.link 
195
    WHERE jobid IS NULL AND link IS NOT NULL
196
    ORDER BY id DESC;");
×
197
        error_log("Logs to fix " . count($logs));
×
198
        $count = 0;
×
199

200
        foreach ($logs as $log) {
×
201
            $this->dbhm->preExec("UPDATE logs_jobs SET jobid = ? WHERE id = ?;", [
×
202
                $log['jobid'],
×
203
                $log['id']
×
204
            ]);
×
205

206
            $count++;
×
207

208
            if ($count % 100 == 0) {
×
209
                error_log("...$count / " . count($logs));
×
210
            }
211
        }
212

213
        # Now process the clicked jobs to extract keywords.  Use DISTINCT as some people click obsessively on the same job.
214
        $jobs = $this->dbhr->preQuery("SELECT DISTINCT jobs.* FROM logs_jobs INNER JOIN jobs ON logs_jobs.jobid = jobs.id");
×
215
        $this->dbhm->preExec("TRUNCATE TABLE jobs_keywords");
×
216

217
        error_log("Process " . count($jobs));
×
218

219
        foreach ($jobs as $job) {
×
220
            $keywords = Jobs::getKeywords($job['title']);
×
221

222
            if (count($keywords)) {
×
223
                #error_log("{$job['title']} => " . json_encode($keywords));
224
                foreach ($keywords as $k) {
×
225
                    $this->dbhm->preExec("INSERT INTO jobs_keywords (keyword, count) VALUES (?, 1) ON DUPLICATE KEY UPDATE count = count + 1;", [
×
226
                        $k
×
227
                    ]);
×
228
                }
229
            }
230
        }
231
    }
232

233
    public function updateClickability() {
234
        # This updates the list of jobs with a clickability measure, based on the keyword analyse we have done on
235
        # previous clicks.
236
        $jobs = $this->dbhr->preQuery("SELECT id, title FROM jobs;");
×
237

238
        $maxish = $this->getMaxish();
×
239

240
        foreach ($jobs as $job) {
×
241
            $score = $this->clickability($job['id'], $job['title'], $maxish);
×
242
            $this->dbhm->preExec("UPDATE jobs SET clickability = ? WHERE id = ?;", [
×
243
                $score,
×
244
                $job['id']
×
245
            ]);
×
246
        }
247
    }
248

249
    public function clickability($jobid, $title = NULL, $maxish = NULL) {
250
        $ret = 0;
1✔
251

252
        if (!$title) {
1✔
253
            # Collect the sum of the counts of keywords present in this title.
254
            $jobs = $this->dbhr->preQuery("SELECT title FROM jobs WHERE id = ?;", [
1✔
255
                $jobid
1✔
256
            ]);
1✔
257

258
            foreach ($jobs as $job) {
1✔
259
                $title = $job['title'];
1✔
260
            }
261
        }
262

263
        if ($title) {
1✔
264
            $keywords = Jobs::getKeywords($title);
1✔
265

266
            if (count($keywords)) {
1✔
267
                if (!$this->jobKeywords) {
1✔
268
                    # Cache in this object because it speeds bulk load a lot.
269
                    $this->jobKeywords = [];
1✔
270
                    $jobKeywords = $this->dbhr->preQuery("SELECT * FROM jobs_keywords");
1✔
271

272
                    foreach ($jobKeywords as $j) {
1✔
273
                        $this->jobKeywords[$j['keyword']] = $j['count'];
1✔
274
                    }
275
                }
276

277
                foreach ($keywords as $keyword) {
1✔
278
                    if (array_key_exists($keyword, $this->jobKeywords)) {
1✔
279
                        $ret += $this->jobKeywords[$keyword];
×
280
                    }
281
                }
282
            }
283
        }
284

285
        # Normalise this, so that if we get more clicks overall because we have more site activity we don't
286
        # think it's because the jobs we happen to be clicking are more desirable. Use the 95th percentile to
287
        # get the maxish value (avoiding outliers).
288
        if (!$maxish) {
1✔
289
            $maxish = $this->getMaxish();
1✔
290
        }
291

292
        return $ret / $maxish;
1✔
293
    }
294

295
    public function getMaxish() {
296
        $maxish = 1;
2✔
297

298
        $m = $this->dbhr->preQuery("SELECT count FROM 
2✔
299
(SELECT t.*,  @row_num :=@row_num + 1 AS row_num FROM jobs_keywords t, 
300
    (SELECT @row_num:=0) counter ORDER BY count) 
301
temp WHERE temp.row_num = ROUND (.95* @row_num);");
2✔
302

303
        if (count($m)) {
2✔
304
            $maxish = $m[0]['count'];
2✔
305
        }
306

307
        return $maxish;
2✔
308
    }
309

310
    public static function geocode($addr, $allowPoint, $exact, $bbswlat = 49.959999905, $bbswlng = -7.57216793459, $bbnelat = 58.6350001085, $bbnelng = 1.68153079591) {
311
        // Special cases
312
        if ($addr == 'West Marsh') {
2✔
313
            $addr = 'Grimsby';
×
314
        } else if ($addr == 'Stoney Middleton') {
2✔
315
            $addr .= ', Derbyshire';
×
316
        } else if ($addr == 'Middleton Stoney') {
2✔
317
            $addr .= ', Oxfordshire';
×
318
        } else if ($addr == 'Kirkwall') {
2✔
319
            $addr .= ', Orkney';
×
320
        } else if ($addr == 'City of York') {
2✔
321
            $addr = 'York';
×
322
        } else if ($addr == 'Sutton Central') {
2✔
323
            $addr = 'Sutton, London';
×
324
        } else if ($addr == 'Hampden Park') {
2✔
325
            $addr = 'Hampden Park, Eastbourne';
×
326
        } else if ($addr == 'Guernsey') {
2✔
327
            return [ NULL, NULL, NULL, NULL, NULL, NULL ];
×
328
        }
329

330
        $url = "https://" . GEOCODER . "/api?q=" . urlencode($addr) . "&bbox=$bbswlng%2C$bbswlat%2C$bbnelng%2C$bbnelat";
2✔
331
        $geocode = @file_get_contents($url);
2✔
332
        #error_log("Geocode $addr, allow point $allowPoint, exact $exact, $url");
333
        $swlng = $swlat = $nelng = $nelat = $geom = $area = NULL;
2✔
334

335
        if ($geocode) {
2✔
336
            $results = json_decode($geocode, TRUE);
2✔
337

338
            if (Utils::pres('features', $results) && count($results['features'])) {
2✔
339
                foreach ($results['features'] as $feature) {
2✔
340
                    if (Utils::pres('properties', $feature)) {
2✔
341
                        $nameMatches = Utils::pres('name', $feature['properties']) && strcmp(strtolower($feature['properties']['name']), strtolower($addr)) == 0;
2✔
342

343
                        if (Utils::pres('extent', $feature['properties'])) {
2✔
344
                            if (!$exact || !$nameMatches) {
2✔
345
                                $swlng = floatval($feature['properties']['extent'][0]);
2✔
346
                                $swlat = floatval($feature['properties']['extent'][1]);
2✔
347
                                $nelng = floatval($feature['properties']['extent'][2]);
2✔
348
                                $nelat = floatval($feature['properties']['extent'][3]);
2✔
349
                                $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
2✔
350
                                #error_log("From extent $geom");
351
                            }
352
                            break;
2✔
353
                        } else if ($allowPoint &&
1✔
354
                            (!$exact || $nameMatches) &&
1✔
355
                            Utils::pres('geometry', $feature) &&
1✔
356
                            Utils::pres('coordinates', $feature['geometry'])) {
1✔
357
                            # Invent a small polygon, just so all the geometries have the same dimension
358
                            $lat = floatval($feature['geometry']['coordinates'][1]);
1✔
359
                            $lng = floatval($feature['geometry']['coordinates'][0]);
1✔
360
                            $swlng = $lng - 0.0005;
1✔
361
                            $swlat = $lat - 0.0005;
1✔
362
                            $nelat = $lat + 0.0005;
1✔
363
                            $nelng = $lng + 0.0005;
1✔
364
                            $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
1✔
365
                            #error_log("From point $geom");
366
                            break;
1✔
367
                        }
368
                    }
369
                }
370
            }
371
        }
372

373
        if ($geom) {
2✔
374
            $g = new \geoPHP();
2✔
375
            $poly = $g::load($geom, 'wkt');
2✔
376
            $area = $poly->area();
2✔
377
        }
378

379
        #error_log("Geocode $addr => $geom area $area");
380
        return [ $swlat, $swlng, $nelat, $nelng, $geom, $area ];
2✔
381
    }
382

383
    public function scanToCSV($inputFile, $outputFile, $maxage = 7, $fakeTime = FALSE, $distribute = 0.0005, $perFile = 1000, $format = 'csv') {
384
        $now = $fakeTime ? '2001-01-01 00:00:00' : date("Y-m-d H:i:s", time());
1✔
385
        $out = fopen("$outputFile.tmp", 'w');
1✔
386

387
        # This scans the XML job file provided by WhatJobs, filters out the ones we want, and writes to a
388
        # CSV file.
389
        $options = array(
1✔
390
            "captureDepth" => 3,
1✔
391
            "expectGT" => TRUE
1✔
392
        );
1✔
393

394
        $parser = new Parser\StringWalker($options);
1✔
395
        $stream = new Stream\File($inputFile, 1024);
1✔
396
        $streamer = new XmlStringStreamer($parser, $stream);
1✔
397

398
        $count = 0;
1✔
399
        $new = 0;
1✔
400
        $old = 0;
1✔
401
        $toolow = 0;
1✔
402
        $nogeocode = 0;
1✔
403
        $geocache = [];
1✔
404

405
        $maxish = $this->getMaxish();
1✔
406

407
        while ($node = $streamer->getNode()) {
1✔
408
            $job = simplexml_load_string($node);
1✔
409

410
            if ($format == 1) {
1✔
411
                $location = $job->locations->location->location;
1✔
412
                $timeposted = $job->timePosted;
1✔
413
                $city = $job->locations->location->city;
1✔
414
                $state = $job->locations->location->state;
1✔
415
                $zip = $job->locations->location->zip;
1✔
416
                $country = $job->locations->location->country;
1✔
417
                $companyname = $job->company->name;
1✔
418
                $cpc = $job->custom->CPC;
1✔
419
                $deeplink = $job->urlDeeplink;
1✔
420
                $jobid = $job->id;
1✔
421
                $jobtitle = $job->title;
1✔
422
                $description = $job->description;
1✔
423
                $category = $job->category;
1✔
424
            } else {
425
                $location = $job->location;
1✔
426
                $timeposted = $job->posted_at;
1✔
427
                $city = $job->city;
1✔
428
                $state = $job->state;
1✔
429
                $zip = $job->zip;
1✔
430
                $country = $job->country;
1✔
431
                $companyname = $job->company;
1✔
432
                $cpc = $job->cpc;
1✔
433
                $deeplink = $job->url;
1✔
434
                $jobid = $job->job_reference;
1✔
435
                $jobtitle = $job->title;
1✔
436
                $description = $job->body;
1✔
437
                $category = $job->category;
1✔
438
            }
439

440
            #error_log("Location $location, timeposted $timeposted, city $city, state $state, zip $zip, country $country, companyname $companyname, cpc $cpc, deeplink $deeplink, jobid $jobid, jobtitle $jobtitle, description " . strlen($description) . ", category $category");
441

442
            # Only add new jobs.
443
            $age = (time() - strtotime($timeposted)) / (60 * 60 * 24);
1✔
444

445
            if ($age < $maxage) {
1✔
446
                if (!$jobid) {
1✔
447
                    #error_log("No job id for {$jobtitle}, {$location}");
448
                } else if (floatval($cpc) < Jobs::MINIMUM_CPC) {
1✔
449
                    # Ignore this job - not worth us showing.
450
                    $toolow++;
×
451
                } else {
452
                    $geokey = "{$city},{$state},{$country}";
1✔
453
                    $geom = NULL;
1✔
454

455
                    $wascached = array_key_exists($geokey, $geocache);
1✔
456

457
                    if ($wascached) {
1✔
458
                        # In memory cache is faster than DB queries.  The total number of locations of the order
459
                        # of 20K so this is ok.
460
                        list ($geom, $swlat, $swlng, $nelat, $nelng) = $geocache[$geokey];
×
461
                        #error_log("Got $geokey from geocache");
462
                    } else {
463
                        # See if we already have the address geocoded - would save hitting the geocoder.
464
                        $geo = $this->dbhr->preQuery(
1✔
465
                            "SELECT ST_AsText(ST_Envelope(geometry)) AS geom FROM jobs WHERE city = ? AND state = ? AND country = ? LIMIT 1;",
1✔
466
                            [
1✔
467
                                $city,
1✔
468
                                $state,
1✔
469
                                $country
1✔
470
                            ]
1✔
471
                        );
1✔
472

473
                        $geom = null;
1✔
474
                        #error_log("Found " . count($geo) . " {$city}, {$state}, {$country}");
475

476
                        if (count($geo))
1✔
477
                        {
478
                            $geom = $geo[0]['geom'];
×
479
                            $g = new \geoPHP();
×
480
                            $poly = $g::load($geom, 'wkt');
×
481
                            $bbox = $poly->getBBox();
×
482
                            $swlat = $bbox['miny'];
×
483
                            $swlng = $bbox['minx'];
×
484
                            $nelat = $bbox['maxy'];
×
485
                            $nelng = $bbox['maxx'];
×
486

487
                            #error_log("Got existing location $geom = $swlat,$swlng to $nelat, $nelng");
488
                        }
489
                    }
490

491
                    if (!$geom) {
1✔
492
                        # Try to geocode the address.
493
                        #
494
                        # We have location, city, state, and country.  We can ignore the country.
495
                        $badStates = [ 'not specified', 'united kingdom of great britain and northern ireland', 'united kingdom', 'uk', 'england', 'scotland', 'wales', 'home based' ];
1✔
496
                        if ($state && strlen(trim($state)) && !in_array(strtolower(trim($state)), $badStates)) {
1✔
497
                            # Sometimes the state is a region.  Sometimes it is a much more specific location. Sigh.
498
                            #
499
                            # So first, geocode the state; if we get a specific location we can use that.  If it's a larger
500
                            # location then we can use it as a bounding box.
501
                            #
502
                            # Geocoding gets confused by "Borough of", e.g. "Borough of Swindon".
503
                            $state = str_ireplace($state, 'Borough of ', '');
1✔
504

505
                            list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($job, FALSE, TRUE);
1✔
506

507
                            if ($area && $area < 0.05) {
1✔
508
                                # We have a small 'state', which must be an actual location.  Stop.
509
                                #error_log("Got small area {$state}");
510
                            } else if ($geom) {
1✔
511
                                # We have a large state, which is a plausible region. Use it as a bounding box for the
512
                                # city.
513
                                #error_log("Gecoded {$state} to $geom");
514
                                list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE, $swlat, $swlng, $nelat, $nelng);
×
515

516
                                if (!$geom) {
×
517
                                    #error_log("Failed to geocode {$city} in {$state}");
518
                                }
519
                            }
520
                        }
521
                    }
522

523
                    $badCities = [ 'not specified', 'null', 'home based', 'united kingdom', ', , united kingdom' ];
1✔
524
                    if (!$geom && $city && strlen(trim($city)) && !in_array(strtolower(trim($city)), $badCities)) {
1✔
525
                        # We have not managed anything from the state.  Try just the city.  This will lead to some being
526
                        # wrong, but that's life.
527
                        #error_log("State no use, use city {$city}");
528
                        list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE);
1✔
529

530
                        if ($area > 50) {
1✔
531
                            #error_log("{$city} => $geom is too large at $area");
532
                            $geom = NULL;
×
533
                        }
534
                    }
535

536
                    if ($geom) {
1✔
537
                        #error_log("Geocoded {$city}, {$state} => $geom");
538

539
                        if (!$wascached) {
1✔
540
                            $geocache[$geokey] = [ $geom, $swlat, $swlng, $nelat, $nelng];
1✔
541
                        }
542

543
                        # Jobs tend to cluster, e.g. in cities.  When we are searching we expand a box from our current
544
                        # location until we overlap enough.  The clustering means we may suddenly match thousands of them,
545
                        # which is slow.  So instead of using the job location box as is, randomise to be a small box
546
                        # within the location box.  That way we will encounter some of the jobs sooner and hence have faster queries.
547
                        $newlat = $swlat + (mt_rand() / mt_getrandmax()) * ($nelat - $swlat);
1✔
548
                        $newlng = $swlng + (mt_rand() / mt_getrandmax()) * ($nelng - $swlng);
1✔
549
                        $swlng = $newlng - $distribute;
1✔
550
                        $swlat = $newlat - $distribute;
1✔
551
                        $nelat = $newlat + $distribute;
1✔
552
                        $nelng = $newlng + $distribute;
1✔
553
                        $geom = $distribute ? Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng) : $geom;
1✔
554
                        #error_log("Modified loc to $geom");
555

556
                        # If the job already exists in the old table we want to preserve the id, because people
557
                        # might click on an old id from email.  If we don't find it then we'll use NULL, which will
558
                        # create a new auto-increment id.  We have to go to some effort to get the NULL in there -
559
                        # we want \N, but unquoted, so we put \N and then strip the quotes later.
560
                        #
561
                        # We could speed this up by sending out the job_reference rather than the id.
562
                        $id = "\N";
1✔
563

564
                        $oldids = $this->dbhr->preQuery("SELECT id FROM jobs WHERE job_reference = ?;", [
1✔
565
                            $jobid
1✔
566
                        ]);
1✔
567

568
                        foreach ($oldids as $oldid) {
1✔
569
                            $id = $oldid['id'];
×
570
                        }
571

572
                        try {
573
                            #error_log("Added job $jobid $id $geom");
574
                            $clickability = 0;
1✔
575
                            $title = NULL;
1✔
576

577
                            if ($jobtitle) {
1✔
578
                                $title = str_replace('&#39;', "'", html_entity_decode($jobtitle));
1✔
579
                                # Don't get clickability - we don't currently use it and it slows things down.
580
                                $clickability = 1;
1✔
581
                                #$clickability = $this->clickability(NULL, html_entity_decode($title), $maxish);
582
                            }
583

584
                            $body = NULL;
1✔
585

586
                            if ($description) {
1✔
587
                                # Truncate the body - we only display the body as a snippet to get people to click
588
                                # through.
589
                                #
590
                                # Fix up line endings which cause problems with fputcsv, and other weirdo characters
591
                                # that get mangled en route to us.  This isn't a simple UTF8 problem because the data
592
                                # comes from all over the place and is a bit of a mess.
593
                                $body = html_entity_decode($description);
1✔
594
                                $body = str_replace("\r\n", " ", $body);
1✔
595
                                $body = str_replace("\r", " ", $body);
1✔
596
                                $body = str_replace("\n", " ", $body);
1✔
597
                                $body = str_replace('–', '-', $body);
1✔
598
                                $body = str_replace('Â', '-', $body);
1✔
599
                                $body = substr($body, 0, 256);
1✔
600

601
                                # The truncation might happen to leave a \ at the end of the string, which would then
602
                                # escape the following quote added by fputcsv, and the world would explode in a ball of
603
                                # fire.  Add a space to avoid that.
604
                                $body .= ' ';
1✔
605
                            }
606

607
                            # Sometimes the geocode can end up with line breaks.
608
                            $geom = str_replace(["\n", "\r"], '', $geom);
1✔
609

610
                            # Write the job to CSV, ready for LOAD DATA INFILE later.
611
                            # location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
612
                            # mobile_friendly_apply, category, html_jobs, url, body, cpc, geometry, clickability,
613
                            # bodyhash, seenat
614
                            # Map title to canonical form for AI image lookup.
615
                            $canonicalTitle = Pollinations::canonicalJobTitle($title);
1✔
616

617
                            fputcsv($out, [
1✔
618
                                $id,
1✔
619
                                $location ? html_entity_decode($location) : NULL,
1✔
620
                                $title,
1✔
621
                                $city ? html_entity_decode($city) : NULL,
1✔
622
                                $state ? html_entity_decode($state) : NULL,
1✔
623
                                $zip ? html_entity_decode($zip) : NULL,
1✔
624
                                $country ? html_entity_decode($country) : NULL,
1✔
625
                                NULL,
1✔
626
                                $timeposted ? html_entity_decode($timeposted) : NULL,
1✔
627
                                $jobid ? html_entity_decode($jobid) : NULL,
1✔
628
                                $companyname ? html_entity_decode($companyname) : NULL,
1✔
629
                                NULL,
1✔
630
                                $category ? html_entity_decode($category) : NULL,
1✔
631
                                NULL,
1✔
632
                                $deeplink ? html_entity_decode($deeplink) : NULL,
1✔
633
                                $body,
1✔
634
                                $cpc ? html_entity_decode($cpc) : NULL,
1✔
635
                                $geom,
1✔
636
                                $clickability,
1✔
637
                                md5($body),
1✔
638
                                $now,
1✔
639
                                1,
1✔
640
                                $canonicalTitle
1✔
641
                            ]);
1✔
642

643
                            $new++;
1✔
644
                        } catch (\Exception $e) {
×
645
                            error_log("Failed to add {$jobtitle} $geom " . $e->getMessage());
1✔
646
                        }
647
                    } else {
648
                        #error_log("Couldn't geocode {$city}, {$state}");
649
                        $nogeocode++;
1✔
650
                    }
651
                }
652
            } else {
653
                $old++;
×
654
            }
655

656
            $count++;
1✔
657

658
            if ($count % 1000 === 0) {
1✔
659
                error_log(date("Y-m-d H:i:s", time()) . "...processing $count");
×
660
            }
661
        }
662

663
        fclose($out);
1✔
664

665
        # Now unquote any "\N" into \N.
666
        error_log(date("Y-m-d H:i:s", time()) . "...finished loop, split");
1✔
667
        $data = file_get_contents("$outputFile.tmp");
1✔
668
        $data = str_replace('"\N"', '\N', $data);
1✔
669
        file_put_contents($outputFile, $data);
1✔
670
        error_log(date("Y-m-d H:i:s", time()) . "...written file $outputFile");
1✔
671
    }
672

673
    public function prepareForLoadCSV() {
674
        error_log(date("Y-m-d H:i:s", time()) . "...DROP jobs_new");
1✔
675
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_new;");
1✔
676
        error_log(date("Y-m-d H:i:s", time()) . "...CREATE jobs_new");
1✔
677
        $this->dbhm->preExec("CREATE TABLE jobs_new LIKE jobs;");
1✔
678
        $this->dbhm->preExec("SET GLOBAL local_infile=1;");
1✔
679
    }
680

681
    public function loadCSV($csv) {
682
        # Percona cluster has non-standard behaviour for LOAD DATA INFILE, and commits are performed every 10K rows.
683
        # But even this seems to place too much load on cluster syncing, and can cause lockups.  So we split the CSV
684
        # into 1K rows and load each of them in turn.
685
        error_log(date("Y-m-d H:i:s", time()) . "...Split CSV file");
1✔
686
        system("cd /tmp/; rm feed-split*; split -1000 $csv feed-split");
1✔
687
        error_log(date("Y-m-d H:i:s", time()) . "...load files");
1✔
688
        foreach (glob('/tmp/feed-split*') as $fn) {
1✔
689
            set_time_limit(600);
1✔
690
            $this->dbhm->preExec("LOAD DATA LOCAL INFILE '$fn' INTO TABLE jobs_new
1✔
691
            CHARACTER SET latin1
692
            FIELDS TERMINATED BY ',' 
693
            OPTIONALLY ENCLOSED BY '\"' 
694
            LINES TERMINATED BY '\n'
695
            (id, location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
696
             mobile_friendly_apply, category, html_jobs, url, body, cpc, @GEOM, clickability,
697
             bodyhash, seenat, visible, canonical_title) SET geometry = ST_GeomFromText(@GEOM, " . $this->dbhm->SRID() . ");");
1✔
698
            error_log(date("Y-m-d H:i:s", time()) . "...loaded file $fn");
1✔
699
        }
700
        error_log(date("Y-m-d H:i:s", time()) . "...finished file load");
1✔
701
    }
702

703
    public function deleteSpammyJobs($table) {
704
        # There are some "spammy" jobs which are posted with identical descriptions across the UK.  They feel scuzzy, so
705
        # remove them.
706
        $spamcount = 0;
1✔
707
        error_log(date("Y-m-d H:i:s", time()) . "Count spammy jobs");
1✔
708
        $spams = $this->dbhr->preQuery("SELECT COUNT(*) as count, title, bodyhash FROM $table GROUP BY bodyhash HAVING count > 50 AND bodyhash IS NOT NULL;");
1✔
709

710
        error_log(date("Y-m-d H:i:s", time()) . "Delete spammy jobs");
1✔
711

712
        foreach ($spams as $spam) {
1✔
713
            error_log("Delete spammy job {$spam['title']} * {$spam['count']}");
×
714
            set_time_limit(600);
×
715
            $spamcount += $spam['count'];
×
716

717
            do {
718
                $this->dbhm->preExec("DELETE FROM $table WHERE bodyhash = ? LIMIT 1;", [
×
719
                    $spam['bodyhash']
×
720
                ]);
×
721
            } while ($this->dbhm->rowsAffected());
×
722
        }
723
    }
724

725
    public function swapTables() {
726
        # We want to swap the jobs_new table with the jobs table, atomically.
727
        error_log(date("Y-m-d H:i:s", time()) . " Swap tables...");
1✔
728
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
729
        $this->dbhm->preExec("RENAME TABLE jobs TO jobs_old, jobs_new TO jobs;");
1✔
730
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
731
        error_log(date("Y-m-d H:i:s", time()) . "...tables swapped...");
1✔
732
    }
733
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc