• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Freegle / iznik-server / #2506

09 Dec 2025 09:26PM UTC coverage: 90.265% (+4.5%) from 85.773%
#2506

push

php-coveralls

edwh
Add AI image caching for jobs

- Add ai_images table for caching Pollinations-generated images
- Add jobs_illustrations.php script to generate images for popular job titles
- Update messages_illustrations.php to use cached images and cache new ones
- Update Jobs.php to return cached image URLs with job data

11 of 16 new or added lines in 1 file covered. (68.75%)

26566 of 29431 relevant lines covered (90.27%)

31.48 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

81.04
/include/misc/Jobs.php
1
<?php
2
namespace Freegle\Iznik;
3

4
use Prewk\XmlStringStreamer;
5
use Prewk\XmlStringStreamer\Stream;
6
use Prewk\XmlStringStreamer\Parser;
7

8
require_once(IZNIK_BASE . '/lib/geoPHP/geoPHP.inc');
1✔
9

10
class Jobs {
11
    /** @public  $dbhr LoggedPDO */
12
    public $dbhr;
13
    /** @public  $dbhm LoggedPDO */
14
    public $dbhm;
15

16
    private $jobKeywords = NULL;
17

18
    const MINIMUM_CPC = 0.10;
19

20
    function __construct(LoggedPDO $dbhr, LoggedPDO $dbhm) {
21
        $this->dbhr = $dbhr;
8✔
22
        $this->dbhm = $dbhm;
8✔
23
    }
24

25
    public function query($lat, $lng, $limit = 50, $category = NULL) {
26
        # To make efficient use of the spatial index we construct a box around our lat/lng, and search for jobs
27
        # where the geometry overlaps it.  We keep expanding our box until we find enough.
28
        #
29
        # We used to double the ambit each time, but that led to long queries, probably because we would suddenly
30
        # include a couple of cities or something.
31
        $step = 0.02;
6✔
32
        $ambit = $step;
6✔
33

34
        $ret = [];
6✔
35
        $got = [];
6✔
36
        $gotbody = [];
6✔
37
        $gottitle = [];
6✔
38
        $passes = 0;
6✔
39

40
        do {
41
            $swlat = $lat - $ambit;
6✔
42
            $nelat = $lat + $ambit;
6✔
43
            $swlng = $lng - $ambit;
6✔
44
            $nelng = $lng + $ambit;
6✔
45

46
            $poly = "POLYGON(($swlng $swlat, $swlng $nelat, $nelng $nelat, $nelng $swlat, $swlng $swlat))";
6✔
47
            $categoryq = $category ? (" AND category = " . $this->dbhr->quote($category)) : '';
6✔
48

49
            # We use ST_Within because that takes advantage of the spatial index, whereas ST_Intersects does not.
50
            $alreadyq = '';
6✔
51

52
            if (count($got)) {
6✔
53
                $alreadyq = " AND jobs.id NOT IN (" . implode(',', array_keys($got)) . ") AND jobs.title NOT IN (";
4✔
54

55
                $first = TRUE;
4✔
56
                foreach ($gottitle as $title) {
4✔
57
                    if (!$first) {
4✔
58
                        $alreadyq .= ", ";
×
59
                    }
60

61
                    $alreadyq .= $this->dbhr->quote($title);
4✔
62
                    $first = FALSE;
4✔
63
                }
64

65
                $alreadyq .= ") ";
4✔
66
            }
67

68
            $sql = "SELECT $ambit AS ambit, 
6✔
69
       ST_Distance(geometry, ST_GeomFromText('POINT($lng $lat)', {$this->dbhr->SRID()})) AS dist,
6✔
70
       CASE WHEN ST_Dimension(geometry) < 2 THEN 0 ELSE ST_Area(geometry) END AS area,
71
       jobs.id, jobs.url, jobs.title, jobs.location, jobs.body, jobs.job_reference, jobs.cpc, jobs.clickability
72
        FROM `jobs`
73
        WHERE ST_Within(geometry, ST_GeomFromText('$poly', {$this->dbhr->SRID()})) 
6✔
74
            AND (ST_Dimension(geometry) < 2 OR ST_Area(geometry) / ST_Area(ST_GeomFromText('$poly', {$this->dbhr->SRID()})) < 2)
6✔
75
            AND cpc >= " . Jobs::MINIMUM_CPC . "
6✔
76
            AND visible = 1
77
            $alreadyq
6✔
78
            $categoryq
6✔
79
        ORDER BY cpc DESC, dist ASC, posted_at DESC LIMIT $limit;";
6✔
80
            $jobs = $this->dbhr->preQuery($sql);
6✔
81
            #error_log($sql . " found " . count($jobs));
82
            $passes++;
6✔
83

84
            foreach ($jobs as $job) {
6✔
85
                $got[$job['id']] = TRUE;
4✔
86
                $gotbody[$job['body']] = $job['body'];
4✔
87
                $gottitle[$job['title']] = $job['title'];
4✔
88
                $job['passes'] = $passes;
4✔
89

90
                # We have clickability, which is our estimate of how likely people are to click on a job based on
91
                # past clicks.  We also have the CPC, which is how much we expect to earn from a click.
92
                # Use these to order the jobs by our expected income.
93
                $cpc = max($job['cpc'], 0.0001);
4✔
94
                $job['expectation'] = $cpc * $job['clickability'];
4✔
95

96
                $ret[] = $job;
4✔
97
            }
98

99
            $ambit += $step;
6✔
100
        } while (count($ret) < $limit && $ambit < 1);
6✔
101

102
        usort($ret, function($a, $b) {
6✔
103
            # Take care - must return integer, not float, otherwise the sort doesn't work.
104
            return ceil($b['cpc'] - $a['cpc']);
1✔
105
        });
6✔
106

107
        $ret = array_slice($ret, 0, $limit);
6✔
108

109
        # Look up cached AI images for job titles
110
        $titles = array_unique(array_column($ret, 'title'));
6✔
111
        $imageCache = [];
6✔
112

113
        if (count($titles)) {
6✔
114
            $placeholders = implode(',', array_fill(0, count($titles), '?'));
4✔
115
            $images = $this->dbhr->preQuery(
4✔
116
                "SELECT name, externaluid FROM ai_images WHERE name IN ($placeholders)",
4✔
117
                array_values($titles)
4✔
118
            );
4✔
119

120
            foreach ($images as $img) {
4✔
NEW
121
                $imageCache[$img['name']] = $img['externaluid'];
×
122
            }
123
        }
124

125
        # Add image URL to each job
126
        foreach ($ret as &$job) {
6✔
127
            if (isset($imageCache[$job['title']])) {
4✔
NEW
128
                $uid = $imageCache[$job['title']];
×
NEW
129
                $p = strrpos($uid, 'freegletusd-');
×
NEW
130
                if ($p !== FALSE) {
×
NEW
131
                    $job['image'] = IMAGE_DELIVERY . "?url=" . TUS_UPLOADER . "/" . substr($uid, $p + strlen('freegletusd-')) . "/";
×
132
                }
133
            }
134
        }
135

136
        return $ret;
6✔
137
    }
138

139
    public function get($id) {
140
        $jobs = $this->dbhr->preQuery("SELECT * FROM jobs WHERE id = ?", [
1✔
141
            $id
1✔
142
        ]);
1✔
143

144
        return $jobs;
1✔
145
    }
146

147
    public static function getKeywords($str) {
148
        $initial = explode(' ',$str);
1✔
149

150
        # Remove some stuff.
151
        $arr = [];
1✔
152

153
        foreach ($initial as $i) {
1✔
154
            $w = preg_replace("/[^A-Za-z]/", '', $i);
1✔
155

156
            if (strlen($w) > 2) {
1✔
157
                $arr[] = $w;
1✔
158
            }
159
        }
160

161
        $result = [];
1✔
162

163
        for($i=0; $i < count($arr)-1; $i++) {
1✔
164
            $result[] =  strtolower($arr[$i]) . ' ' . strtolower($arr[$i+1]);
1✔
165
        }
166

167
        return $result;
1✔
168
    }
169

170
    public function recordClick($jobid, $link, $userid) {
171
        # Use ignore because we can get clicks for jobs we have purged.
172
        $this->dbhm->preExec("INSERT IGNORE INTO logs_jobs (userid, jobid, link) VALUES (?, ?, ?);", [
1✔
173
            $userid,
1✔
174
            $jobid,
1✔
175
            $link
1✔
176
        ]);
1✔
177
    }
178

179
    public function analyseClickability() {
180
        # This looks at the jobs we've clicked on, extracts keywords, and uses that to build a measure of how
181
        # clickable a job is.  Obviously this is distorted by the order in which we display jobs, but it
182
        # still gives a reasonable idea of the keywords which would generate the most clicks.
183
        #
184
        # Prune old job logs - old data not worth analysing.
185
        $mysqltime = date("Y-m-d H:i:s", strtotime("midnight 31 days ago"));
×
186
        $this->dbhm->preExec("DELETE FROM logs_jobs WHERE timestamp < ?", [
×
187
            $mysqltime
×
188
        ]);
×
189

190
        # Find any jobs which have the link but not the id.
191
        $logs = $this->dbhr->preQuery("SELECT jobs.id AS jobid, logs_jobs.id FROM logs_jobs 
×
192
    INNER JOIN jobs ON jobs.url = logs_jobs.link 
193
    WHERE jobid IS NULL AND link IS NOT NULL
194
    ORDER BY id DESC;");
×
195
        error_log("Logs to fix " . count($logs));
×
196
        $count = 0;
×
197

198
        foreach ($logs as $log) {
×
199
            $this->dbhm->preExec("UPDATE logs_jobs SET jobid = ? WHERE id = ?;", [
×
200
                $log['jobid'],
×
201
                $log['id']
×
202
            ]);
×
203

204
            $count++;
×
205

206
            if ($count % 100 == 0) {
×
207
                error_log("...$count / " . count($logs));
×
208
            }
209
        }
210

211
        # Now process the clicked jobs to extract keywords.  Use DISTINCT as some people click obsessively on the same job.
212
        $jobs = $this->dbhr->preQuery("SELECT DISTINCT jobs.* FROM logs_jobs INNER JOIN jobs ON logs_jobs.jobid = jobs.id");
×
213
        $this->dbhm->preExec("TRUNCATE TABLE jobs_keywords");
×
214

215
        error_log("Process " . count($jobs));
×
216

217
        foreach ($jobs as $job) {
×
218
            $keywords = Jobs::getKeywords($job['title']);
×
219

220
            if (count($keywords)) {
×
221
                #error_log("{$job['title']} => " . json_encode($keywords));
222
                foreach ($keywords as $k) {
×
223
                    $this->dbhm->preExec("INSERT INTO jobs_keywords (keyword, count) VALUES (?, 1) ON DUPLICATE KEY UPDATE count = count + 1;", [
×
224
                        $k
×
225
                    ]);
×
226
                }
227
            }
228
        }
229
    }
230

231
    public function updateClickability() {
232
        # This updates the list of jobs with a clickability measure, based on the keyword analyse we have done on
233
        # previous clicks.
234
        $jobs = $this->dbhr->preQuery("SELECT id, title FROM jobs;");
×
235

236
        $maxish = $this->getMaxish();
×
237

238
        foreach ($jobs as $job) {
×
239
            $score = $this->clickability($job['id'], $job['title'], $maxish);
×
240
            $this->dbhm->preExec("UPDATE jobs SET clickability = ? WHERE id = ?;", [
×
241
                $score,
×
242
                $job['id']
×
243
            ]);
×
244
        }
245
    }
246

247
    public function clickability($jobid, $title = NULL, $maxish = NULL) {
248
        $ret = 0;
1✔
249

250
        if (!$title) {
1✔
251
            # Collect the sum of the counts of keywords present in this title.
252
            $jobs = $this->dbhr->preQuery("SELECT title FROM jobs WHERE id = ?;", [
1✔
253
                $jobid
1✔
254
            ]);
1✔
255

256
            foreach ($jobs as $job) {
1✔
257
                $title = $job['title'];
1✔
258
            }
259
        }
260

261
        if ($title) {
1✔
262
            $keywords = Jobs::getKeywords($title);
1✔
263

264
            if (count($keywords)) {
1✔
265
                if (!$this->jobKeywords) {
1✔
266
                    # Cache in this object because it speeds bulk load a lot.
267
                    $this->jobKeywords = [];
1✔
268
                    $jobKeywords = $this->dbhr->preQuery("SELECT * FROM jobs_keywords");
1✔
269

270
                    foreach ($jobKeywords as $j) {
1✔
271
                        $this->jobKeywords[$j['keyword']] = $j['count'];
1✔
272
                    }
273
                }
274

275
                foreach ($keywords as $keyword) {
1✔
276
                    if (array_key_exists($keyword, $this->jobKeywords)) {
1✔
277
                        $ret += $this->jobKeywords[$keyword];
×
278
                    }
279
                }
280
            }
281
        }
282

283
        # Normalise this, so that if we get more clicks overall because we have more site activity we don't
284
        # think it's because the jobs we happen to be clicking are more desirable. Use the 95th percentile to
285
        # get the maxish value (avoiding outliers).
286
        if (!$maxish) {
1✔
287
            $maxish = $this->getMaxish();
1✔
288
        }
289

290
        return $ret / $maxish;
1✔
291
    }
292

293
    public function getMaxish() {
294
        $maxish = 1;
2✔
295

296
        $m = $this->dbhr->preQuery("SELECT count FROM 
2✔
297
(SELECT t.*,  @row_num :=@row_num + 1 AS row_num FROM jobs_keywords t, 
298
    (SELECT @row_num:=0) counter ORDER BY count) 
299
temp WHERE temp.row_num = ROUND (.95* @row_num);");
2✔
300

301
        if (count($m)) {
2✔
302
            $maxish = $m[0]['count'];
2✔
303
        }
304

305
        return $maxish;
2✔
306
    }
307

308
    public static function geocode($addr, $allowPoint, $exact, $bbswlat = 49.959999905, $bbswlng = -7.57216793459, $bbnelat = 58.6350001085, $bbnelng = 1.68153079591) {
309
        // Special cases
310
        if ($addr == 'West Marsh') {
2✔
311
            $addr = 'Grimsby';
×
312
        } else if ($addr == 'Stoney Middleton') {
2✔
313
            $addr .= ', Derbyshire';
×
314
        } else if ($addr == 'Middleton Stoney') {
2✔
315
            $addr .= ', Oxfordshire';
×
316
        } else if ($addr == 'Kirkwall') {
2✔
317
            $addr .= ', Orkney';
×
318
        } else if ($addr == 'City of York') {
2✔
319
            $addr = 'York';
×
320
        } else if ($addr == 'Sutton Central') {
2✔
321
            $addr = 'Sutton, London';
×
322
        } else if ($addr == 'Hampden Park') {
2✔
323
            $addr = 'Hampden Park, Eastbourne';
×
324
        } else if ($addr == 'Guernsey') {
2✔
325
            return [ NULL, NULL, NULL, NULL, NULL, NULL ];
×
326
        }
327

328
        $url = "https://" . GEOCODER . "/api?q=" . urlencode($addr) . "&bbox=$bbswlng%2C$bbswlat%2C$bbnelng%2C$bbnelat";
2✔
329
        $geocode = @file_get_contents($url);
2✔
330
        #error_log("Geocode $addr, allow point $allowPoint, exact $exact, $url");
331
        $swlng = $swlat = $nelng = $nelat = $geom = $area = NULL;
2✔
332

333
        if ($geocode) {
2✔
334
            $results = json_decode($geocode, TRUE);
2✔
335

336
            if (Utils::pres('features', $results) && count($results['features'])) {
2✔
337
                foreach ($results['features'] as $feature) {
2✔
338
                    if (Utils::pres('properties', $feature)) {
2✔
339
                        $nameMatches = Utils::pres('name', $feature['properties']) && strcmp(strtolower($feature['properties']['name']), strtolower($addr)) == 0;
2✔
340

341
                        if (Utils::pres('extent', $feature['properties'])) {
2✔
342
                            if (!$exact || !$nameMatches) {
2✔
343
                                $swlng = floatval($feature['properties']['extent'][0]);
2✔
344
                                $swlat = floatval($feature['properties']['extent'][1]);
2✔
345
                                $nelng = floatval($feature['properties']['extent'][2]);
2✔
346
                                $nelat = floatval($feature['properties']['extent'][3]);
2✔
347
                                $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
2✔
348
                                #error_log("From extent $geom");
349
                            }
350
                            break;
2✔
351
                        } else if ($allowPoint &&
1✔
352
                            (!$exact || $nameMatches) &&
1✔
353
                            Utils::pres('geometry', $feature) &&
1✔
354
                            Utils::pres('coordinates', $feature['geometry'])) {
1✔
355
                            # Invent a small polygon, just so all the geometries have the same dimension
356
                            $lat = floatval($feature['geometry']['coordinates'][1]);
1✔
357
                            $lng = floatval($feature['geometry']['coordinates'][0]);
1✔
358
                            $swlng = $lng - 0.0005;
1✔
359
                            $swlat = $lat - 0.0005;
1✔
360
                            $nelat = $lat + 0.0005;
1✔
361
                            $nelng = $lng + 0.0005;
1✔
362
                            $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
1✔
363
                            #error_log("From point $geom");
364
                            break;
1✔
365
                        }
366
                    }
367
                }
368
            }
369
        }
370

371
        if ($geom) {
2✔
372
            $g = new \geoPHP();
2✔
373
            $poly = $g::load($geom, 'wkt');
2✔
374
            $area = $poly->area();
2✔
375
        }
376

377
        #error_log("Geocode $addr => $geom area $area");
378
        return [ $swlat, $swlng, $nelat, $nelng, $geom, $area ];
2✔
379
    }
380

381
    public function scanToCSV($inputFile, $outputFile, $maxage = 7, $fakeTime = FALSE, $distribute = 0.0005, $perFile = 1000, $format = 'csv') {
382
        $now = $fakeTime ? '2001-01-01 00:00:00' : date("Y-m-d H:i:s", time());
1✔
383
        $out = fopen("$outputFile.tmp", 'w');
1✔
384

385
        # This scans the XML job file provided by WhatJobs, filters out the ones we want, and writes to a
386
        # CSV file.
387
        $options = array(
1✔
388
            "captureDepth" => 3,
1✔
389
            "expectGT" => TRUE
1✔
390
        );
1✔
391

392
        $parser = new Parser\StringWalker($options);
1✔
393
        $stream = new Stream\File($inputFile, 1024);
1✔
394
        $streamer = new XmlStringStreamer($parser, $stream);
1✔
395

396
        $count = 0;
1✔
397
        $new = 0;
1✔
398
        $old = 0;
1✔
399
        $toolow = 0;
1✔
400
        $nogeocode = 0;
1✔
401
        $geocache = [];
1✔
402

403
        $maxish = $this->getMaxish();
1✔
404

405
        while ($node = $streamer->getNode()) {
1✔
406
            $job = simplexml_load_string($node);
1✔
407

408
            if ($format == 1) {
1✔
409
                $location = $job->locations->location->location;
1✔
410
                $timeposted = $job->timePosted;
1✔
411
                $city = $job->locations->location->city;
1✔
412
                $state = $job->locations->location->state;
1✔
413
                $zip = $job->locations->location->zip;
1✔
414
                $country = $job->locations->location->country;
1✔
415
                $companyname = $job->company->name;
1✔
416
                $cpc = $job->custom->CPC;
1✔
417
                $deeplink = $job->urlDeeplink;
1✔
418
                $jobid = $job->id;
1✔
419
                $jobtitle = $job->title;
1✔
420
                $description = $job->description;
1✔
421
                $category = $job->category;
1✔
422
            } else {
423
                $location = $job->location;
1✔
424
                $timeposted = $job->posted_at;
1✔
425
                $city = $job->city;
1✔
426
                $state = $job->state;
1✔
427
                $zip = $job->zip;
1✔
428
                $country = $job->country;
1✔
429
                $companyname = $job->company;
1✔
430
                $cpc = $job->cpc;
1✔
431
                $deeplink = $job->url;
1✔
432
                $jobid = $job->job_reference;
1✔
433
                $jobtitle = $job->title;
1✔
434
                $description = $job->body;
1✔
435
                $category = $job->category;
1✔
436
            }
437

438
            #error_log("Location $location, timeposted $timeposted, city $city, state $state, zip $zip, country $country, companyname $companyname, cpc $cpc, deeplink $deeplink, jobid $jobid, jobtitle $jobtitle, description " . strlen($description) . ", category $category");
439

440
            # Only add new jobs.
441
            $age = (time() - strtotime($timeposted)) / (60 * 60 * 24);
1✔
442

443
            if ($age < $maxage) {
1✔
444
                if (!$jobid) {
1✔
445
                    #error_log("No job id for {$jobtitle}, {$location}");
446
                } else if (floatval($cpc) < Jobs::MINIMUM_CPC) {
1✔
447
                    # Ignore this job - not worth us showing.
448
                    $toolow++;
×
449
                } else {
450
                    $geokey = "{$city},{$state},{$country}";
1✔
451
                    $geom = NULL;
1✔
452

453
                    $wascached = array_key_exists($geokey, $geocache);
1✔
454

455
                    if ($wascached) {
1✔
456
                        # In memory cache is faster than DB queries.  The total number of locations of the order
457
                        # of 20K so this is ok.
458
                        list ($geom, $swlat, $swlng, $nelat, $nelng) = $geocache[$geokey];
×
459
                        #error_log("Got $geokey from geocache");
460
                    } else {
461
                        # See if we already have the address geocoded - would save hitting the geocoder.
462
                        $geo = $this->dbhr->preQuery(
1✔
463
                            "SELECT ST_AsText(ST_Envelope(geometry)) AS geom FROM jobs WHERE city = ? AND state = ? AND country = ? LIMIT 1;",
1✔
464
                            [
1✔
465
                                $city,
1✔
466
                                $state,
1✔
467
                                $country
1✔
468
                            ]
1✔
469
                        );
1✔
470

471
                        $geom = null;
1✔
472
                        #error_log("Found " . count($geo) . " {$city}, {$state}, {$country}");
473

474
                        if (count($geo))
1✔
475
                        {
476
                            $geom = $geo[0]['geom'];
×
477
                            $g = new \geoPHP();
×
478
                            $poly = $g::load($geom, 'wkt');
×
479
                            $bbox = $poly->getBBox();
×
480
                            $swlat = $bbox['miny'];
×
481
                            $swlng = $bbox['minx'];
×
482
                            $nelat = $bbox['maxy'];
×
483
                            $nelng = $bbox['maxx'];
×
484

485
                            #error_log("Got existing location $geom = $swlat,$swlng to $nelat, $nelng");
486
                        }
487
                    }
488

489
                    if (!$geom) {
1✔
490
                        # Try to geocode the address.
491
                        #
492
                        # We have location, city, state, and country.  We can ignore the country.
493
                        $badStates = [ 'not specified', 'united kingdom of great britain and northern ireland', 'united kingdom', 'uk', 'england', 'scotland', 'wales', 'home based' ];
1✔
494
                        if ($state && strlen(trim($state)) && !in_array(strtolower(trim($state)), $badStates)) {
1✔
495
                            # Sometimes the state is a region.  Sometimes it is a much more specific location. Sigh.
496
                            #
497
                            # So first, geocode the state; if we get a specific location we can use that.  If it's a larger
498
                            # location then we can use it as a bounding box.
499
                            #
500
                            # Geocoding gets confused by "Borough of", e.g. "Borough of Swindon".
501
                            $state = str_ireplace($state, 'Borough of ', '');
1✔
502

503
                            list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($job, FALSE, TRUE);
1✔
504

505
                            if ($area && $area < 0.05) {
1✔
506
                                # We have a small 'state', which must be an actual location.  Stop.
507
                                #error_log("Got small area {$state}");
508
                            } else if ($geom) {
1✔
509
                                # We have a large state, which is a plausible region. Use it as a bounding box for the
510
                                # city.
511
                                #error_log("Gecoded {$state} to $geom");
512
                                list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE, $swlat, $swlng, $nelat, $nelng);
×
513

514
                                if (!$geom) {
×
515
                                    #error_log("Failed to geocode {$city} in {$state}");
516
                                }
517
                            }
518
                        }
519
                    }
520

521
                    $badCities = [ 'not specified', 'null', 'home based', 'united kingdom', ', , united kingdom' ];
1✔
522
                    if (!$geom && $city && strlen(trim($city)) && !in_array(strtolower(trim($city)), $badCities)) {
1✔
523
                        # We have not managed anything from the state.  Try just the city.  This will lead to some being
524
                        # wrong, but that's life.
525
                        #error_log("State no use, use city {$city}");
526
                        list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE);
1✔
527

528
                        if ($area > 50) {
1✔
529
                            #error_log("{$city} => $geom is too large at $area");
530
                            $geom = NULL;
×
531
                        }
532
                    }
533

534
                    if ($geom) {
1✔
535
                        #error_log("Geocoded {$city}, {$state} => $geom");
536

537
                        if (!$wascached) {
1✔
538
                            $geocache[$geokey] = [ $geom, $swlat, $swlng, $nelat, $nelng];
1✔
539
                        }
540

541
                        # Jobs tend to cluster, e.g. in cities.  When we are searching we expand a box from our current
542
                        # location until we overlap enough.  The clustering means we may suddenly match thousands of them,
543
                        # which is slow.  So instead of using the job location box as is, randomise to be a small box
544
                        # within the location box.  That way we will encounter some of the jobs sooner and hence have faster queries.
545
                        $newlat = $swlat + (mt_rand() / mt_getrandmax()) * ($nelat - $swlat);
1✔
546
                        $newlng = $swlng + (mt_rand() / mt_getrandmax()) * ($nelng - $swlng);
1✔
547
                        $swlng = $newlng - $distribute;
1✔
548
                        $swlat = $newlat - $distribute;
1✔
549
                        $nelat = $newlat + $distribute;
1✔
550
                        $nelng = $newlng + $distribute;
1✔
551
                        $geom = $distribute ? Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng) : $geom;
1✔
552
                        #error_log("Modified loc to $geom");
553

554
                        # If the job already exists in the old table we want to preserve the id, because people
555
                        # might click on an old id from email.  If we don't find it then we'll use NULL, which will
556
                        # create a new auto-increment id.  We have to go to some effort to get the NULL in there -
557
                        # we want \N, but unquoted, so we put \N and then strip the quotes later.
558
                        #
559
                        # We could speed this up by sending out the job_reference rather than the id.
560
                        $id = "\N";
1✔
561

562
                        $oldids = $this->dbhr->preQuery("SELECT id FROM jobs WHERE job_reference = ?;", [
1✔
563
                            $jobid
1✔
564
                        ]);
1✔
565

566
                        foreach ($oldids as $oldid) {
1✔
567
                            $id = $oldid['id'];
×
568
                        }
569

570
                        try {
571
                            #error_log("Added job $jobid $id $geom");
572
                            $clickability = 0;
1✔
573
                            $title = NULL;
1✔
574

575
                            if ($jobtitle) {
1✔
576
                                $title = str_replace('&#39;', "'", html_entity_decode($jobtitle));
1✔
577
                                # Don't get clickability - we don't currently use it and it slows things down.
578
                                $clickability = 1;
1✔
579
                                #$clickability = $this->clickability(NULL, html_entity_decode($title), $maxish);
580
                            }
581

582
                            $body = NULL;
1✔
583

584
                            if ($description) {
1✔
585
                                # Truncate the body - we only display the body as a snippet to get people to click
586
                                # through.
587
                                #
588
                                # Fix up line endings which cause problems with fputcsv, and other weirdo characters
589
                                # that get mangled en route to us.  This isn't a simple UTF8 problem because the data
590
                                # comes from all over the place and is a bit of a mess.
591
                                $body = html_entity_decode($description);
1✔
592
                                $body = str_replace("\r\n", " ", $body);
1✔
593
                                $body = str_replace("\r", " ", $body);
1✔
594
                                $body = str_replace("\n", " ", $body);
1✔
595
                                $body = str_replace('–', '-', $body);
1✔
596
                                $body = str_replace('Â', '-', $body);
1✔
597
                                $body = substr($body, 0, 256);
1✔
598

599
                                # The truncation might happen to leave a \ at the end of the string, which would then
600
                                # escape the following quote added by fputcsv, and the world would explode in a ball of
601
                                # fire.  Add a space to avoid that.
602
                                $body .= ' ';
1✔
603
                            }
604

605
                            # Sometimes the geocode can end up with line breaks.
606
                            $geom = str_replace(["\n", "\r"], '', $geom);
1✔
607

608
                            # Write the job to CSV, ready for LOAD DATA INFILE later.
609
                            # location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
610
                            # mobile_friendly_apply, category, html_jobs, url, body, cpc, geometry, clickability,
611
                            # bodyhash, seenat
612
                            fputcsv($out, [
1✔
613
                                $id,
1✔
614
                                $location ? html_entity_decode($location) : NULL,
1✔
615
                                $title,
1✔
616
                                $city ? html_entity_decode($city) : NULL,
1✔
617
                                $state ? html_entity_decode($state) : NULL,
1✔
618
                                $zip ? html_entity_decode($zip) : NULL,
1✔
619
                                $country ? html_entity_decode($country) : NULL,
1✔
620
                                NULL,
1✔
621
                                $timeposted ? html_entity_decode($timeposted) : NULL,
1✔
622
                                $jobid ? html_entity_decode($jobid) : NULL,
1✔
623
                                $companyname ? html_entity_decode($companyname) : NULL,
1✔
624
                                NULL,
1✔
625
                                $category ? html_entity_decode($category) : NULL,
1✔
626
                                NULL,
1✔
627
                                $deeplink ? html_entity_decode($deeplink) : NULL,
1✔
628
                                $body,
1✔
629
                                $cpc ? html_entity_decode($cpc) : NULL,
1✔
630
                                $geom,
1✔
631
                                $clickability,
1✔
632
                                md5($body),
1✔
633
                                $now,
1✔
634
                                1
1✔
635
                            ]);
1✔
636

637
                            $new++;
1✔
638
                        } catch (\Exception $e) {
×
639
                            error_log("Failed to add {$jobtitle} $geom " . $e->getMessage());
1✔
640
                        }
641
                    } else {
642
                        #error_log("Couldn't geocode {$city}, {$state}");
643
                        $nogeocode++;
1✔
644
                    }
645
                }
646
            } else {
647
                $old++;
×
648
            }
649

650
            $count++;
1✔
651

652
            if ($count % 1000 === 0) {
1✔
653
                error_log(date("Y-m-d H:i:s", time()) . "...processing $count");
×
654
            }
655
        }
656

657
        fclose($out);
1✔
658

659
        # Now unquote any "\N" into \N.
660
        error_log(date("Y-m-d H:i:s", time()) . "...finished loop, split");
1✔
661
        $data = file_get_contents("$outputFile.tmp");
1✔
662
        $data = str_replace('"\N"', '\N', $data);
1✔
663
        file_put_contents($outputFile, $data);
1✔
664
        error_log(date("Y-m-d H:i:s", time()) . "...written file $outputFile");
1✔
665
    }
666

667
    public function prepareForLoadCSV() {
668
        error_log(date("Y-m-d H:i:s", time()) . "...DROP jobs_new");
1✔
669
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_new;");
1✔
670
        error_log(date("Y-m-d H:i:s", time()) . "...CREATE jobs_new");
1✔
671
        $this->dbhm->preExec("CREATE TABLE jobs_new LIKE jobs;");
1✔
672
        $this->dbhm->preExec("SET GLOBAL local_infile=1;");
1✔
673
    }
674

675
    public function loadCSV($csv) {
676
        # Percona cluster has non-standard behaviour for LOAD DATA INFILE, and commits are performed every 10K rows.
677
        # But even this seems to place too much load on cluster syncing, and can cause lockups.  So we split the CSV
678
        # into 1K rows and load each of them in turn.
679
        error_log(date("Y-m-d H:i:s", time()) . "...Split CSV file");
1✔
680
        system("cd /tmp/; rm feed-split*; split -1000 $csv feed-split");
1✔
681
        error_log(date("Y-m-d H:i:s", time()) . "...load files");
1✔
682
        foreach (glob('/tmp/feed-split*') as $fn) {
1✔
683
            set_time_limit(600);
1✔
684
            $this->dbhm->preExec("LOAD DATA LOCAL INFILE '$fn' INTO TABLE jobs_new
1✔
685
            CHARACTER SET latin1
686
            FIELDS TERMINATED BY ',' 
687
            OPTIONALLY ENCLOSED BY '\"' 
688
            LINES TERMINATED BY '\n'
689
            (id, location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
690
             mobile_friendly_apply, category, html_jobs, url, body, cpc, @GEOM, clickability,
691
             bodyhash, seenat, visible) SET geometry = ST_GeomFromText(@GEOM, " . $this->dbhm->SRID() . ");");
1✔
692
            error_log(date("Y-m-d H:i:s", time()) . "...loaded file $fn");
1✔
693
        }
694
        error_log(date("Y-m-d H:i:s", time()) . "...finished file load");
1✔
695
    }
696

697
    public function deleteSpammyJobs($table) {
698
        # There are some "spammy" jobs which are posted with identical descriptions across the UK.  They feel scuzzy, so
699
        # remove them.
700
        $spamcount = 0;
1✔
701
        error_log(date("Y-m-d H:i:s", time()) . "Count spammy jobs");
1✔
702
        $spams = $this->dbhr->preQuery("SELECT COUNT(*) as count, title, bodyhash FROM $table GROUP BY bodyhash HAVING count > 50 AND bodyhash IS NOT NULL;");
1✔
703

704
        error_log(date("Y-m-d H:i:s", time()) . "Delete spammy jobs");
1✔
705

706
        foreach ($spams as $spam) {
1✔
707
            error_log("Delete spammy job {$spam['title']} * {$spam['count']}");
×
708
            set_time_limit(600);
×
709
            $spamcount += $spam['count'];
×
710

711
            do {
712
                $this->dbhm->preExec("DELETE FROM $table WHERE bodyhash = ? LIMIT 1;", [
×
713
                    $spam['bodyhash']
×
714
                ]);
×
715
            } while ($this->dbhm->rowsAffected());
×
716
        }
717
    }
718

719
    public function swapTables() {
720
        # We want to swap the jobs_new table with the jobs table, atomically.
721
        error_log(date("Y-m-d H:i:s", time()) . " Swap tables...");
1✔
722
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
723
        $this->dbhm->preExec("RENAME TABLE jobs TO jobs_old, jobs_new TO jobs;");
1✔
724
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
725
        error_log(date("Y-m-d H:i:s", time()) . "...tables swapped...");
1✔
726
    }
727
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc