• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

Freegle / iznik-server / a3a11a6e-b72d-48a0-807c-9f018f8b47cd

pending completion
a3a11a6e-b72d-48a0-807c-9f018f8b47cd

push

circleci

Edward Hibbert
Duplicates when sharing popular posts.

26 of 26 new or added lines in 3 files covered. (100.0%)

19565 of 20566 relevant lines covered (95.13%)

32.31 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.51
/include/misc/Jobs.php
1
<?php
2
namespace Freegle\Iznik;
3

4
use Prewk\XmlStringStreamer;
5
use Prewk\XmlStringStreamer\Stream;
6
use Prewk\XmlStringStreamer\Parser;
7

8
require_once(IZNIK_BASE . '/lib/geoPHP/geoPHP.inc');
1✔
9

10
class Jobs {
11
    /** @public  $dbhr LoggedPDO */
12
    public $dbhr;
13
    /** @public  $dbhm LoggedPDO */
14
    public $dbhm;
15

16
    private $jobKeywords = NULL;
17

18
    const MINIMUM_CPC = 0.10;
19

20
    function __construct(LoggedPDO $dbhr, LoggedPDO $dbhm) {
21
        $this->dbhr = $dbhr;
6✔
22
        $this->dbhm = $dbhm;
6✔
23
    }
24

25
    public function query($lat, $lng, $limit = 50, $category = NULL) {
26
        # To make efficient use of the spatial index we construct a box around our lat/lng, and search for jobs
27
        # where the geometry overlaps it.  We keep expanding our box until we find enough.
28
        #
29
        # We used to double the ambit each time, but that led to long queries, probably because we would suddenly
30
        # include a couple of cities or something.
31
        $step = 0.02;
4✔
32
        $ambit = $step;
4✔
33

34
        $ret = [];
4✔
35
        $got = [];
4✔
36
        $gotbody = [];
4✔
37
        $gottitle = [];
4✔
38
        $passes = 0;
4✔
39

40
        do {
41
            $swlat = $lat - $ambit;
4✔
42
            $nelat = $lat + $ambit;
4✔
43
            $swlng = $lng - $ambit;
4✔
44
            $nelng = $lng + $ambit;
4✔
45

46
            $poly = "POLYGON(($swlng $swlat, $swlng $nelat, $nelng $nelat, $nelng $swlat, $swlng $swlat))";
4✔
47
            $categoryq = $category ? (" AND category = " . $this->dbhr->quote($category)) : '';
4✔
48

49
            # We use ST_Within because that takes advantage of the spatial index, whereas ST_Intersects does not.
50
            $alreadyq = '';
4✔
51

52
            if (count($got)) {
4✔
53
                $alreadyq = " AND jobs.id NOT IN (" . implode(',', array_keys($got)) . ") AND jobs.title NOT IN (";
2✔
54

55
                $first = TRUE;
2✔
56
                foreach ($gottitle as $title) {
2✔
57
                    if (!$first) {
2✔
58
                        $alreadyq .= ", ";
×
59
                    }
60

61
                    $alreadyq .= $this->dbhr->quote($title);
2✔
62
                    $first = FALSE;
2✔
63
                }
64

65
                $alreadyq .= ") ";
2✔
66
            }
67

68
            $sql = "SELECT $ambit AS ambit, 
4✔
69
       ST_Distance(geometry, ST_GeomFromText('POINT($lng $lat)', {$this->dbhr->SRID()})) AS dist,
4✔
70
       CASE WHEN ST_Dimension(geometry) < 2 THEN 0 ELSE ST_Area(geometry) END AS area,
71
       jobs.id, jobs.url, jobs.title, jobs.location, jobs.body, jobs.job_reference, jobs.cpc, jobs.clickability
72
        FROM `jobs`
73
        WHERE ST_Within(geometry, ST_GeomFromText('$poly', {$this->dbhr->SRID()})) 
4✔
74
            AND (ST_Dimension(geometry) < 2 OR ST_Area(geometry) / ST_Area(ST_GeomFromText('$poly', {$this->dbhr->SRID()})) < 2)
4✔
75
            AND cpc >= " . Jobs::MINIMUM_CPC . "
4✔
76
            AND visible = 1
77
            $alreadyq
78
            $categoryq
79
        ORDER BY cpc DESC, dist ASC, posted_at DESC LIMIT $limit;";
80
            $jobs = $this->dbhr->preQuery($sql);
4✔
81
            #error_log($sql . " found " . count($jobs));
82
            $passes++;
4✔
83

84
            foreach ($jobs as $job) {
4✔
85
                $got[$job['id']] = TRUE;
2✔
86
                $gotbody[$job['body']] = $job['body'];
2✔
87
                $gottitle[$job['title']] = $job['title'];
2✔
88
                $job['passes'] = $passes;
2✔
89

90
                # We have clickability, which is our estimate of how likely people are to click on a job based on
91
                # past clicks.  We also have the CPC, which is how much we expect to earn from a click.
92
                # Use these to order the jobs by our expected income.
93
                $cpc = max($job['cpc'], 0.0001);
2✔
94
                $job['expectation'] = $cpc * $job['clickability'];
2✔
95

96
                $ret[] = $job;
2✔
97
            }
98

99
            $ambit += $step;
4✔
100
        } while (count($ret) < $limit && $ambit < 1);
4✔
101

102
        usort($ret, function($a, $b) {
4✔
103
            # Take care - must return integer, not float, otherwise the sort doesn't work.
104
            return ceil($b['cpc'] - $a['cpc']);
×
105
        });
106

107
        $ret = array_slice($ret, 0, $limit);
4✔
108

109
        return $ret;
4✔
110
    }
111

112
    public function get($id) {
113
        $jobs = $this->dbhr->preQuery("SELECT * FROM jobs WHERE id = ?", [
1✔
114
            $id
115
        ]);
116

117
        return $jobs;
1✔
118
    }
119

120
    public static function getKeywords($str) {
121
        $initial = explode(' ',$str);
1✔
122

123
        # Remove some stuff.
124
        $arr = [];
1✔
125

126
        foreach ($initial as $i) {
1✔
127
            $w = preg_replace("/[^A-Za-z]/", '', $i);
1✔
128

129
            if (strlen($w) > 2) {
1✔
130
                $arr[] = $w;
1✔
131
            }
132
        }
133

134
        $result = [];
1✔
135

136
        for($i=0; $i < count($arr)-1; $i++) {
1✔
137
            $result[] =  strtolower($arr[$i]) . ' ' . strtolower($arr[$i+1]);
1✔
138
        }
139

140
        return $result;
1✔
141
    }
142

143
    public function recordClick($jobid, $link, $userid) {
144
        # Use ignore because we can get clicks for jobs we have purged.
145
        $this->dbhm->preExec("INSERT IGNORE INTO logs_jobs (userid, jobid, link) VALUES (?, ?, ?);", [
1✔
146
            $userid,
147
            $jobid,
148
            $link
149
        ]);
150
    }
151

152
    public function clickability($jobid, $title = NULL, $maxish = NULL) {
153
        $ret = 0;
1✔
154

155
        if (!$title) {
1✔
156
            # Collect the sum of the counts of keywords present in this title.
157
            $jobs = $this->dbhr->preQuery("SELECT title FROM jobs WHERE id = ?;", [
1✔
158
                $jobid
159
            ]);
160

161
            foreach ($jobs as $job) {
1✔
162
                $title = $job['title'];
1✔
163
            }
164
        }
165

166
        if ($title) {
1✔
167
            $keywords = Jobs::getKeywords($title);
1✔
168

169
            if (count($keywords)) {
1✔
170
                if (!$this->jobKeywords) {
1✔
171
                    # Cache in this object because it speeds bulk load a lot.
172
                    $this->jobKeywords = [];
1✔
173
                    $jobKeywords = $this->dbhr->preQuery("SELECT * FROM jobs_keywords");
1✔
174

175
                    foreach ($jobKeywords as $j) {
1✔
176
                        $this->jobKeywords[$j['keyword']] = $j['count'];
1✔
177
                    }
178
                }
179

180
                foreach ($keywords as $keyword) {
1✔
181
                    if (array_key_exists($keyword, $this->jobKeywords)) {
1✔
182
                        $ret += $this->jobKeywords[$keyword];
×
183
                    }
184
                }
185
            }
186
        }
187

188
        # Normalise this, so that if we get more clicks overall because we have more site activity we don't
189
        # think it's because the jobs we happen to be clicking are more desirable. Use the 95th percentile to
190
        # get the maxish value (avoiding outliers).
191
        if (!$maxish) {
1✔
192
            $maxish = $this->getMaxish();
1✔
193
        }
194

195
        return $ret / $maxish;
1✔
196
    }
197

198
    public function getMaxish() {
199
        $maxish = 1;
2✔
200

201
        $m = $this->dbhr->preQuery("SELECT count FROM 
2✔
202
(SELECT t.*,  @row_num :=@row_num + 1 AS row_num FROM jobs_keywords t, 
203
    (SELECT @row_num:=0) counter ORDER BY count) 
204
temp WHERE temp.row_num = ROUND (.95* @row_num);");
205

206
        if (count($m)) {
2✔
207
            $maxish = $m[0]['count'];
2✔
208
        }
209

210
        return $maxish;
2✔
211
    }
212

213
    public static function geocode($addr, $allowPoint, $exact, $bbswlat = 49.959999905, $bbswlng = -7.57216793459, $bbnelat = 58.6350001085, $bbnelng = 1.68153079591) {
214
        // Special cases
215
        if ($addr == 'West Marsh') {
2✔
216
            $addr = 'Grimsby';
×
217
        } else if ($addr == 'Stoney Middleton') {
2✔
218
            $addr .= ', Derbyshire';
×
219
        } else if ($addr == 'Middleton Stoney') {
2✔
220
            $addr .= ', Oxfordshire';
×
221
        } else if ($addr == 'Kirkwall') {
2✔
222
            $addr .= ', Orkney';
×
223
        } else if ($addr == 'City of York') {
2✔
224
            $addr = 'York';
×
225
        } else if ($addr == 'Sutton Central') {
2✔
226
            $addr = 'Sutton, London';
×
227
        } else if ($addr == 'Hampden Park') {
2✔
228
            $addr = 'Hampden Park, Eastbourne';
×
229
        } else if ($addr == 'Guernsey') {
2✔
230
            return [ NULL, NULL, NULL, NULL, NULL, NULL ];
×
231
        }
232

233
        $url = "https://" . GEOCODER . "/api?q=" . urlencode($addr) . "&bbox=$bbswlng%2C$bbswlat%2C$bbnelng%2C$bbnelat";
2✔
234
        $geocode = @file_get_contents($url);
2✔
235
        #error_log("Geocode $addr, allow point $allowPoint, exact $exact, $url");
236
        $swlng = $swlat = $nelng = $nelat = $geom = $area = NULL;
2✔
237

238
        if ($geocode) {
2✔
239
            $results = json_decode($geocode, true);
2✔
240

241
            if (Utils::pres('features', $results) && count($results['features'])) {
2✔
242
                foreach ($results['features'] as $feature) {
2✔
243
                    if (Utils::pres('properties', $feature)) {
2✔
244
                        $nameMatches = Utils::pres('name', $feature['properties']) && strcmp(strtolower($feature['properties']['name']), strtolower($addr)) == 0;
2✔
245

246
                        if (Utils::pres('extent', $feature['properties'])) {
2✔
247
                            if (!$exact || !$nameMatches) {
2✔
248
                                $swlng = floatval($feature['properties']['extent'][0]);
2✔
249
                                $swlat = floatval($feature['properties']['extent'][1]);
2✔
250
                                $nelng = floatval($feature['properties']['extent'][2]);
2✔
251
                                $nelat = floatval($feature['properties']['extent'][3]);
2✔
252
                                $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
2✔
253
                                #error_log("From extent $geom");
254
                            }
255
                            break;
2✔
256
                        } else if ($allowPoint &&
257
                            (!$exact || $nameMatches) &&
258
                            Utils::pres('geometry', $feature) &&
1✔
259
                            Utils::pres('coordinates', $feature['geometry'])) {
1✔
260
                            # Invent a small polygon, just so all the geometries have the same dimension
261
                            $lat = floatval($feature['geometry']['coordinates'][1]);
1✔
262
                            $lng = floatval($feature['geometry']['coordinates'][0]);
1✔
263
                            $swlng = $lng - 0.0005;
1✔
264
                            $swlat = $lat - 0.0005;
1✔
265
                            $nelat = $lat + 0.0005;
1✔
266
                            $nelng = $lng + 0.0005;
1✔
267
                            $geom = Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng);
1✔
268
                            #error_log("From point $geom");
269
                            break;
1✔
270
                        }
271
                    }
272
                }
273
            }
274
        }
275

276
        if ($geom) {
2✔
277
            $g = new \geoPHP();
2✔
278
            $poly = $g::load($geom, 'wkt');
2✔
279
            $area = $poly->area();
2✔
280
        }
281

282
        #error_log("Geocode $addr => $geom area $area");
283
        return [ $swlat, $swlng, $nelat, $nelng, $geom, $area ];
2✔
284
    }
285

286
    public function scanToCSV($inputFile, $outputFile, $maxage = 7, $fakeTime = FALSE, $distribute = 0.0005, $perFile = 1000, $format) {
287
        $now = $fakeTime ? '2001-01-01 00:00:00' : date("Y-m-d H:i:s", time());
1✔
288
        $out = fopen("$outputFile.tmp", 'w');
1✔
289

290
        # This scans the XML job file provided by WhatJobs, filters out the ones we want, and writes to a
291
        # CSV file.
292
        $options = array(
1✔
293
            "captureDepth" => 3,
294
            "expectGT" => TRUE
295
        );
296

297
        $parser = new Parser\StringWalker($options);
1✔
298
        $stream = new Stream\File($inputFile, 1024);
1✔
299
        $streamer = new XmlStringStreamer($parser, $stream);
1✔
300

301
        $count = 0;
1✔
302
        $new = 0;
1✔
303
        $old = 0;
1✔
304
        $toolow = 0;
1✔
305
        $nogeocode = 0;
1✔
306
        $geocache = [];
1✔
307

308
        $maxish = $this->getMaxish();
1✔
309

310
        while ($node = $streamer->getNode()) {
1✔
311
            $job = simplexml_load_string($node);
1✔
312

313
            if ($format == 1) {
1✔
314
                $location = $job->locations->location->location;
1✔
315
                $timeposted = $job->timePosted;
1✔
316
                $city = $job->locations->location->city;
1✔
317
                $state = $job->locations->location->state;
1✔
318
                $zip = $job->locations->location->zip;
1✔
319
                $country = $job->locations->location->country;
1✔
320
                $companyname = $job->company->name;
1✔
321
                $cpc = $job->custom->CPC;
1✔
322
                $deeplink = $job->urlDeeplink;
1✔
323
                $jobid = $job->id;
1✔
324
                $jobtitle = $job->title;
1✔
325
                $description = $job->description;
1✔
326
                $category = $job->category;
1✔
327
            } else {
328
                $location = $job->location;
1✔
329
                $timeposted = $job->posted_at;
1✔
330
                $city = $job->city;
1✔
331
                $state = $job->state;
1✔
332
                $zip = $job->zip;
1✔
333
                $country = $job->country;
1✔
334
                $companyname = $job->company;
1✔
335
                $cpc = $job->cpc;
1✔
336
                $deeplink = $job->url;
1✔
337
                $jobid = $job->job_reference;
1✔
338
                $jobtitle = $job->title;
1✔
339
                $description = $job->body;
1✔
340
                $category = $job->category;
1✔
341
            }
342

343
            #error_log("Location $location, timeposted $timeposted, city $city, state $state, zip $zip, country $country, companyname $companyname, cpc $cpc, deeplink $deeplink, jobid $jobid, jobtitle $jobtitle, description " . strlen($description) . ", category $category");
344

345
            # Only add new jobs.
346
            $age = (time() - strtotime($timeposted)) / (60 * 60 * 24);
1✔
347

348
            if ($age < $maxage) {
1✔
349
                if (!$jobid) {
1✔
350
                    #error_log("No job id for {$jobtitle}, {$location}");
351
                } else if (floatval($cpc) < Jobs::MINIMUM_CPC) {
1✔
352
                    # Ignore this job - not worth us showing.
353
                    $toolow++;
×
354
                } else {
355
                    $geokey = "{$city},{$state},{$country}";
1✔
356
                    $geom = NULL;
1✔
357

358
                    $wascached = array_key_exists($geokey, $geocache);
1✔
359

360
                    if ($wascached) {
1✔
361
                        # In memory cache is faster than DB queries.  The total number of locations of the order
362
                        # of 20K so this is ok.
363
                        list ($geom, $swlat, $swlng, $nelat, $nelng) = $geocache[$geokey];
×
364
                        #error_log("Got $geokey from geocache");
365
                    } else {
366
                        # See if we already have the address geocoded - would save hitting the geocoder.
367
                        $geo = $this->dbhr->preQuery(
1✔
368
                            "SELECT ST_AsText(ST_Envelope(geometry)) AS geom FROM jobs WHERE city = ? AND state = ? AND country = ? LIMIT 1;",
369
                            [
370
                                $city,
371
                                $state,
372
                                $country
373
                            ]
374
                        );
375

376
                        $geom = null;
1✔
377
                        #error_log("Found " . count($geo) . " {$city}, {$state}, {$country}");
378

379
                        if (count($geo))
1✔
380
                        {
381
                            $geom = $geo[0]['geom'];
×
382
                            $g = new \geoPHP();
×
383
                            $poly = $g::load($geom, 'wkt');
×
384
                            $bbox = $poly->getBBox();
×
385
                            $swlat = $bbox['miny'];
×
386
                            $swlng = $bbox['minx'];
×
387
                            $nelat = $bbox['maxy'];
×
388
                            $nelng = $bbox['maxx'];
×
389

390
                            #error_log("Got existing location $geom = $swlat,$swlng to $nelat, $nelng");
391
                        }
392
                    }
393

394
                    if (!$geom) {
1✔
395
                        # Try to geocode the address.
396
                        #
397
                        # We have location, city, state, and country.  We can ignore the country.
398
                        $badStates = [ 'not specified', 'united kingdom of great britain and northern ireland', 'united kingdom', 'uk', 'england', 'scotland', 'wales', 'home based' ];
1✔
399
                        if ($state && strlen(trim($state)) && !in_array(strtolower(trim($state)), $badStates)) {
1✔
400
                            # Sometimes the state is a region.  Sometimes it is a much more specific location. Sigh.
401
                            #
402
                            # So first, geocode the state; if we get a specific location we can use that.  If it's a larger
403
                            # location then we can use it as a bounding box.
404
                            #
405
                            # Geocoding gets confused by "Borough of", e.g. "Borough of Swindon".
406
                            $state = str_ireplace($state, 'Borough of ', '');
1✔
407

408
                            list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($job, FALSE, TRUE);
1✔
409

410
                            if ($area && $area < 0.05) {
1✔
411
                                # We have a small 'state', which must be an actual location.  Stop.
412
                                #error_log("Got small area {$state}");
413
                            } else if ($geom) {
1✔
414
                                # We have a large state, which is a plausible region. Use it as a bounding box for the
415
                                # city.
416
                                #error_log("Gecoded {$state} to $geom");
417
                                list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE, $swlat, $swlng, $nelat, $nelng);
×
418

419
                                if (!$geom) {
×
420
                                    #error_log("Failed to geocode {$city} in {$state}");
421
                                }
422
                            }
423
                        }
424
                    }
425

426
                    $badCities = [ 'not specified', 'null', 'home based', 'united kingdom', ', , united kingdom' ];
1✔
427
                    if (!$geom && $city && strlen(trim($city)) && !in_array(strtolower(trim($city)), $badCities)) {
1✔
428
                        # We have not managed anything from the state.  Try just the city.  This will lead to some being
429
                        # wrong, but that's life.
430
                        #error_log("State no use, use city {$city}");
431
                        list ($swlat, $swlng, $nelat, $nelng, $geom, $area) = Jobs::geocode($city, TRUE, FALSE);
1✔
432

433
                        if ($area > 50) {
1✔
434
                            #error_log("{$city} => $geom is too large at $area");
435
                            $geom = NULL;
×
436
                        }
437
                    }
438

439
                    if ($geom) {
1✔
440
                        #error_log("Geocoded {$city}, {$state} => $geom");
441

442
                        if (!$wascached) {
1✔
443
                            $geocache[$geokey] = [ $geom, $swlat, $swlng, $nelat, $nelng];
1✔
444
                        }
445

446
                        # Jobs tend to cluster, e.g. in cities.  When we are searching we expand a box from our current
447
                        # location until we overlap enough.  The clustering means we may suddenly match thousands of them,
448
                        # which is slow.  So instead of using the job location box as is, randomise to be a small box
449
                        # within the location box.  That way we will encounter some of the jobs sooner and hence have faster queries.
450
                        $newlat = $swlat + (mt_rand() / mt_getrandmax()) * ($nelat - $swlat);
1✔
451
                        $newlng = $swlng + (mt_rand() / mt_getrandmax()) * ($nelng - $swlng);
1✔
452
                        $swlng = $newlng - $distribute;
1✔
453
                        $swlat = $newlat - $distribute;
1✔
454
                        $nelat = $newlat + $distribute;
1✔
455
                        $nelng = $newlng + $distribute;
1✔
456
                        $geom = $distribute ? Utils::getBoxPoly($swlat, $swlng, $nelat, $nelng) : $geom;
1✔
457
                        #error_log("Modified loc to $geom");
458

459
                        # If the job already exists in the old table we want to preserve the id, because people
460
                        # might click on an old id from email.  If we don't find it then we'll use NULL, which will
461
                        # create a new auto-increment id.  We have to go to some effort to get the NULL in there -
462
                        # we want \N, but unquoted, so we put \N and then strip the quotes later.
463
                        #
464
                        # We could speed this up by sending out the job_reference rather than the id.
465
                        $id = "\N";
1✔
466

467
                        $oldids = $this->dbhr->preQuery("SELECT id FROM jobs WHERE job_reference = ?;", [
1✔
468
                            $jobid
469
                        ]);
470

471
                        foreach ($oldids as $oldid) {
1✔
472
                            $id = $oldid['id'];
×
473
                        }
474

475
                        try {
476
                            #error_log("Added job $jobid $id $geom");
477
                            $clickability = 0;
1✔
478
                            $title = NULL;
1✔
479

480
                            if ($jobtitle) {
1✔
481
                                $title = str_replace('&#39;', "'", html_entity_decode($jobtitle));
1✔
482
                                # Don't get clickability - we don't currently use it and it slows things down.
483
                                $clickability = 1;
1✔
484
                                #$clickability = $this->clickability(NULL, html_entity_decode($title), $maxish);
485
                            }
486

487
                            $body = NULL;
1✔
488

489
                            if ($description) {
1✔
490
                                # Truncate the body - we only display the body as a snippet to get people to click
491
                                # through.
492
                                #
493
                                # Fix up line endings which cause problems with fputcsv, and other weirdo characters
494
                                # that get mangled en route to us.  This isn't a simple UTF8 problem because the data
495
                                # comes from all over the place and is a bit of a mess.
496
                                $body = html_entity_decode($description);
1✔
497
                                $body = str_replace("\r\n", " ", $body);
1✔
498
                                $body = str_replace("\r", " ", $body);
1✔
499
                                $body = str_replace("\n", " ", $body);
1✔
500
                                $body = str_replace('–', '-', $body);
1✔
501
                                $body = str_replace('Â', '-', $body);
1✔
502
                                $body = substr($body, 0, 256);
1✔
503

504
                                # The truncation might happen to leave a \ at the end of the string, which would then
505
                                # escape the following quote added by fputcsv, and the world would explode in a ball of
506
                                # fire.  Add a space to avoid that.
507
                                $body .= ' ';
1✔
508
                            }
509

510
                            # Sometimes the geocode can end up with line breaks.
511
                            $geom = str_replace(["\n", "\r"], '', $geom);
1✔
512

513
                            # Write the job to CSV, ready for LOAD DATA INFILE later.
514
                            # location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
515
                            # mobile_friendly_apply, category, html_jobs, url, body, cpc, geometry, clickability,
516
                            # bodyhash, seenat
517
                            fputcsv($out, [
1✔
518
                                $id,
519
                                $location ? html_entity_decode($location) : NULL,
1✔
520
                                $title,
521
                                $city ? html_entity_decode($city) : NULL,
1✔
522
                                $state ? html_entity_decode($state) : NULL,
1✔
523
                                $zip ? html_entity_decode($zip) : NULL,
1✔
524
                                $country ? html_entity_decode($country) : NULL,
1✔
525
                                NULL,
526
                                $timeposted ? html_entity_decode($timeposted) : NULL,
1✔
527
                                $jobid ? html_entity_decode($jobid) : NULL,
1✔
528
                                $companyname ? html_entity_decode($companyname) : NULL,
1✔
529
                                NULL,
530
                                $category ? html_entity_decode($category) : NULL,
1✔
531
                                NULL,
532
                                $deeplink ? html_entity_decode($deeplink) : NULL,
1✔
533
                                $body,
534
                                $cpc ? html_entity_decode($cpc) : NULL,
1✔
535
                                $geom,
536
                                $clickability,
537
                                md5($body),
1✔
538
                                $now,
539
                                1
540
                            ]);
541

542
                            $new++;
1✔
543
                        } catch (\Exception $e) {
×
544
                            error_log("Failed to add {$jobtitle} $geom " . $e->getMessage());
1✔
545
                        }
546
                    } else {
547
                        #error_log("Couldn't geocode {$city}, {$state}");
548
                        $nogeocode++;
1✔
549
                    }
550
                }
551
            } else {
552
                $old++;
×
553
            }
554

555
            $count++;
1✔
556

557
            if ($count % 1000 === 0) {
1✔
558
                error_log(date("Y-m-d H:i:s", time()) . "...processing $count");
×
559
            }
560
        }
561

562
        fclose($out);
1✔
563

564
        # Now unquote any "\N" into \N.
565
        error_log(date("Y-m-d H:i:s", time()) . "...finished loop, split");
1✔
566
        $data = file_get_contents("$outputFile.tmp");
1✔
567
        $data = str_replace('"\N"', '\N', $data);
1✔
568
        file_put_contents($outputFile, $data);
1✔
569
        error_log(date("Y-m-d H:i:s", time()) . "...written file $outputFile");
1✔
570
    }
571

572
    public function prepareForLoadCSV() {
573
        error_log(date("Y-m-d H:i:s", time()) . "...DROP jobs_new");
1✔
574
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_new;");
1✔
575
        error_log(date("Y-m-d H:i:s", time()) . "...CREATE jobs_new");
1✔
576
        $this->dbhm->preExec("CREATE TABLE jobs_new LIKE jobs;");
1✔
577
        $this->dbhm->preExec("SET GLOBAL local_infile=1;");
1✔
578
    }
579

580
    public function loadCSV($csv) {
581
        # Percona cluster has non-standard behaviour for LOAD DATA INFILE, and commits are performed every 10K rows.
582
        # But even this seems to place too much load on cluster syncing, and can cause lockups.  So we split the CSV
583
        # into 1K rows and load each of them in turn.
584
        error_log(date("Y-m-d H:i:s", time()) . "...Split CSV file");
1✔
585
        system("cd /tmp/; rm feed-split*; split -1000 $csv feed-split");
1✔
586
        error_log(date("Y-m-d H:i:s", time()) . "...load files");
1✔
587
        foreach (glob('/tmp/feed-split*') as $fn) {
1✔
588
            set_time_limit(600);
1✔
589
            $this->dbhm->preExec("LOAD DATA LOCAL INFILE '$fn' INTO TABLE jobs_new
1✔
590
            CHARACTER SET latin1
591
            FIELDS TERMINATED BY ',' 
592
            OPTIONALLY ENCLOSED BY '\"' 
593
            LINES TERMINATED BY '\n'
594
            (id, location, title, city, state, zip, country, job_type, posted_at, job_reference, company,
595
             mobile_friendly_apply, category, html_jobs, url, body, cpc, @GEOM, clickability,
596
             bodyhash, seenat, visible) SET geometry = ST_GeomFromText(@GEOM, " . $this->dbhm->SRID() . ");");
1✔
597
            error_log(date("Y-m-d H:i:s", time()) . "...loaded file $fn");
1✔
598
        }
599
        error_log(date("Y-m-d H:i:s", time()) . "...finished file load");
1✔
600
    }
601

602
    public function deleteSpammyJobs() {
603
        # There are some "spammy" jobs which are posted with identical descriptions across the UK.  They feel scuzzy, so
604
        # remove them.
605
        $spamcount = 0;
1✔
606
        error_log(date("Y-m-d H:i:s", time()) . "Count spammy jobs");
1✔
607
        $spams = $this->dbhr->preQuery("SELECT COUNT(*) as count, title, bodyhash FROM jobs_new GROUP BY bodyhash HAVING count > 50 AND bodyhash IS NOT NULL;");
1✔
608

609
        error_log(date("Y-m-d H:i:s", time()) . "Delete spammy jobs");
1✔
610

611
        foreach ($spams as $spam) {
1✔
612
            error_log("Delete spammy job {$spam['title']} * {$spam['count']}");
×
613
            set_time_limit(600);
×
614
            $spamcount += $spam['count'];
×
615

616
            do {
617
                $this->dbhm->preExec("DELETE FROM jobs_new WHERE bodyhash = ? LIMIT 1;", [
×
618
                    $spam['bodyhash']
×
619
                ]);
620
            } while ($this->dbhm->rowsAffected());
×
621
        }
622
    }
623

624
    public function swapTables() {
625
        # We want to swap the jobs_new table with the jobs table, atomically.
626
        error_log(date("Y-m-d H:i:s", time()) . " Swap tables...");
1✔
627
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
628
        $this->dbhm->preExec("RENAME TABLE jobs TO jobs_old, jobs_new TO jobs;");
1✔
629
        $this->dbhm->preExec("DROP TABLE IF EXISTS jobs_old;");
1✔
630
        error_log(date("Y-m-d H:i:s", time()) . "...tables swapped...");
1✔
631
    }
632
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc