• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 3696698329

pending completion
3696698329

Pull #988

github

GitHub
Merge d1d32bd2d into 3f94d6a4e
Pull Request #988: [WIP][FEATURE] Setup new Backend Module

417 of 417 new or added lines in 9 files covered. (100.0%)

1601 of 2523 relevant lines covered (63.46%)

3.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.74
/Classes/Controller/CrawlerController.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Controller;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Converter\JsonCompatibilityConverter;
24
use AOE\Crawler\Crawler;
25
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
26
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
27
use AOE\Crawler\Domain\Repository\ProcessRepository;
28
use AOE\Crawler\Domain\Repository\QueueRepository;
29
use AOE\Crawler\Event\AfterQueueItemAddedEvent;
30
use AOE\Crawler\Event\AfterUrlAddedToQueueEvent;
31
use AOE\Crawler\Event\BeforeQueueItemAddedEvent;
32
use AOE\Crawler\QueueExecutor;
33
use AOE\Crawler\Service\ConfigurationService;
34
use AOE\Crawler\Service\PageService;
35
use AOE\Crawler\Service\UrlService;
36
use AOE\Crawler\Value\QueueRow;
37
use PDO;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Database\ConnectionPool;
46
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
47
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
48
use TYPO3\CMS\Core\Imaging\Icon;
49
use TYPO3\CMS\Core\Imaging\IconFactory;
50
use TYPO3\CMS\Core\Type\Bitmask\Permission;
51
use TYPO3\CMS\Core\Utility\DebugUtility;
52
use TYPO3\CMS\Core\Utility\GeneralUtility;
53
use TYPO3\CMS\Core\Utility\MathUtility;
54

55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController implements LoggerAwareInterface
61
{
62
    use LoggerAwareTrait;
63

64
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
65

66
    public int $setID = 0;
67
    public string $processID = '';
68
    public array $duplicateTrack = [];
69
    public array $downloadUrls = [];
70
    public array $incomingProcInstructions = [];
71
    public array $incomingConfigurationSelection = [];
72
    public bool $registerQueueEntriesInternallyOnly = false;
73
    public array $queueEntries = [];
74
    public array $urlList = [];
75
    public array $extensionSettings = [];
76

77
    /**
     * Mount point identifier of the page that is currently being processed.
     *
     * Holds either false (no mount point) or a string of the form "<mount_pid>-<uid>",
     * which is how getPageTreeAndUrls() assembles it and how getPageTSconfigForId() /
     * getUrlsForPageId() read it.
     *
     * The property intentionally has no native type: a native "bool" type rejects the
     * string form with a TypeError under strict_types, and a union type would require PHP 8.0.
     *
     * @var string|bool
     */
    public $MP = false;
82
    protected QueueRepository $queueRepository;
83
    protected ProcessRepository $processRepository;
84
    protected ConfigurationRepository $configurationRepository;
85
    protected QueueExecutor $queueExecutor;
86
    protected int $maximumUrlsToCompile = 10000;
87
    protected IconFactory $iconFactory;
88

89
    /**
90
     * @var BackendUserAuthentication|null
91
     */
92
    private $backendUser;
93
    private int $scheduledTime = 0;
94
    private int $reqMinute = 0;
95
    private bool $submitCrawlUrls = false;
96
    private bool $downloadCrawlUrls = false;
97
    private PageRepository $pageRepository;
98
    private Crawler $crawler;
99
    private ConfigurationService $configurationService;
100
    private UrlService $urlService;
101
    private EventDispatcher $eventDispatcher;
102

103
    /************************************
104
     *
105
     * Getting URLs based on Page TSconfig
106
     *
107
     ************************************/
108

109
    /**
     * Creates all collaborators through GeneralUtility::makeInstance() and normalises the
     * extension settings.
     *
     * The settings "countInARun", "processLimit" and "maxCompileUrls" are read with a fallback,
     * so a configuration array that lacks one of these keys yields the default value instead of
     * an "undefined array key" warning.
     */
    public function __construct()
    {
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = GeneralUtility::makeInstance(QueueRepository::class);
        $this->processRepository = GeneralUtility::makeInstance(ProcessRepository::class);
        $this->configurationRepository = GeneralUtility::makeInstance(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->eventDispatcher = GeneralUtility::makeInstance(EventDispatcher::class);
        $this->queueExecutor = GeneralUtility::makeInstance(
            QueueExecutor::class,
            $crawlStrategyFactory,
            $this->eventDispatcher
        );
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // A missing or non-positive "countInARun" falls back to 100.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // Clamp the process limit to 1..99; a missing value resolves to the default of 1.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );

        // Clamp the compile limit to 1..1000000000; a missing value resolves to the default of 10000.
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 10000,
                1,
                1000000000,
                10000
            )
        );
    }
146

147
    /**
     * Replaces the value stored in $maximumUrlsToCompile.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
151

152
    /**
     * Replaces the complete extension settings array
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; no defaults are applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
159

160
    /**
     * Wrapper around getUrlsForPageId() that first decides whether the page is skipped.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason when the page is skipped, otherwise an empty string.
     * @return array Configurations for the page; empty when the page is skipped.
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, string &$skipMessage = ''): array
    {
        $uid = (int) $pageRow['uid'];
        if ($uid === 0) {
            $skipMessage = 'PageUid "' . $pageRow['uid'] . '" was not an integer';
            return [];
        }

        // The page service answers false when the page may be processed, otherwise a message.
        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($uid);
        $skipMessage = '';

        return $configurations;
    }
186

187
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; contains one key per URL so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        int $scheduledTime,
        int $reqMinute,
        bool $submitCrawlUrls,
        bool $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ): string {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist(
            $pageId,
            $configurationHash
        );

        $urlService = new UrlService();

        // The processing-instruction filter depends only on the configuration and the incoming
        // instructions, not on the individual URL, so it is evaluated once. A rejected
        // configuration produces no log entries, hence the empty string.
        if (! $this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'] ?? '', $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two consecutive requests. A rate of 0 means "no interleave" and must
        // not end in a DivisionByZeroError.
        $secondsPerRequest = $reqMinute === 0 ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            if (! $url instanceof UriInterface) {
                continue;
            }

            $url = (string) $url;

            // Create key by which to determine uniqueness:
            $uKey = $url . '|' . ($vv['subCfg']['userGroups'] ?? '') . '|' . ($vv['subCfg']['procInstrFilter'] ?? '');

            if (isset($duplicateTrack[$uKey])) {
                // If the URL key is already registered, just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        // Todo: Find a better option to have this correct in both backend (<br>) and cli (<new line>)
        return implode('<br>', $urlLog);
    }
281

282
    /**
     * Tells whether a configuration's processing-instruction list is accepted.
     *
     * Without incoming processing instructions everything is accepted; otherwise at least one
     * of the incoming instructions has to be contained in the comma separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter(string $piString, array $incomingProcInstructions)
    {
        // An empty instruction list accepts everything; the loop below does not run in that case.
        $isAccepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isAccepted = true;
                break;
            }
        }

        return $isAccepted;
    }
302

303
    /**
     * Returns the page TSconfig for a page, optionally altered by hooks.
     *
     * When $this->MP holds a mount point string of the form "<mount_pid>-<uid>", the TSconfig
     * of the page id after the dash is loaded instead of $id. Any other value of $this->MP
     * (false, or a string without a dash) falls back to $id.
     *
     * Functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive the
     * parameters 'pageId' and 'pageTSConfig' (by reference) and may modify the result.
     *
     * @param int $id Page id
     * @return array Page TSconfig
     */
    public function getPageTSconfigForId(int $id): array
    {
        $tsConfigPageId = $id;

        // Only a real mount point string can be split; guarding with is_string() avoids calling
        // explode() on a boolean, which is a TypeError under strict_types.
        if (is_string($this->MP) && $this->MP !== '') {
            $mountPointParts = explode('-', $this->MP);
            if (isset($mountPointParts[1])) {
                $tsConfigPageId = (int) $mountPointParts[1];
            }
        }

        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        if (
            isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])
        ) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
328

329
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls!
     *
     * Configurations come from page TSconfig first, then from the configuration service's
     * database lookup, and may finally be altered by functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'].
     *
     * @param int $pageId Page id
     * @return array Configurations
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        // Only a string mount point is handed on; any other value becomes an empty string.
        $mountPoint = '';
        if (is_string($this->MP)) {
            $mountPoint = $this->MP;
        }

        $configurations = [];

        // Fetch Crawler Configuration from pageTSConfig
        $configurations = $this->configurationService->getConfigurationFromPageTS(
            $pageTSconfig,
            $pageId,
            $configurations,
            $mountPoint
        );

        // Get configuration from tx_crawler_configuration records up the rootline
        $configurations = $this->configurationService->getConfigurationFromDatabase($pageId, $configurations);

        $processUrlHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($processUrlHooks as $hookFunction) {
            // Hooks receive the result by reference under the key 'res'.
            $hookParameters = [
                'res' => &$configurations,
            ];
            GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
        }

        return $configurations;
    }
356

357
    /**
     * Find all configurations of subpages of a page
     * TODO: Write Functional Tests
     *
     * The result contains the names of the page TSconfig paramSets of the root page (without the
     * trailing dot) followed by the names of the configuration records returned by the
     * configuration repository for the root line and the page tree below $rootid.
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Depth passed to the page tree
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Strip a single trailing dot from the TypoScript key.
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the page ids of the tree below the root page.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeItem) {
            $pageIds[] = $treeItem['row']['uid'];
        }

        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($records as $record) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
393

394
    /************************************
395
     *
396
     * Crawler log
397
     *
398
     ************************************/
399

400
    /**
     * Adds a call back entry to the queue table (called from hooks typically, see indexed search
     * class "class.crawler.php").
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by []
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);

        // A schedule of 0 falls back to "now".
        $scheduled = (int) $schedule;
        if ($scheduled === 0) {
            $scheduled = $this->getCurrentTime();
        }

        $connection->insert(
            QueueRepository::TABLE_NAME,
            [
                'page_id' => (int) $page_id,
                'parameters' => json_encode($parameters),
                'scheduled' => $scheduled,
                'exec_time' => 0,
                'set_id' => (int) $setId,
                'result_data' => '',
            ]
        );
    }
429

430
    /************************************
431
     *
432
     * URL setting
433
     *
434
     ************************************/
435

436
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally
     * ($registerQueueEntriesInternallyOnly) or inserts it into the queue table, unless an equal
     * entry is reported by the queue repository. After a successful insert an
     * AfterUrlAddedToQueueEvent is dispatched.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool True only when a new row was written to the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? false)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => md5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry. The settings array can be replaced as a
            // whole through setExtensionSettings(), so a missing "enableTimeslot" counts as disabled.
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if ($rows === []) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(
                QueueRepository::TABLE_NAME
            );
            $connectionForCrawlerQueue->insert(QueueRepository::TABLE_NAME, $fieldArray);
            $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
            $urlAdded = true;

            $this->eventDispatcher->dispatch(new AfterUrlAddedToQueueEvent($uid, $fieldArray));
        }

        return $urlAdded;
    }
518

519
    /**
     * Returns the current Unix timestamp as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        return time();
    }
528

529
    /************************************
530
     *
531
     * URL reading
532
     *
533
     ************************************/
534

535
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being executed (exec_time), lets the queue executor
     * process it and finally stores the JSON encoded result in result_data. Events are dispatched
     * before the execution (BeforeQueueItemAddedEvent) and before the result is written
     * (AfterQueueItemAddedEvent).
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @param string $processId Id of the process handling the entry; stored when not empty
     *
     * @return int|null Bitmask (0 or CLI_STATUS_POLLABLE_PROCESSED); null when no matching queue record exists
     */
    public function readUrl($queueId, $force = false, string $processId = '')
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(
            QueueRepository::TABLE_NAME
        );
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that are not yet executed but already scheduled to a process qualify.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        /** @var BeforeQueueItemAddedEvent $event */
        $event = $this->eventDispatcher->dispatch(new BeforeQueueItemAddedEvent((int) $queueId, $queueRec));
        $queueRec = $event->getQueueRecord();

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (! empty($processId)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $processId;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(QueueRepository::TABLE_NAME, $field_array, ['qid' => (int) $queueId]);

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Only a result carrying content is inspected for pollable processing instructions.
        if ($result !== 'ERROR' && ($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);

            // atm there's no need to point to specific pollable extensions
            if (
                is_array($resultData)
                && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
                && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ) {
                // Read defensively: the decoded content is not guaranteed to carry these keys.
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;

                foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                    // Only check the success value if the instruction is running.
                    // It is important to name the pollSuccess key same as the procInstructions key.
                    if (
                        is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        /** @var AfterQueueItemAddedEvent $event */
        $event = $this->eventDispatcher->dispatch(new AfterQueueItemAddedEvent($queueId, $field_array));
        $field_array = $event->getFieldArray();

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(QueueRepository::TABLE_NAME, $field_array, ['qid' => (int) $queueId]);

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
623

624
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given record into the queue table with exec_time set, executes it right away
     * and writes the JSON encoded result back to the inserted row. An AfterQueueItemAddedEvent
     * is dispatched before the result is stored and may alter the stored fields.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

        // The executor receives the record including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
        $field_array['qid'] = $queueId;
        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        /** @var AfterQueueItemAddedEvent $event */
        $event = $this->eventDispatcher->dispatch(new AfterQueueItemAddedEvent($queueId, $resultFields));
        $resultFields = $event->getFieldArray();

        $queueConnection->update(QueueRepository::TABLE_NAME, $resultFields, ['qid' => $queueId]);

        return $result;
    }
653

654
    /*****************************
655
     *
656
     * Compiling URLs to crawl - tools
657
     *
658
     *****************************/
659

660
    /**
     * This draws the pageTree with URLs for e.g the Backend Log Module
     *
     * The scalar arguments are cast before they are stored, because the backing properties are
     * typed and this file runs with strict_types (an uncast string would raise a TypeError).
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return array Rows as returned by drawURLs_addRowsForPage() for every page of the tree
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = (int) $scheduledTime;
        $this->reqMinute = (int) $reqMinute;
        $this->submitCrawlUrls = (bool) $submitCrawlUrls;
        $this->downloadCrawlUrls = (bool) $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        $queueRows = [];

        // Traverse page tree:
        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points.
            // NOTE(review): the strict comparison only matches when the row delivers doktype as
            // an integer — confirm for the database driver in use.
            if ($data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // PageRepository::getPage() returns a single page row. A list of rows is still
                // accepted, so both shapes resolve to the mount point's page record.
                $mountRow = $mountpage[0] ?? $mountpage;

                // fetch mounted pages
                $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
                $mountTree->getTree($mountRow['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $queueRows = array_merge($queueRows, $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        BackendUtility::getRecordTitle('pages', $mountData['row'], true),
                        (string) $data['HTML']
                    ));
                }

                // replace page when mount_pid_ol is enabled
                if ($mountRow['mount_pid_ol']) {
                    $data['row']['uid'] = $mountRow['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $queueRows = array_merge($queueRows, $this->drawURLs_addRowsForPage(
                $data['row'],
                BackendUtility::getRecordTitle('pages', $data['row'], true),
                (string) $data['HTML']
            ));
        }

        return $queueRows;
    }
756

757
    /**
     * Create the rows for display of the page tree.
     *
     * For each page a number of rows are shown displaying GET variable configuration:
     * one QueueRow per configuration that remains after the incoming configuration
     * selection has been applied. When no configuration remains, a single row
     * carrying the (optional) skip message is returned instead.
     *
     * @param array $pageRow Page record; its `uid` is matched against each configuration's exclude list
     * @param string $pageTitle Title given to the first row created for the page
     * @param string $pageTitleHTML Value handed to setPageTitleHTML() on every row
     *
     * @return array List of QueueRow objects
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle, string $pageTitleHTML = ''): array
    {
        // NOTE(review): presumably filled by reference inside getUrlsForPageRow() - confirm against its signature
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations(
            $this->incomingConfigurationSelection,
            $configurations
        );

        $queueRowCollection = [];

        if (! empty($configurations)) {
            // Only the first row of a page carries the page title
            $isFirstRow = true;

            foreach ($configurations as $confKey => $confArray) {
                // Title column:
                $queueRow = $isFirstRow ? new QueueRow($pageTitle) : new QueueRow();
                $queueRow->setPageTitleHTML($pageTitleHTML);
                $isFirstRow = false;

                $excludedPageIds = $this->configurationService->expandExcludeString(
                    $confArray['subCfg']['exclude'] ?? ''
                );

                if (! in_array($pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    // 'paramExpanded' is treated as optional, like the other configuration keys read here;
                    // a missing key simply yields an empty table instead of a PHP warning.
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(
                                htmlspecialchars(implode(chr(10), $gVal))
                            ) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    // Number of combinations: product of the value counts of all parameters
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (every entry is a non-empty string, so no filtering is needed afterwards)
                    $queueRowOptionCollection = [];
                    if ($confArray['subCfg']['userGroups'] ?? false) {
                        $queueRowOptionCollection[] = 'User Groups: ' . $confArray['subCfg']['userGroups'];
                    }
                    if ($confArray['subCfg']['procInstrFilter'] ?? false) {
                        $queueRowOptionCollection[] = 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'];
                    }

                    // One "&name=value" pair per line, decoded and escaped for HTML output
                    $parameterConfig = nl2br(
                        htmlspecialchars(rawurldecode(
                            trim(str_replace(
                                '&',
                                chr(10) . '&',
                                GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
                            ))
                        ))
                    );

                    $queueRow->setValuesExpanded($paramExpanded);
                    $queueRow->setConfigurationKey($confKey);
                    $queueRow->setUrls($urlList);
                    $queueRow->setOptions($queueRowOptionCollection);
                    $queueRow->setParameters(DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? []));
                    $queueRow->setParameterConfig($parameterConfig);
                } else {
                    $queueRow->setConfigurationKey($confKey);
                    $queueRow->setMessage('(Page is excluded in this configuration)');
                }

                $queueRowCollection[] = $queueRow;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';
            $queueRow = new QueueRow($pageTitle);
            $queueRow->setPageTitleHTML($pageTitleHTML);
            $queueRow->setMessage($message);
            $queueRowCollection[] = $queueRow;
        }

        return $queueRowCollection;
    }
875

876
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are removed from the (by-value) copy
     * before it is serialized, so they never influence the resulting hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        foreach (['paramExpanded', 'URLs'] as $ignoredKey) {
            unset($configuration[$ignoredKey]);
        }

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
887

888
    /**
     * Provides a PageService; a new instance is constructed on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
892

893
    /**
     * Returns the backend user, reading it from $GLOBALS['BE_USER'] the first
     * time and reusing the stored reference on later calls.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded (runs on every call, before the cached value is consulted)
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
905
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc