• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 19727593680

27 Nov 2025 06:44AM UTC coverage: 69.2% (-0.01%) from 69.211%
19727593680

push

github

web-flow
[CI] Bring back Psalm (#1226)

1 of 1 new or added line in 1 file covered. (100.0%)

2 existing lines in 2 files now uncovered.

1930 of 2789 relevant lines covered (69.2%)

3.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

79.95
/Classes/Controller/CrawlerController.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Controller;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Converter\JsonCompatibilityConverter;
24
use AOE\Crawler\Crawler;
25
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
26
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
27
use AOE\Crawler\Domain\Repository\ProcessRepository;
28
use AOE\Crawler\Domain\Repository\QueueRepository;
29
use AOE\Crawler\Event\AfterQueueItemAddedEvent;
30
use AOE\Crawler\Event\AfterUrlAddedToQueueEvent;
31
use AOE\Crawler\Event\BeforeQueueItemAddedEvent;
32
use AOE\Crawler\QueueExecutor;
33
use AOE\Crawler\Service\ConfigurationService;
34
use AOE\Crawler\Service\PageService;
35
use AOE\Crawler\Service\ProcessInstructionService;
36
use AOE\Crawler\Service\UrlService;
37
use AOE\Crawler\Value\QueueRow;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Database\ConnectionPool;
46
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
47
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
48
use TYPO3\CMS\Core\Imaging\IconFactory;
49
use TYPO3\CMS\Core\Imaging\IconSize;
50
use TYPO3\CMS\Core\Type\Bitmask\Permission;
51
use TYPO3\CMS\Core\Utility\DebugUtility;
52
use TYPO3\CMS\Core\Utility\GeneralUtility;
53
use TYPO3\CMS\Core\Utility\MathUtility;
54

55
/**
56
 * @package AOE\Crawler\Controller
57
 * @internal since v12.0.0
58
 */
59
class CrawlerController implements LoggerAwareInterface
60
{
61
    use LoggerAwareTrait;
62

63
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
64

65
    public int $setID = 0;
66
    public string $processID = '';
67
    public array $duplicateTrack = [];
68
    public array $downloadUrls = [];
69
    public array $incomingProcInstructions = [];
70
    public array $incomingConfigurationSelection = [];
71
    public bool $registerQueueEntriesInternallyOnly = false;
72
    public array $queueEntries = [];
73
    public array $urlList = [];
74
    public array $extensionSettings = [];
75

76
    /**
77
     * Mount Point
78
     */
79
    public ?string $MP = null;
80
    protected QueueRepository $queueRepository;
81
    protected ProcessRepository $processRepository;
82
    protected ConfigurationRepository $configurationRepository;
83
    protected QueueExecutor $queueExecutor;
84
    protected int $maximumUrlsToCompile = 10000;
85
    protected IconFactory $iconFactory;
86

87
    /**
88
     * @var BackendUserAuthentication|null
89
     */
90
    private $backendUser;
91
    private int $scheduledTime = 0;
92
    private int $reqMinute = 0;
93
    private bool $submitCrawlUrls = false;
94
    private bool $downloadCrawlUrls = false;
95
    private PageRepository $pageRepository;
96
    private ConfigurationService $configurationService;
97
    private UrlService $urlService;
98
    private EventDispatcher $eventDispatcher;
99

100
    /************************************
101
     *
102
     * Getting URLs based on Page TSconfig
103
     *
104
     ************************************/
105

106
    /**
     * Wires up the repositories and services used by this controller and
     * loads the extension settings.
     *
     * All collaborators are obtained via GeneralUtility::makeInstance().
     * After the settings are loaded, three values are normalized:
     * - countInARun falls back to 100 when it evaluates to 0,
     * - processLimit is forced into the range 1..99 (default 1),
     * - maxCompileUrls is forced into the range 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $strategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = GeneralUtility::makeInstance(QueueRepository::class);
        $this->processRepository = GeneralUtility::makeInstance(ProcessRepository::class);
        $this->configurationRepository = GeneralUtility::makeInstance(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->eventDispatcher = GeneralUtility::makeInstance(EventDispatcher::class);
        $this->queueExecutor = GeneralUtility::makeInstance(
            QueueExecutor::class,
            $strategyFactory,
            $this->eventDispatcher
        );
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // The instance is intentionally discarded; only the instantiation itself is wanted here.
        GeneralUtility::makeInstance(Crawler::class);

        $urlServiceForConfiguration = GeneralUtility::makeInstance(UrlService::class);
        $this->configurationService = GeneralUtility::makeInstance(
            ConfigurationService::class,
            $urlServiceForConfiguration,
            $this->configurationRepository
        );
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->extensionSettings = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)
            ->getExtensionConfiguration();

        // A run size of zero would never process anything, so use 100 instead.
        if ((int) $this->extensionSettings['countInARun'] === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $processLimit = MathUtility::forceIntegerInRange($this->extensionSettings['processLimit'], 1, 99, 1);
        $this->extensionSettings['processLimit'] = $processLimit;

        $maxCompileUrls = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'],
            1,
            1000000000,
            10000
        );
        $this->setMaximumUrlsToCompile($maxCompileUrls);
    }
145

146
    /**
     * Stores the upper limit of URLs to compile.
     *
     * The value is stored as given; no range check is applied here.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
150

151
    /**
     * Replaces the extension settings held by this controller
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the normalization done in the constructor
     * (countInARun, processLimit, maxCompileUrls) is not repeated here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
158

159
    /**
     * Validates a page row and delegates to getUrlsForPageId().
     * The result is an array of configurations, not a list of URLs.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason when the page is skipped; set to '' when it is not skipped.
     * @return array Configurations for the page, or an empty array when the page is skipped.
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, string &$skipMessage = ''): array
    {
        if (!isset($pageRow['uid'])) {
            $skipMessage = "pageRow['uid'] is missing";
            return [];
        }

        $uid = intval($pageRow['uid']);
        if ($uid === 0) {
            $skipMessage = 'PageUid "' . $pageRow['uid'] . '" was not an integer';
            return [];
        }

        // The page service returns false when the page may be crawled, otherwise the skip reason.
        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($uid);
        $skipMessage = '';

        return $configurations;
    }
190

191
    /**
     * Creates a list of URLs from the input array (and submits them to the queue if asked to).
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl (reads the keys 'URLs' and 'subCfg').
     * @param array $pageRow Page row; only 'uid' is read here.
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values below 1 are treated as 1 to avoid a division by zero.
     * @param bool $submitCrawlUrls If set, submits the URLs to the queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL to ensure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined with "<br>" (meant for display in the backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        int $scheduledTime,
        int $reqMinute,
        bool $submitCrawlUrls,
        bool $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ): string {
        // Guard the key access so a missing 'URLs' entry yields the error string instead of a warning.
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        $userGroups = $subCfg['userGroups'] ?? '';
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist(
            $pageId,
            $configurationHash
        );

        // The filter check does not depend on the individual URL, so evaluate it once.
        // When it fails, no URL would be listed and the result is the empty string.
        $processInstructionService = new ProcessInstructionService();
        if (!$processInstructionService->isAllowed($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; clamp to at least one request per minute
        // because 60 / 0 would throw a DivisionByZeroError.
        $secondsPerRequest = 60 / max(1, $reqMinute);

        foreach ($vv['URLs'] as $urlQuery) {
            $url = $this->urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                (int) ($subCfg['force_ssl'] ?? 0)
            );

            if (!$url instanceof UriInterface) {
                continue;
            }

            $url = (string) $url;

            // Create key by which to determine uniqueness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the URL key is already registered, just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Displayed schedule time: spread the entries by the interleave and round down to a full minute.
                $interleave = (int) round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intdiv($scheduledTime + $interleave, 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is scheduled with $scheduledTime, not the
                    // interleaved $schTime shown in the list. Kept as is; confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        // Todo: Find a better option to have this correct in both backend (<br>) and cli (<new line>)
        return implode('<br>', $urlLog);
    }
288

289
    /**
     * Tells whether the given processing instruction passes the incoming
     * processing instructions; delegates to ProcessInstructionService::isAllowed().
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     * @deprecated since 11.0.3 will be removed in v13.x
     */
    public function drawURLs_PIfilter(string $piString, array $incomingProcInstructions)
    {
        return (new ProcessInstructionService())->isAllowed($piString, $incomingProcInstructions);
    }
302

303
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the id
     * found after the "-" in that value instead of for $id.
     *
     * @param int $id Page id; also handed to the hooks as 'pageId'.
     * @return array The (possibly hook-modified) page TSconfig.
     */
    public function getPageTSconfigForId(int $id): array
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            [, $mountPointId] = explode('-', $this->MP);
            $tsConfigPageId = (int) $mountPointId;
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call hooks to alter the configuration; they receive the TSconfig by reference.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
327

328
    /**
     * Collects the crawler configurations that apply to a page.
     * Returns configurations only; no URLs are added to the queue.
     *
     * Sources, in order: page TSconfig, then the configuration service's
     * database lookup, then the 'processUrls' hooks, which receive the
     * result by reference.
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $mountPoint = $this->MP ?? '';

        // Start from an empty set and let each source add to it.
        $configurations = [];
        $configurations = $this->configurationService->getConfigurationFromPageTS(
            $pageTSconfig,
            $pageId,
            $configurations,
            $mountPoint
        );
        $configurations = $this->configurationService->getConfigurationFromDatabase($pageId, $configurations);

        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($hooks as $hook) {
            $params = [
                'res' => &$configurations,
            ];
            GeneralUtility::callUserFunction($hook, $params, $this);
        }

        return $configurations;
    }
355

356
    /**
     * Finds the names of all configurations available for a page and its subpages.
     * TODO: Write Functional Tests
     *
     * Names come from two places: the array-valued entries of the page
     * TSconfig "paramSets." (a trailing dot is stripped from the key), and the
     * 'name' of configuration records returned by the configuration repository
     * for the collected page ids.
     *
     * @param int $rootid Page id the branch starts at.
     * @param int $depth Depth passed to the page tree when collecting subpages.
     * @return array List of configuration names.
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = str_ends_with($setKey, '.') ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Collect the page ids of the rootline first, then those of the tree below the root page.
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
391

392
    /************************************
393
     *
394
     * Crawler log
395
     *
396
     ************************************/
397

398
    /**
     * Adds a call back entry to the queue table (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to the call back function; '_CALLBACKOBJ' is added to them
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now" (current time is used)
     *
     * @deprecated since 12.0.5 will be removed in 14.x
     */
    public function addQueueEntry_callBack(
        int $setId,
        array $params,
        string $callBack,
        int $page_id = 0,
        int $schedule = 0
    ): void {
        $params['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);

        $record = [
            'page_id' => $page_id,
            'parameters' => json_encode($params),
            'scheduled' => $schedule !== 0 ? $schedule : $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => $setId,
            'result_data' => '',
        ];

        $connection->insert(QueueRepository::TABLE_NAME, $record);
    }
431

432
    /************************************
433
     *
434
     * URL setting
435
     *
436
     ************************************/
437

438
    /**
     * Registers a URL for crawling.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either
     * only collected in $this->queueEntries or inserted into the queue table.
     * After a database insert an AfterUrlAddedToQueueEvent is dispatched.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); reads 'userGroups',
     *                      'procInstrFilter', 'procInstrParams.' and 'key'
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool True only when a new row was inserted into the queue table.
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? false)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters) ?: '';
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => md5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            // Guarded read: a sub configuration without 'key' yields null instead of a warning.
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database.
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            // Check if there is already an equal entry.
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                // Settings may have been replaced via setExtensionSettings(); treat a missing flag as disabled.
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if ($duplicates !== []) {
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(
            QueueRepository::TABLE_NAME
        );
        $connectionForCrawlerQueue->insert(QueueRepository::TABLE_NAME, $fieldArray);
        $uid = $connectionForCrawlerQueue->lastInsertId();

        $this->eventDispatcher->dispatch(new AfterUrlAddedToQueueEvent($uid, $fieldArray));

        return true;
    }
520

521
    /**
     * Returns the current system time as a Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
530

531
    /************************************
532
     *
533
     * URL reading
534
     *
535
     ************************************/
536

537
    /**
     * Reads the URL for a single queue entry.
     *
     * Steps: load the queue record, dispatch BeforeQueueItemAddedEvent, write
     * exec_time and process_id_completed to the queue row, execute the item
     * and finally dispatch AfterQueueItemAddedEvent with the JSON-encoded result.
     *
     * @param int $queueId Queue entry id (qid)
     * @param bool $force If set, will process even if exec_time has been set!
     * @param string $processId Stored in the 'process_id_completed' column of the queue row
     *
     * @return int|null Null when no queue record was found; otherwise 0, or
     *                  self::CLI_STATUS_POLLABLE_PROCESSED when a registered pollable reported success.
     */
    public function readUrl(int $queueId, bool $force = false, string $processId = '')
    {
        $ret = 0;
        $this->logger?->debug('crawler-readurl start ' . microtime(true));

        $queueRec = $this->queueRepository->getQueueEntriesByQid($queueId, $force);

        if (!is_array($queueRec)) {
            return null;
        }

        $event = $this->eventDispatcher->dispatch(new BeforeQueueItemAddedEvent($queueId, $queueRec));
        $queueRec = $event->getQueueRecord();

        // Set exec_time to lock record:
        $lockFields = [
            'exec_time' => $this->getCurrentTime(),
            'process_id_completed' => $processId,
        ];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(QueueRepository::TABLE_NAME, $lockFields, [
                'qid' => $queueId,
            ]);

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Only a result that is not the 'ERROR' marker and carries content can report pollable success.
        if ($result !== 'ERROR' && ($result['content'] ?? null) !== null) {
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);

            // At the moment there is no need to point to specific pollable extensions.
            $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            // Guarded read: result data without 'parameters'/'procInstructions' simply has nothing to poll.
            $procInstructions = is_array($resultData)
                ? ($resultData['parameters']['procInstructions'] ?? null)
                : null;

            if (is_array($pollables) && is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // Only check the success value if the instruction is running.
                    // It is important to name the pollSuccess key the same as the procInstructions key.
                    if (
                        in_array($pollable, $procInstructions, true)
                        && !empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = [
            'result_data' => json_encode($result),
        ];

        $this->eventDispatcher->dispatch(new AfterQueueItemAddedEvent($queueId, $field_array));
        $this->logger?->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
606

607
    /**
     * Reads the URL for a queue entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set to the current time, executed
     * right away, and an AfterQueueItemAddedEvent carrying the JSON-encoded
     * result is dispatched afterwards.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|mixed|string The value returned by the queue executor.
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

        // The freshly created id is made part of the record handed to the executor.
        $queueId = $queueConnection->lastInsertId();
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = [
            'result_data' => json_encode($result),
        ];
        $this->eventDispatcher->dispatch(new AfterQueueItemAddedEvent($queueId, $resultFields));

        return $result;
    }
634

635
    /*****************************
636
     *
637
     * Compiling URLs to crawl - tools
638
     *
639
     *****************************/
640

641
    /**
     * Builds the page tree with URLs, e.g. for the Backend Log Module.
     *
     * The arguments are stored on the controller, the duplicate and download
     * tracking arrays are reset, and every page of the tree (including the
     * pages mounted below mount point pages) is passed to drawURLs_addRowsForPage().
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return array Rows collected from drawURLs_addRowsForPage() for all visited pages.
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // The parameters are untyped while the properties are typed; cast explicitly so
        // numeric strings do not trigger a TypeError under strict_types.
        $this->scheduledTime = (int) $scheduledTime;
        $this->reqMinute = (int) $reqMinute;
        $this->submitCrawlUrls = (bool) $submitCrawlUrls;
        $this->downloadCrawlUrls = (bool) $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, IconSize::SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Collect the rows per page and merge once at the end instead of re-merging on every iteration.
        $rowBatches = [];

        // Traverse page tree:
        foreach ($tree->tree as $data) {
            $this->MP = null;

            // Recognize mount points. The doktype is cast because a record may carry it as a numeric string.
            if ((int) ($data['row']['doktype'] ?? 0) === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // Fetch mounted pages
                $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
                $mountTree->getTree($mountpage['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $rowBatches[] = $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        BackendUtility::getRecordTitle('pages', $mountData['row'], true),
                        (string) $data['HTML']
                    );
                }

                // Replace page when mount_pid_ol is enabled
                if ($mountpage['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage['mount_pid'];
                } else {
                    // If mount_pid_ol is not set, the MP must not be used for the mount point page itself.
                    $this->MP = null;
                }
            }

            $rowBatches[] = $this->drawURLs_addRowsForPage(
                $data['row'],
                BackendUtility::getRecordTitle('pages', $data['row'], true),
                (string) $data['HTML']
            );
        }

        return array_merge([], ...$rowBatches);
    }
736

737
    /**
     * Builds the QueueRow entries displayed for one page of the page tree.
     *
     * One row is created per crawler configuration that remains after the
     * configuration selection has been applied; only the first of these rows
     * carries the page title. When no configuration remains, a single row
     * holding the optional skip message is returned instead.
     *
     * @param array $pageRow Page record; its 'uid' is matched against each configuration's exclude list
     * @param string $pageTitle Title put on the first row of the page
     * @param string $pageTitleHTML HTML title set on every row
     * @return array List of QueueRow objects
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle, string $pageTitleHTML = ''): array
    {
        // Read back below; presumably filled by getUrlsForPageRow() via reference
        $skipMessage = '';

        // Get the configurations of the page, then drop the ones that are not selected
        $pageConfigurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $pageConfigurations = ConfigurationService::removeDisallowedConfigurations(
            $this->incomingConfigurationSelection,
            $pageConfigurations
        );

        $rows = [];

        if (empty($pageConfigurations)) {
            // No configuration left: emit one row with the skip reason, if there is one
            $row = new QueueRow($pageTitle);
            $row->setPageTitleHTML($pageTitleHTML);
            $row->setMessage(empty($skipMessage) ? '' : ' (' . $skipMessage . ')');
            $rows[] = $row;
        } else {
            $isFirstRow = true;
            foreach ($pageConfigurations as $configurationKey => $configuration) {
                // Title column: only the first row of the page shows the title
                $row = $isFirstRow ? new QueueRow($pageTitle) : new QueueRow();
                $row->setPageTitleHTML($pageTitleHTML);

                $excludedPageIds = $this->configurationService->expandExcludeString(
                    $configuration['subCfg']['exclude'] ?? ''
                );
                if (in_array($pageRow['uid'], $excludedPageIds, true)) {
                    $row->setConfigurationKey($configurationKey);
                    $row->setMessage('(Page is excluded in this configuration)');
                } else {
                    // URL list
                    $urlList = $this->urlListFromUrlArray(
                        $configuration,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters: one table row per GET variable plus the combination count
                    $paramExpanded = '';
                    $valueCounts = [];
                    $combinationTotal = 1;
                    foreach ($configuration['paramExpanded'] as $parameterName => $parameterValues) {
                        $valueCount = count($parameterValues);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $parameterName . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(
                                htmlspecialchars(implode(chr(10), $parameterValues))
                            ) . '</td>
                            </tr>
                        ';
                        $combinationTotal *= $valueCount;
                        $valueCounts[] = $valueCount;
                    }
                    if ($paramExpanded !== '') {
                        $paramExpanded = "<table>{$paramExpanded}</table>";
                    }
                    $paramExpanded .= 'Comb: ' . implode('*', $valueCounts) . '=' . $combinationTotal;

                    // Options: listed only when the sub configuration holds a truthy value
                    $options = [];
                    $optionLabels = [
                        'userGroups' => 'User Groups: ',
                        'procInstrFilter' => 'ProcInstr: ',
                    ];
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if ($configuration['subCfg'][$optionKey] ?? false) {
                            $options[] = $optionLabel . $configuration['subCfg'][$optionKey];
                        }
                    }

                    // Parsed parameters as a query string, one "&name=value" pair per line
                    $queryString = GeneralUtility::implodeArrayForUrl('', $configuration['paramParsed'] ?? []);
                    $pairPerLine = trim(str_replace('&', chr(10) . '&', $queryString));
                    $parameterConfig = nl2br(htmlspecialchars(rawurldecode($pairPerLine)));

                    $row->setValuesExpanded($paramExpanded);
                    $row->setConfigurationKey($configurationKey);
                    $row->setUrls($urlList);
                    $row->setOptions($options);
                    $row->setParameters(DebugUtility::viewArray($configuration['subCfg']['procInstrParams.'] ?? []));
                    $row->setParameterConfig($parameterConfig);
                }
                $rows[] = $row;

                $isFirstRow = false;
            }
        }

        return $rows;
    }
851

852
    /**
     * Computes the md5 hash of the serialized configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are left out of the hashed data.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = [
            'paramExpanded' => true,
            'URLs' => true,
        ];
        $hashedConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashedConfiguration));
    }
863

864
    /**
     * Creates a new PageService instance on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
868

869
    /**
     * Provides the backend user, storing $GLOBALS['BE_USER'] on the instance
     * the first time no user has been stored yet.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // Keep an already stored user; otherwise fall back to the global one
        $this->backendUser ??= $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
881
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc