• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 17511552440

06 Sep 2025 07:12AM UTC coverage: 68.713% (+0.1%) from 68.597%
17511552440

push

github

tomasnorre
[FEATURE] Expose error when crawl response has not X-T3Crawler-Meta header

8 of 8 new or added lines in 1 file covered. (100.0%)

7 existing lines in 1 file now uncovered.

1880 of 2736 relevant lines covered (68.71%)

3.26 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.1
/Classes/Command/BuildQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Event\InvokeQueueChangeEvent;
28
use AOE\Crawler\Utility\MessageUtility;
29
use AOE\Crawler\Value\QueueRow;
30
use Symfony\Component\Console\Command\Command;
31
use Symfony\Component\Console\Helper\ProgressBar;
32
use Symfony\Component\Console\Input\InputArgument;
33
use Symfony\Component\Console\Input\InputInterface;
34
use Symfony\Component\Console\Input\InputOption;
35
use Symfony\Component\Console\Output\OutputInterface;
36
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
37
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Core\Utility\MathUtility;
40

41
/**
42
 * @internal since v12.0.0
43
 */
44
class BuildQueueCommand extends Command
45
{
46
    /**
     * Receives the command's collaborators as promoted, readonly constructor properties
     * and then calls the parent Command constructor without passing an explicit name.
     */
    public function __construct(
47
        private readonly JsonCompatibilityConverter $jsonCompatibilityConverter,
48
        private readonly EventDispatcher $eventDispatcher,
49
        private readonly QueueRepository $queueRepository,
50
        private readonly PageRepository $pageRepository,
51
        private readonly CrawlerController $crawlerController,
52
    ) {
53
        parent::__construct();
6✔
54
    }
55

56
    /**
     * Declares the command's description, help text, arguments and options.
     *
     * Arguments (both required): "page" and "conf".
     * Options: "depth" (shortcut "d", default '0'), "mode" (shortcut "m", no default)
     * and "number" (no shortcut, default '0').
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument('page', InputArgument::REQUIRED, 'The page from where the queue building should start');

        $this->addArgument('conf', InputArgument::REQUIRED, 'A comma separated list of crawler configurations');

        // The previous description was a broken literal that printed a stray `', "` and
        // referred to a "page_id" argument; the argument is actually called "page".
        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the given page to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }
103

104
    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int Command::FAILURE when the "page" argument is 0 or the page repository returns
     *             nothing for it, Command::SUCCESS otherwise
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        // No mode given: fall back to "queue".
        $mode = $input->getOption('mode') ?? 'queue';

        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();

        if ($mode === 'exec') {
            $this->crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0 || empty($this->pageRepository->getPage($pageId))) {
            $message = "Page {$pageId} is not a valid page, please check you root page id and try again.";
            MessageUtility::addErrorMessage($message);
            $output->writeln("<info>{$message}</info>");
            return Command::FAILURE;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        // Only the modes that actually change the queue announce a queue change.
        if ($mode === 'queue' || $mode === 'exec') {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            $this->eventDispatcher->dispatch(new InvokeQueueChangeEvent($reason));
        }

        // Guarded lookup: a missing setting must not raise an "Undefined array key" warning;
        // when the key is present the truthiness check is the same as before.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $this->queueRepository->cleanUpOldQueueEntries();
        }

        $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        $queueRows = $this->getQueueRows($pageId, $input, $mode, $configurationKeys);

        // Any mode other than url/exec/queue ends up in the default arm.
        match ($mode) {
            'url' => $output->writeln(
                '<info>' . implode(PHP_EOL, $this->crawlerController->downloadUrls) . PHP_EOL . '</info>'
            ),
            'exec' => $this->outputModeExec($output, $queueRows),
            'queue' => $this->outputModeQueue($output, $queueRows),
            default => $this->outputModeDefault($output, $queueRows),
        };

        return Command::SUCCESS;
    }
168

169
    /**
     * Turns the raw "conf" CLI argument into a list of configuration keys.
     *
     * A blank argument (empty or whitespace only) gives an empty array; anything else is
     * split on commas through GeneralUtility::trimExplode().
     */
    private function getConfigurationKeys(string $conf): array
    {
        $trimmedConf = trim($conf);
        if ($trimmedConf === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $trimmedConf);
    }
177

178
    /**
     * Writes one line per queue row: a "not added to queue" notice when the row carries a
     * message, otherwise the row's URLs.
     *
     * NOTE(review): the notice appends the row message directly after "is not added to queue"
     * with no separator - confirm the messages are meant to be joined that way.
     */
    private function outputUrls(array $queueRows, OutputInterface $output): void
    {
        /** @var QueueRow $queueRow */
        foreach ($queueRows as $queueRow) {
            if (!empty($queueRow->message)) {
                $output->writeln(
                    '<comment>Page "' . $queueRow->pageTitle . '" is not added to queue' . $queueRow->message . '</comment>'
                );
                continue;
            }

            $output->writeln('<info>' . $queueRow->urls . '</info>');
        }
    }
191

192
    /**
     * Output for any mode without a dedicated handler: prints how many entries the crawler
     * controller holds in its URL list, then lists the queue rows.
     */
    private function outputModeDefault(OutputInterface $output, array $queueRows): void
    {
        $entryCount = count($this->crawlerController->urlList);

        $output->writeln(
            '<info>' . $entryCount . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL
        );
        $this->outputUrls($queueRows, $output);
    }
201

202
    /**
     * Output for mode "queue": prints how many entries the crawler controller holds in its
     * URL list, then lists the queue rows.
     */
    private function outputModeQueue(OutputInterface $output, array $queueRows): void
    {
        $entryCount = count($this->crawlerController->urlList);

        $output->writeln('<info>Putting ' . $entryCount . ' entries in queue:</info>' . PHP_EOL);
        $this->outputUrls($queueRows, $output);
    }
209

210
    /**
     * Output for mode "exec": lists the queue rows, then reads every entry held by the crawler
     * controller right away and reports per entry whether the response could be decoded.
     *
     * The progress bar is cleared before and re-displayed after each message, so the order of
     * the clear()/display() calls around the writeln() calls matters.
     */
    private function outputModeExec(OutputInterface $output, array $queueRows): void
    {
        $progressBar = new ProgressBar($output);
        $requestCount = count($this->crawlerController->urlList);
        $output->writeln('<info>Executing ' . $requestCount . ' requests right away:</info>');
        $this->outputUrls($queueRows, $output);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($this->crawlerController->queueEntries) as $queueEntry) {
            $parameters = $this->jsonCompatibilityConverter->convert($queueEntry['parameters']);
            // A boolean result means the parameters could not be turned into an array; skip the entry.
            if (is_bool($parameters)) {
                continue;
            }

            $progressBar->clear();
            $instructionSuffix = empty($parameters['procInstructions'][0])
                ? ''
                : ' (' . implode(',', $parameters['procInstructions']) . ')';
            $output->writeln('<info>' . $parameters['url'] . $instructionSuffix . ' => ' . '</info>');
            $progressBar->display();

            $response = $this->crawlerController->readUrlFromArray($queueEntry);

            $responseBody = $response['content'] ?? '';
            $decodedBody = $this->jsonCompatibilityConverter->convert($responseBody);

            $progressBar->clear();
            if (!is_array($decodedBody)) {
                // Collapse the raw body to tag-free, single-spaced text and cap it before printing.
                $plainBody = (string) preg_replace('/\s+/', ' ', strip_tags((string) $responseBody));
                $output->writeln(
                    '<error>Error checking Crawler Result:  ' . substr(
                        $plainBody,
                        0,
                        30000
                    ) . '...' . PHP_EOL . '</error>' . PHP_EOL
                );
            } else {
                $logText = '';
                if (array_key_exists('log', $decodedBody) && is_array($decodedBody['log'])) {
                    $logText = chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $decodedBody['log']);
                }
                $output->writeln('<info>OK: ' . $logText . '</info>' . PHP_EOL);
            }
            $progressBar->display();
        }
        $output->writeln('');
    }
258

259
    /**
     * Asks the crawler controller for the page tree and URLs below the given page.
     *
     * "depth" is forced into 0..99. "number" falls back to 30 when it casts to 0 and is then
     * forced into 1..1000. The two boolean arguments are derived from the mode: the first is
     * true for "queue" and "exec", the second is true for "url".
     */
    private function getQueueRows(int $pageId, InputInterface $input, mixed $mode, array $configurationKeys): array
    {
        $depth = MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99);
        $currentTime = $this->crawlerController->getCurrentTime();
        $itemsPerMinute = MathUtility::forceIntegerInRange((int) $input->getOption('number') ?: 30, 1, 1000);
        $isQueueOrExecMode = in_array($mode, ['queue', 'exec'], true);
        $isUrlMode = $mode === 'url';

        return $this->crawlerController->getPageTreeAndUrls(
            $pageId,
            $depth,
            $currentTime,
            $itemsPerMinute,
            $isQueueOrExecMode,
            $isUrlMode,
            [],
            $configurationKeys
        );
    }
272
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc