• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 13166490870

05 Feb 2025 09:01PM UTC coverage: 67.753% (-1.4%) from 69.194%
13166490870

push

github

web-flow
[TASK] Refactor commands (#1127)

42 of 75 new or added lines in 2 files covered. (56.0%)

19 existing lines in 2 files now uncovered.

1830 of 2701 relevant lines covered (67.75%)

3.27 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

73.6
/Classes/Command/BuildQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Event\InvokeQueueChangeEvent;
28
use AOE\Crawler\Utility\MessageUtility;
29
use AOE\Crawler\Value\QueueRow;
30
use Symfony\Component\Console\Command\Command;
31
use Symfony\Component\Console\Helper\ProgressBar;
32
use Symfony\Component\Console\Input\InputArgument;
33
use Symfony\Component\Console\Input\InputInterface;
34
use Symfony\Component\Console\Input\InputOption;
35
use Symfony\Component\Console\Output\OutputInterface;
36
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
37
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Core\Utility\MathUtility;
40

41
/**
 * Console command that builds crawler queue entries for a page tree.
 *
 * Depending on the "mode" option the command lists the URLs, puts them into
 * the queue, or executes them right away (see the help text in configure()).
 *
 * @internal since v12.0.0
 */
class BuildQueueCommand extends Command
{
    public function __construct(
        private readonly JsonCompatibilityConverter $jsonCompatibilityConverter,
        private readonly EventDispatcher $eventDispatcher,
        private readonly QueueRepository $queueRepository,
        private readonly PageRepository $pageRepository,
        private readonly CrawlerController $crawlerController,
    ) {
        parent::__construct();
    }

    /**
     * Declares description, help text, the two required arguments ("page", "conf")
     * and the optional "depth", "mode" and "number" options.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument('page', InputArgument::REQUIRED, 'The page from where the queue building should start');

        $this->addArgument('conf', InputArgument::REQUIRED, 'A comma separated list of crawler configurations');

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the given page to include.',
            '0'
        );

        // No default here on purpose: execute() falls back to "queue" when the option is absent.
        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int Command::FAILURE when the given page id is 0 or the page cannot be loaded,
     *             Command::SUCCESS otherwise.
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $mode = $input->getOption('mode') ?? 'queue';

        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();

        if ($mode === 'exec') {
            // The flag must be set on the injected controller: it is the instance that
            // builds the rows in getQueueRows() and whose queueEntries are processed in
            // outputModeExec(). Setting it on a separately created instance has no effect
            // on those calls.
            $this->crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0 || empty($this->pageRepository->getPage($pageId))) {
            $message = "Page {$pageId} is not a valid page, please check you root page id and try again.";
            MessageUtility::addErrorMessage($message);
            $output->writeln("<info>{$message}</info>");
            return Command::FAILURE;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        if ($mode === 'queue' || $mode === 'exec') {
            // Announce the queue change before any entries are created.
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            $this->eventDispatcher->dispatch(new InvokeQueueChangeEvent($reason));
        }

        // Guard against a missing key so an incomplete extension configuration
        // does not raise an "undefined array key" warning.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $this->queueRepository->cleanUpOldQueueEntries();
        }

        $this->crawlerController->setID = GeneralUtility::md5int(microtime());
        $queueRows = $this->getQueueRows($pageId, $input, $mode, $configurationKeys);

        // Any unknown mode falls through to the default branch, which only reports what was found.
        match ($mode) {
            'url' => $output->writeln(
                '<info>' . implode(PHP_EOL, $this->crawlerController->downloadUrls) . PHP_EOL . '</info>'
            ),
            'exec' => $this->outputModeExec($output, $queueRows),
            'queue' => $this->outputModeQueue($output, $queueRows),
            default => $this->outputModeDefault($output, $queueRows),
        };

        return Command::SUCCESS;
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list as passed in the "conf" argument
     * @return array The trimmed keys, or an empty array when $conf is blank
     */
    private function getConfigurationKeys(string $conf): array
    {
        $parameter = trim($conf);
        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }

    /**
     * Writes one line per queue row: the row's URLs when it carries no message,
     * otherwise a comment stating that the page was not added together with the message.
     *
     * @param array $queueRows Rows as returned by getQueueRows(); each is treated as a QueueRow
     */
    private function outputUrls(array $queueRows, OutputInterface $output): void
    {
        /** @var QueueRow $row */
        foreach ($queueRows as $row) {
            if (empty($row->message)) {
                $output->writeln('<info>' . $row->urls . '</info>');
            } else {
                $output->writeln(
                    '<comment>Page "' . $row->pageTitle . '" is not added to queue' . $row->message . '</comment>'
                );
            }
        }
    }

    /**
     * Output for an unrecognised mode: reports the number of URLs found and lists them.
     */
    private function outputModeDefault(OutputInterface $output, array $queueRows): void
    {
        $output->writeln(
            '<info>' . count(
                $this->crawlerController->urlList
            ) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL
        );
        $this->outputUrls($queueRows, $output);
    }

    /**
     * Output for mode "queue": reports the number of URLs put in the queue and lists them.
     */
    private function outputModeQueue(OutputInterface $output, array $queueRows): void
    {
        $output->writeln(
            '<info>Putting ' . count($this->crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL
        );
        $this->outputUrls($queueRows, $output);
    }

    /**
     * Output for mode "exec": lists the URLs, then requests every entry in the
     * controller's queueEntries and prints the result of each request while
     * keeping a progress bar at the bottom of the output.
     */
    private function outputModeExec(OutputInterface $output, array $queueRows): void
    {
        $progressBar = new ProgressBar($output);
        $output->writeln(
            '<info>Executing ' . count($this->crawlerController->urlList) . ' requests right away:</info>'
        );
        $this->outputUrls($queueRows, $output);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($this->crawlerController->queueEntries) as $queueRec) {
            $p = $this->jsonCompatibilityConverter->convert($queueRec['parameters'] ?? '');
            // The converter yields a boolean when the parameters cannot be decoded; skip such entries.
            if (is_bool($p)) {
                continue;
            }

            // Clear the bar before writing a line and redraw it afterwards,
            // so regular output and the bar do not overwrite each other.
            $progressBar->clear();
            if (empty($p['procInstructions'][0])) {
                $procInstructionsString = '';
            } else {
                $procInstructionsString = ' (' . implode(',', $p['procInstructions']) . ')';
            }
            $output->writeln('<info>' . ($p['url'] ?? '') . $procInstructionsString . ' => ' . '</info>');
            $progressBar->display();

            $result = $this->crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            $requestResult = $this->jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                $hasLog = array_key_exists('log', $requestResult) && is_array($requestResult['log']);
                // Log lines are indented by two tabs each.
                $resLog = $hasLog
                    ? chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $requestResult['log'])
                    : '';
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // Not decodable: show the tag-stripped, whitespace-collapsed response, capped at 30000 bytes.
                $output->writeln(
                    '<error>Error checking Crawler Result:  ' . substr(
                        (string) preg_replace('/\s+/', ' ', strip_tags((string) $resultContent)),
                        0,
                        30000
                    ) . '...' . PHP_EOL . '</error>' . PHP_EOL
                );
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Asks the crawler controller for the page tree rows and URLs below $pageId.
     *
     * "depth" is clamped to 0-99. "number" falls back to 30 when it is empty or 0
     * and is then clamped to 1-1000. The two boolean arguments tell the controller
     * whether the mode is queue/exec and whether it is url, respectively.
     *
     * @param mixed $mode Value of the "mode" option (or the "queue" fallback)
     * @param array $configurationKeys Crawler configuration keys to restrict the run to
     * @return array Rows as returned by CrawlerController::getPageTreeAndUrls()
     */
    private function getQueueRows(int $pageId, InputInterface $input, mixed $mode, array $configurationKeys): array
    {
        return $this->crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99),
            $this->crawlerController->getCurrentTime(),
            MathUtility::forceIntegerInRange((int) $input->getOption('number') ?: 30, 1, 1000),
            $mode === 'queue' || $mode === 'exec',
            $mode === 'url',
            [],
            $configurationKeys
        );
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc