• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 11237471329

08 Oct 2024 02:20PM UTC coverage: 68.586% (-1.3%) from 69.862%
11237471329

push

github

web-flow
ci: Update coveralls workflow (#1109)

1834 of 2674 relevant lines covered (68.59%)

3.37 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.08
/Classes/Command/BuildQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Event\InvokeQueueChangeEvent;
28
use AOE\Crawler\Utility\MessageUtility;
29
use AOE\Crawler\Value\QueueRow;
30
use Symfony\Component\Console\Command\Command;
31
use Symfony\Component\Console\Helper\ProgressBar;
32
use Symfony\Component\Console\Input\InputArgument;
33
use Symfony\Component\Console\Input\InputInterface;
34
use Symfony\Component\Console\Input\InputOption;
35
use Symfony\Component\Console\Output\OutputInterface;
36
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
37
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
38
use TYPO3\CMS\Core\Utility\GeneralUtility;
39
use TYPO3\CMS\Core\Utility\MathUtility;
40

41
/**
42
 * @internal since v12.0.0
43
 */
44
class BuildQueueCommand extends Command
{
    /**
     * Output mode: only list the collected URLs.
     */
    private const MODE_URL = 'url';

    /**
     * Output mode: put the collected entries into the queue (used when no mode is given).
     */
    private const MODE_QUEUE = 'queue';

    /**
     * Output mode: process all collected entries right away.
     */
    private const MODE_EXEC = 'exec';

    /**
     * Registers description, help text, arguments and options of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument('page', InputArgument::REQUIRED, 'The page from where the queue building should start');

        $this->addArgument('conf', InputArgument::REQUIRED, 'A comma separated list of crawler configurations');

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the given page to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @return int Command::FAILURE when the given page is 0 or cannot be loaded, Command::SUCCESS otherwise
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        // The option has no default value, so a missing/empty "--mode" falls back to queue mode.
        $mode = $input->getOption('mode') ?? self::MODE_QUEUE;

        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();
        $eventDispatcher = GeneralUtility::makeInstance(EventDispatcher::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = GeneralUtility::makeInstance(QueueRepository::class);
        $pageRepository = GeneralUtility::makeInstance(PageRepository::class);

        if ($mode === self::MODE_EXEC) {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0 || empty($pageRepository->getPage($pageId))) {
            $message = "Page {$pageId} is not a valid page, please check your root page id and try again.";
            MessageUtility::addErrorMessage($message);
            // The command fails here, so the message is styled as an error rather than as info.
            $output->writeln("<error>{$message}</error>");
            return Command::FAILURE;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        // Both "queue" and "exec" create queue entries; "url" and unknown modes do not.
        $createsQueueEntries = $mode === self::MODE_QUEUE || $mode === self::MODE_EXEC;

        if ($createsQueueEntries) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            $eventDispatcher->dispatch(new InvokeQueueChangeEvent($reason));
        }

        // !empty() instead of a plain index access: a missing setting must not raise a warning.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        // "--number" defaults to '0'; (int) '0' is falsy, so 30 is used before the value is
        // passed through forceIntegerInRange() with the bounds 1 and 1000.
        $itemsPerMinute = ((int) $input->getOption('number')) ?: 30;

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $queueRows = $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99),
            $crawlerController->getCurrentTime(),
            MathUtility::forceIntegerInRange($itemsPerMinute, 1, 1000),
            $createsQueueEntries,
            $mode === self::MODE_URL,
            [],
            $configurationKeys
        );

        if ($mode === self::MODE_URL) {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === self::MODE_EXEC) {
            $this->executeQueueEntries($crawlerController, $jsonCompatibilityConverter, $queueRows, $output);
        } elseif ($mode === self::MODE_QUEUE) {
            $output->writeln(
                '<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL
            );
            $this->outputUrls($queueRows, $output);
        } else {
            // Any other mode value: nothing was queued or executed, only report what was found.
            $output->writeln(
                '<info>' . count(
                    $crawlerController->urlList
                ) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL
            );
            $this->outputUrls($queueRows, $output);
        }

        return Command::SUCCESS;
    }

    /**
     * Handles the "exec" mode: lists the collected URLs and then requests every entry in
     * $crawlerController->queueEntries right away, reporting each result below a progress bar.
     *
     * @param array $queueRows Rows returned by CrawlerController::getPageTreeAndUrls()
     */
    private function executeQueueEntries(
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter,
        array $queueRows,
        OutputInterface $output
    ): void {
        $progressBar = new ProgressBar($output);
        $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
        $this->outputUrls($queueRows, $output);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            if (is_bool($parameters)) {
                // The converter returned a boolean instead of the parameter array: skip this entry.
                continue;
            }

            // The bar is cleared before every line written and redrawn afterwards,
            // so regular output and the progress bar do not get mixed up.
            $progressBar->clear();
            $output->writeln(
                '<info>' . $parameters['url'] . $this->formatProcInstructions($parameters) . ' => ' . '</info>'
            );
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);
            $resultContent = $result['content'] ?? '';
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                $resLog = '';
                if (isset($requestResult['log']) && is_array($requestResult['log'])) {
                    $resLog = chr(9) . chr(9) . implode(PHP_EOL . chr(9) . chr(9), $requestResult['log']);
                }
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // Not decodable into an array: print the raw content with tags stripped and
                // whitespace collapsed, limited to 30000 bytes.
                $output->writeln(
                    '<error>Error checking Crawler Result:  ' . substr(
                        (string) preg_replace('/\s+/', ' ', strip_tags((string) $resultContent)),
                        0,
                        30000
                    ) . '...' . PHP_EOL . '</error>' . PHP_EOL
                );
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Builds the " (instruction1,instruction2)" suffix printed after a URL.
     *
     * @param array $parameters Decoded parameters of one queue entry
     * @return string Empty string when the first processing instruction is missing or empty
     */
    private function formatProcInstructions(array $parameters): string
    {
        if (empty($parameters['procInstructions'][0])) {
            return '';
        }

        return ' (' . implode(',', $parameters['procInstructions']) . ')';
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Comma separated list as given in the "conf" argument
     * @return array Trimmed configuration keys; an empty array when $conf is blank
     */
    private function getConfigurationKeys(string $conf): array
    {
        $parameter = trim($conf);
        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }

    /**
     * Writes one line per queue row: the row's URLs, or - when the row carries a message -
     * a comment stating that the page was not added to the queue.
     *
     * @param array $queueRows Rows returned by CrawlerController::getPageTreeAndUrls()
     */
    private function outputUrls(array $queueRows, OutputInterface $output): void
    {
        /** @var QueueRow $row */
        foreach ($queueRows as $row) {
            if (empty($row->message)) {
                $output->writeln('<info>' . $row->urls . '</info>');
            } else {
                $output->writeln(
                    '<comment>Page "' . $row->pageTitle . '" is not added to queue' . $row->message . '</comment>'
                );
            }
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc