• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 3696698329

pending completion
3696698329

Pull #988

github

GitHub
Merge d1d32bd2d into 3f94d6a4e
Pull Request #988: [WIP][FEATURE] Setup new Backend Module

417 of 417 new or added lines in 9 files covered. (100.0%)

1601 of 2523 relevant lines covered (63.46%)

3.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

90.6
/Classes/Command/BuildQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Converter\JsonCompatibilityConverter;
25
use AOE\Crawler\Domain\Model\Reason;
26
use AOE\Crawler\Domain\Repository\QueueRepository;
27
use AOE\Crawler\Event\InvokeQueueChangeEvent;
28
use AOE\Crawler\Utility\MessageUtility;
29
use AOE\Crawler\Value\QueueRow;
30
use Symfony\Component\Console\Command\Command;
31
use Symfony\Component\Console\Helper\ProgressBar;
32
use Symfony\Component\Console\Input\InputArgument;
33
use Symfony\Component\Console\Input\InputInterface;
34
use Symfony\Component\Console\Input\InputOption;
35
use Symfony\Component\Console\Output\OutputInterface;
36
use TYPO3\CMS\Core\EventDispatcher\EventDispatcher;
37
use TYPO3\CMS\Core\Utility\GeneralUtility;
38
use TYPO3\CMS\Core\Utility\MathUtility;
39

40
/**
 * Symfony console command that builds crawler queue entries from the command line.
 *
 * Depending on the "mode" option the command lists the URLs ("url"), stores the
 * entries in the queue ("queue", used when no mode is given) or processes them
 * right away ("exec").
 */
class BuildQueueCommand extends Command
{
    /**
     * Registers description, help text, arguments and options of the command.
     *
     * Arguments: "page" and "conf" (both required).
     * Options: "depth" (shortcut "d", default "0"), "mode" (shortcut "m", no default)
     * and "number" (no shortcut, default "0").
     */
    protected function configure(): void
    {
        $this->setDescription('Create entries in the queue that can be processed at once');

        $this->setHelp(
            'Try "typo3 help crawler:buildQueue" to see your options' . chr(10) . chr(10) .
            'Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
It can put entries in the queue from command line options, return the list of URLs and even execute
all entries right away without having to queue them up - this can be useful for immediate re-cache,
re-indexing or static publishing from command line.' . chr(10) . chr(10) .
            '
            Examples:
              --- Re-cache pages from page 7 and two levels down, executed immediately
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec

              --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
              $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
            '
        );

        $this->addArgument('page', InputArgument::REQUIRED, 'The page from where the queue building should start');

        $this->addArgument('conf', InputArgument::REQUIRED, 'A comma separated list of crawler configurations');

        $this->addOption(
            'depth',
            'd',
            InputOption::VALUE_OPTIONAL,
            'Tree depth, 0-99. How many levels under the \'page_id\' to include.',
            '0'
        );

        $this->addOption(
            'mode',
            'm',
            InputOption::VALUE_OPTIONAL,
            'Specifies output modes url : Will list URLs which wget could use as input. queue: Will put entries in queue table. exec: Will execute all entries right away!'
        );

        $this->addOption(
            'number',
            '',
            InputOption::VALUE_OPTIONAL,
            'Specifies how many items are put in the queue per minute. Only valid for output mode "queue"',
            '0'
        );
    }

    /**
     * Crawler Command - Submitting URLs to be crawled.
     *
     * Works as a CLI interface to some functionality from the Web > Info > Site Crawler module;
     * It can put entries in the queue from command line options, return the list of URLs and even execute
     * all entries right away without having to queue them up - this can be useful for immediate re-cache,
     * re-indexing or static publishing from command line.
     *
     * Examples:
     *
     * --- Re-cache pages from page 7 and two levels down, executed immediately
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 2 --mode exec
     *
     *
     * --- Put entries for re-caching pages from page 7 into queue, 4 every minute.
     * $ typo3 crawler:buildQueue 7 defaultConfiguration --depth 0 --mode queue --number 4
     *
     * @param InputInterface $input Provides the "page" and "conf" arguments and the "depth", "mode" and "number" options
     * @param OutputInterface $output Receives all messages written by the command
     *
     * @return int Command::FAILURE when the page argument resolves to 0, Command::SUCCESS otherwise
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
        $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
        // No "mode" given (or given without a value) means the entries are queued.
        $mode = $input->getOption('mode') ?? 'queue';

        $extensionSettings = GeneralUtility::makeInstance(
            ExtensionConfigurationProvider::class
        )->getExtensionConfiguration();
        $eventDispatcher = GeneralUtility::makeInstance(EventDispatcher::class);

        /** @var CrawlerController $crawlerController */
        $crawlerController = GeneralUtility::makeInstance(CrawlerController::class);
        /** @var QueueRepository $queueRepository */
        $queueRepository = GeneralUtility::makeInstance(QueueRepository::class);

        if ($mode === 'exec') {
            $crawlerController->registerQueueEntriesInternallyOnly = true;
        }

        $pageId = MathUtility::forceIntegerInRange((int) $input->getArgument('page'), 0);
        if ($pageId === 0) {
            // "{$var}" instead of "${var}": the latter is deprecated as of PHP 8.2.
            $message = "Page {$pageId} is not a valid page, please check you root page id and try again.";
            MessageUtility::addErrorMessage($message);
            // This is a failure, so it uses the same <error> style as the other error output of this command.
            $output->writeln("<error>{$message}</error>");
            return Command::FAILURE;
        }

        $configurationKeys = $this->getConfigurationKeys((string) $input->getArgument('conf'));

        $submitsToQueue = $mode === 'queue' || $mode === 'exec';
        if ($submitsToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_CLI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            $eventDispatcher->dispatch(new InvokeQueueChangeEvent($reason));
        }

        // !empty() keeps the truthiness check but does not warn when the setting is missing.
        if (!empty($extensionSettings['cleanUpOldQueueEntries'])) {
            $queueRepository->cleanUpOldQueueEntries();
        }

        $crawlerController->setID = GeneralUtility::md5int(microtime());
        $queueRows = $crawlerController->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange((int) $input->getOption('depth'), 0, 99),
            $crawlerController->getCurrentTime(),
            // A "number" of 0 (the option default) falls back to 30 before the range is applied.
            MathUtility::forceIntegerInRange((int) $input->getOption('number') ?: 30, 1, 1000),
            $submitsToQueue,
            $mode === 'url',
            [],
            $configurationKeys
        );

        if ($mode === 'url') {
            $output->writeln('<info>' . implode(PHP_EOL, $crawlerController->downloadUrls) . PHP_EOL . '</info>');
        } elseif ($mode === 'exec') {
            $this->executeQueueEntries($crawlerController, $jsonCompatibilityConverter, $queueRows, $output);
        } elseif ($mode === 'queue') {
            $output->writeln(
                '<info>Putting ' . count($crawlerController->urlList) . ' entries in queue:</info>' . PHP_EOL
            );
            $this->outputUrls($queueRows, $output);
        } else {
            // Unknown mode: only report what was found.
            $output->writeln(
                '<info>' . count(
                    $crawlerController->urlList
                ) . ' entries found for processing. (Use "mode" to decide action):</info>' . PHP_EOL
            );
            $this->outputUrls($queueRows, $output);
        }

        return Command::SUCCESS;
    }

    /**
     * Processes all entries held in $crawlerController->queueEntries right away ("exec" mode)
     * and reports the outcome of each request while advancing a progress bar.
     *
     * @param CrawlerController $crawlerController Controller holding urlList and queueEntries and performing the requests
     * @param JsonCompatibilityConverter $jsonCompatibilityConverter Converts the stored parameters and the result content
     * @param array $queueRows Rows returned by getPageTreeAndUrls(), listed before processing starts
     * @param OutputInterface $output Receives the progress bar and all messages
     */
    private function executeQueueEntries(
        CrawlerController $crawlerController,
        JsonCompatibilityConverter $jsonCompatibilityConverter,
        array $queueRows,
        OutputInterface $output
    ): void {
        $progressBar = new ProgressBar($output);
        $output->writeln('<info>Executing ' . count($crawlerController->urlList) . ' requests right away:</info>');
        $this->outputUrls($queueRows, $output);
        $output->writeln('<info>Processing</info>' . PHP_EOL);

        foreach ($progressBar->iterate($crawlerController->queueEntries) as $queueRec) {
            $parameters = $jsonCompatibilityConverter->convert($queueRec['parameters']);
            // Skip entries whose parameters could not be converted into an array.
            if (!is_array($parameters)) {
                continue;
            }

            // Guard against incomplete parameter sets: implode() would throw on a missing list.
            $url = $parameters['url'] ?? '';
            $procInstructions = (array) ($parameters['procInstructions'] ?? []);

            // The bar is cleared before and redrawn after each message so the two do not mix.
            $progressBar->clear();
            $output->writeln(
                '<info>' . $url . ' (' . implode(',', $procInstructions) . ') => ' . '</info>' . PHP_EOL
            );
            $progressBar->display();

            $result = $crawlerController->readUrlFromArray($queueRec);

            $resultContent = $result['content'] ?? '';
            $requestResult = $jsonCompatibilityConverter->convert($resultContent);

            $progressBar->clear();
            if (is_array($requestResult)) {
                $resLog = '';
                if (array_key_exists('log', $requestResult) && is_array($requestResult['log'])) {
                    $separator = PHP_EOL . chr(9) . chr(9);
                    $resLog = $separator . implode($separator, $requestResult['log']);
                }
                $output->writeln('<info>OK: ' . $resLog . '</info>' . PHP_EOL);
            } else {
                // preg_replace() returns null on failure; substr() would reject null under strict_types.
                $flattenedContent = preg_replace('/\s+/', ' ', strip_tags($resultContent)) ?? '';
                $output->writeln(
                    '<error>Error checking Crawler Result:  ' . substr($flattenedContent, 0, 30000)
                    . '...' . PHP_EOL . '</error>' . PHP_EOL
                );
            }
            $progressBar->display();
        }
        $output->writeln('');
    }

    /**
     * Obtains configuration keys from the CLI arguments
     *
     * @param string $conf Raw value of the "conf" argument, a comma separated list
     *
     * @return array Result of GeneralUtility::trimExplode() on the list, or an empty array for blank input
     */
    private function getConfigurationKeys(string $conf): array
    {
        $trimmedConf = trim($conf);
        if ($trimmedConf === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $trimmedConf);
    }

    /**
     * Writes one line per queue row: the row's URLs, or page title and message when the row has a message.
     *
     * @param array $queueRows Rows to list; each is expected to be a QueueRow
     * @param OutputInterface $output Receives the lines
     */
    private function outputUrls(array $queueRows, OutputInterface $output): void
    {
        /** @var QueueRow $row */
        foreach ($queueRows as $row) {
            if (empty($row->message)) {
                $output->writeln('<info>' . $row->urls . '</info>');
                continue;
            }

            // "warning" is not one of Symfony's predefined styles (error, info, comment, question),
            // so the tag would be printed literally; "comment" is the predefined yellow style.
            $output->writeln('<comment>' . $row->pageTitle . ': ' . $row->message . '</comment>');
        }
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc