• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 3696698329

pending completion
3696698329

Pull #988

github

GitHub
Merge d1d32bd2d into 3f94d6a4e
Pull Request #988: [WIP][FEATURE] Setup new Backend Module

417 of 417 new or added lines in 9 files covered. (100.0%)

1601 of 2523 relevant lines covered (63.46%)

3.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

87.27
/Classes/Command/ProcessQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Crawler;
25
use AOE\Crawler\Domain\Model\Process;
26
use AOE\Crawler\Domain\Repository\ProcessRepository;
27
use AOE\Crawler\Domain\Repository\QueueRepository;
28
use Symfony\Component\Console\Command\Command;
29
use Symfony\Component\Console\Input\InputInterface;
30
use Symfony\Component\Console\Input\InputOption;
31
use Symfony\Component\Console\Output\OutputInterface;
32
use TYPO3\CMS\Core\Utility\GeneralUtility;
33

34
class ProcessQueueCommand extends Command
35
{
36
    private const CLI_STATUS_NOTHING_PROCCESSED = 0;
37
    private const CLI_STATUS_REMAIN = 1;
38
    private const CLI_STATUS_PROCESSED = 2;
39
    private const CLI_STATUS_ABORTED = 4;
40
    private const CLI_STATUS_POLLABLE_PROCESSED = 8;
41

42
    private Crawler $crawler;
43
    private CrawlerController $crawlerController;
44
    private ProcessRepository $processRepository;
45
    private QueueRepository $queueRepository;
46
    private string $processId;
47
    private array $extensionSettings;
48

49
    /**
     * Stores the injected collaborators and generates the id this command run
     * is registered under.
     *
     * @param string|null $name Command name handed to the Symfony base class
     */
    public function __construct(
        Crawler $crawler,
        CrawlerController $crawlerController,
        ProcessRepository $processRepository,
        QueueRepository $queueRepository,
        ?string $name = null
    ) {
        parent::__construct($name);
        $this->crawler = $crawler;
        $this->crawlerController = $crawlerController;
        $this->processRepository = $processRepository;
        $this->queueRepository = $queueRepository;
        // Identifier of this run, built from the current time plus random bytes
        $this->processId = md5(microtime() . random_bytes(12));
    }
63

64
    /**
     * Crawler Command - Crawling the URLs from the queue
     *
     * Examples:
     *
     * --- Will trigger the crawler which starts to process the queue entries
     * $ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
     *
     * The accumulated CLI_STATUS_* bitmask is written to the output; only its
     * CLI_STATUS_ABORTED bit is used as return value.
     *
     * @return int 0, or self::CLI_STATUS_ABORTED when the run was aborted or could not be started
     */
    public function execute(InputInterface $input, OutputInterface $output)
    {
        $amount = $input->getOption('amount');
        $sleeptime = $input->getOption('sleeptime');
        $sleepafter = $input->getOption('sleepafter');

        $this->extensionSettings = $this->getExtensionSettings();

        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        // Use the constructor-injected collaborators, the same instances
        // runProcess() and checkAndAcquireNewProcess() work with.
        if (!$this->crawler->isDisabled() && $this->checkAndAcquireNewProcess($this->processId)) {
            // A non-empty, non-zero command line option takes precedence over the extension setting
            $countInARun = $amount ? (int) $amount : (int) $this->extensionSettings['countInARun'];
            $sleepAfterFinish = $sleepafter ? (int) $sleepafter : (int) $this->extensionSettings['sleepAfterFinish'];
            $sleepTime = $sleeptime ? (int) $sleeptime : (int) $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->runProcess($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Throwable $e) {
                // "error" is a predefined console style; an undefined tag would be printed literally
                $output->writeln('<error>' . get_class($e) . ': ' . $e->getMessage() . '</error>');
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->processRepository->deleteProcessesWithoutItemsAssigned();
            $this->processRepository->markRequestedProcessesAsNotActive([$this->processId]);
            $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries([$this->processId]);

            // Query the remaining items once and reuse the count for message and status
            $remainingItems = count($this->queueRepository->getUnprocessedItems());
            $output->writeln(
                '<info>Unprocessed Items remaining:' . $remainingItems . ' (' . $this->processId . ')</info>'
            );
            if ($remainingItems > 0) {
                $result |= self::CLI_STATUS_REMAIN;
            }
        } else {
            // Crawler disabled or no process could be acquired
            $result |= self::CLI_STATUS_ABORTED;
        }

        $output->writeln((string) $result);
        return $result & self::CLI_STATUS_ABORTED;
    }
125

126
    /**
     * Declares description, help text and the three optional tuning options
     * (amount, sleepafter, sleeptime) of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Trigger the crawler to process the queue entries');

        $examples = '
            Examples:
              --- Will trigger the crawler which starts to process the queue entries
              $ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
            ';
        $this->setHelp("Crawler Command - Crawling the URLs from the queue\n\n" . $examples);

        // name, description, default (null = no default registered)
        $optionDefinitions = [
            ['amount', 'How many pages should be crawled during that run', '0'],
            [
                'sleepafter',
                'Amount of seconds which the system should use to relax after all crawls are done',
                '0',
            ],
            ['sleeptime', 'Amount of microseconds which the system should use to relax between crawls', null],
        ];

        foreach ($optionDefinitions as [$optionName, $optionDescription, $optionDefault]) {
            $this->addOption($optionName, '', InputOption::VALUE_OPTIONAL, $optionDescription, $optionDefault);
        }
    }
161

162
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Cleans up the queue, fetches the records to crawl, assigns them to this
     * process id and passes them one by one to CrawlerController::readUrl().
     *
     * @param int $countInARun      Passed to fetchRecordsToBeCrawled() when selecting the records
     * @param int $sleepTime        Microseconds to pause after each URL; negative values are treated as 0
     * @param int $sleepAfterFinish Seconds to pause after the loop; negative values are treated as 0
     *
     * @return int CLI_STATUS_* flags OR-ed together with the values returned by readUrl()
     */
    private function runProcess(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        // usleep() and sleep() throw a ValueError for negative arguments on PHP 8,
        // which would abort the run after the first URL; treat such values as "no pause".
        $sleepTime = max(0, $sleepTime);
        $sleepAfterFinish = max(0, $sleepAfterFinish);

        $result = 0;
        $counter = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $records = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (!empty($records)) {
            $quidList = [];

            foreach ($records as $record) {
                $quidList[] = $record['qid'];
            }

            //save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds(
                $quidList,
                $this->processId
            );
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $this->processId);

            // Not every selected entry could be assigned to this process: give up on the batch
            if ($numberOfAffectedRows !== count($quidList)) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($records as $record) {
                $result |= $this->crawlerController->readUrl($record['qid'], false, $this->processId);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->crawler->isDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->processRepository->isProcessActive($this->processId)) {
                    $result |= self::CLI_STATUS_ABORTED;
                    //possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        // At least one URL was handed to readUrl()
        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
223

224
    /**
     * Registers a process under the given id as long as the number of
     * non-expired active processes is below the configured "processLimit".
     *
     * Independently of the outcome, active processes whose TTL has passed
     * (orphans) are marked as not active and their queue entries are released.
     *
     * @return bool true when the process was added, false when the limit is
     *              reached or the system process id could not be determined
     */
    private function checkAndAcquireNewProcess(string $id): bool
    {
        $pid = getmypid();
        if (!$pid) {
            return false;
        }

        $expiredProcessIds = [];
        $runningProcesses = 0;

        /** @var Process $activeProcess */
        foreach ($this->processRepository->findAllActive() as $activeProcess) {
            if ($activeProcess->getTtl() >= time()) {
                $runningProcesses++;
                continue;
            }

            // TTL lies in the past: remember the process for the cleanup below
            $expiredProcessIds[] = $activeProcess->getProcessId();
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $pid);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($expiredProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($expiredProcessIds);

        return $acquired;
    }
264

265
    /**
     * Returns the extension configuration supplied by the
     * ExtensionConfigurationProvider obtained via GeneralUtility::makeInstance().
     */
    private function getExtensionSettings(): array
    {
        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);

        return $configurationProvider->getExtensionConfiguration();
    }
269
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc