• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 11237471329

08 Oct 2024 02:20PM UTC coverage: 68.586% (-1.3%) from 69.862%
11237471329

push

github

web-flow
ci: Update coveralls workflow (#1109)

1834 of 2674 relevant lines covered (68.59%)

3.37 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

88.07
/Classes/Command/ProcessQueueCommand.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\Command;
6

7
/*
8
 * (c) 2021 Tomas Norre Mikkelsen <tomasnorre@gmail.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Controller\CrawlerController;
24
use AOE\Crawler\Crawler;
25
use AOE\Crawler\Domain\Model\Process;
26
use AOE\Crawler\Domain\Repository\ProcessRepository;
27
use AOE\Crawler\Domain\Repository\QueueRepository;
28
use Symfony\Component\Console\Command\Command;
29
use Symfony\Component\Console\Input\InputInterface;
30
use Symfony\Component\Console\Input\InputOption;
31
use Symfony\Component\Console\Output\OutputInterface;
32
use TYPO3\CMS\Core\Utility\GeneralUtility;
33

34
/**
 * Console command that claims a batch of queue entries for one crawler
 * process run and crawls them.
 *
 * @internal since v12.0.0
 */
class ProcessQueueCommand extends Command
{
    // Bit flags that are OR-ed together into the status value printed by execute().
    // No flag set: nothing happened during this run.
    private const CLI_STATUS_NOTHING_PROCCESSED = 0;
    // Unprocessed queue items are still left after the run.
    private const CLI_STATUS_REMAIN = 1;
    // At least one queue entry was read during the run.
    private const CLI_STATUS_PROCESSED = 2;
    // The run could not start or was stopped early.
    private const CLI_STATUS_ABORTED = 4;
    // Not set anywhere in this class - NOTE(review): presumably reported by
    // CrawlerController::readUrl(); confirm before relying on it.
    private const CLI_STATUS_POLLABLE_PROCESSED = 8;

    private Crawler $crawler;
    private CrawlerController $crawlerController;
    private ProcessRepository $processRepository;
    private QueueRepository $queueRepository;

    /**
     * Identifier of this run; handed to the repositories and to readUrl().
     */
    private string $processId;

    /**
     * Extension configuration, loaded at the start of execute().
     */
    private array $extensionSettings;

    /**
     * @param string|null $name Command name forwarded to the Symfony base class
     */
    public function __construct(
        Crawler $crawler,
        CrawlerController $crawlerController,
        ProcessRepository $processRepository,
        QueueRepository $queueRepository,
        ?string $name = null
    ) {
        parent::__construct($name);
        $this->crawler = $crawler;
        $this->crawlerController = $crawlerController;
        $this->processRepository = $processRepository;
        $this->queueRepository = $queueRepository;
        $this->processId = md5(microtime() . random_bytes(12));
    }

    /**
     * Crawler Command - Crawling the URLs from the queue
     *
     * Examples:
     *
     * --- Will trigger the crawler which starts to process the queue entries
     * $ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
     *
     * The combined CLI_STATUS_* bitmask is written to the output as its last line.
     *
     * @return int CLI_STATUS_ABORTED (4) when the run was aborted, 0 otherwise
     */
    public function execute(InputInterface $input, OutputInterface $output): int
    {
        $amount = $input->getOption('amount');
        $sleeptime = $input->getOption('sleeptime');
        $sleepafter = $input->getOption('sleepafter');

        $this->extensionSettings = $this->getExtensionSettings();

        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        if (!$this->crawler->isDisabled() && $this->checkAndAcquireNewProcess($this->processId)) {
            // An option that is missing or falsy (e.g. "0") falls back to the extension setting.
            $countInARun = $amount ? (int) $amount : (int) $this->extensionSettings['countInARun'];
            $sleepAfterFinish = $sleepafter ? (int) $sleepafter : (int) $this->extensionSettings['sleepAfterFinish'];
            $sleepTime = $sleeptime ? (int) $sleeptime : (int) $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->runProcess($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Throwable $e) {
                // "error" is one of the styles Symfony's formatter predefines; an
                // unregistered tag would be printed literally instead of styled.
                $output->writeln('<error>' . get_class($e) . ': ' . $e->getMessage() . '</error>');
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup: release everything that is still bound to this run.
            $this->processRepository->deleteProcessesWithoutItemsAssigned();
            $this->processRepository->markRequestedProcessesAsNotActive([$this->processId]);
            $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries([$this->processId]);

            // Count once and reuse the number for both the message and the status flag.
            $remainingItems = count($this->queueRepository->getUnprocessedItems());

            $output->writeln(
                '<info>Unprocessed Items remaining:' . $remainingItems . ' (' . $this->processId . ')</info>'
            );

            if ($remainingItems > 0) {
                $result |= self::CLI_STATUS_REMAIN;
            }
        } else {
            // Crawler is disabled or no process slot could be acquired.
            $result |= self::CLI_STATUS_ABORTED;
        }

        $output->writeln((string) $result);

        // Only the "aborted" bit is turned into a non-zero exit code.
        return $result & self::CLI_STATUS_ABORTED;
    }

    /**
     * Registers the description, the help text and the three options
     * (amount, sleepafter, sleeptime) of the command.
     */
    protected function configure(): void
    {
        $this->setDescription('Trigger the crawler to process the queue entries');

        $this->setHelp(
            'Crawler Command - Crawling the URLs from the queue' . chr(10) . chr(10) .
            '
            Examples:
              --- Will trigger the crawler which starts to process the queue entries
              $ typo3 crawler:processqueue --amount 15 --sleepafter 5 --sleeptime 2
            '
        );
        $this->addOption(
            'amount',
            '',
            InputOption::VALUE_OPTIONAL,
            'How many pages should be crawled during that run',
            '0'
        );

        $this->addOption(
            'sleepafter',
            '',
            InputOption::VALUE_OPTIONAL,
            'Amount of seconds which the system should use to relax after all crawls are done',
            '0'
        );

        $this->addOption(
            'sleeptime',
            '',
            InputOption::VALUE_OPTIONAL,
            'Amount of microseconds which the system should use to relax between crawls'
        );
    }

    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Number passed to fetchRecordsToBeCrawled() as the batch size
     * @param int $sleepTime Microseconds to pause after each queue entry (usleep)
     * @param int $sleepAfterFinish Seconds to pause once the batch is done (sleep)
     *
     * @return int CLI_STATUS_* flags OR-ed with the values returned by readUrl()
     */
    private function runProcess(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $records = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (!empty($records)) {
            $quidList = [];

            foreach ($records as $record) {
                $quidList[] = $record['qid'];
            }

            // Save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds(
                $quidList,
                $this->processId
            );
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $this->processId);

            // Abort when not every selected entry could be assigned to this run.
            if ($numberOfAffectedRows !== count($quidList)) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            foreach ($records as $record) {
                $result |= $this->crawlerController->readUrl($record['qid'], false, $this->processId);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // If the crawler has been disabled between the start and the current
                // read, return right away; the process is NOT marked as ended.
                if ($this->crawler->isDisabled()) {
                    return $result | self::CLI_STATUS_ABORTED;
                }

                if (!$this->processRepository->isProcessActive($this->processId)) {
                    $result |= self::CLI_STATUS_ABORTED;
                    // Possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }

    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose TTL lies before the current time are treated as
     * orphans; the others count against the "processLimit" extension setting.
     *
     * @return bool false when no system process id is available or the limit is reached
     */
    private function checkAndAcquireNewProcess(string $id): bool
    {
        $returnValue = true;

        $systemProcessId = getmypid();
        if (!$systemProcessId) {
            return false;
        }

        $processCount = 0;
        $orphanProcesses = [];

        $activeProcesses = $this->processRepository->findAllActive();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < time()) {
                $orphanProcesses[] = $process->getProcessId();
            } else {
                $processCount++;
            }
        }

        // If there are less than allowed active processes then add a new one
        if ($processCount < (int) $this->extensionSettings['processLimit']) {
            $this->processRepository->addProcess($id, $systemProcessId);
        } else {
            $returnValue = false;
        }

        // The cleanup runs whether or not a new process was added.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcesses);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcesses);

        return $returnValue;
    }

    /**
     * Returns the extension configuration from the ExtensionConfigurationProvider.
     */
    private function getExtensionSettings(): array
    {
        return GeneralUtility::makeInstance(ExtensionConfigurationProvider::class)->getExtensionConfiguration();
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc