• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 24792029807

22 Apr 2026 05:11PM UTC coverage: 68.791% (-0.05%) from 68.841%
24792029807

push

github

web-flow
[TASK] Switch to NormalizedParams in SubProcessExecutionStrategy (#1275)

Resolves #1274

0 of 3 new or added lines in 1 file covered. (0.0%)

1900 of 2762 relevant lines covered (68.79%)

3.16 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

13.04
/Classes/CrawlStrategy/SubProcessExecutionStrategy.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\CrawlStrategy;
6

7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\ServerRequestInterface;
25
use Psr\Http\Message\UriInterface;
26
use Psr\Log\LoggerAwareInterface;
27
use Psr\Log\LoggerAwareTrait;
28
use TYPO3\CMS\Core\Core\Environment;
29
use TYPO3\CMS\Core\Http\NormalizedParams;
30
use TYPO3\CMS\Core\Utility\CommandUtility;
31
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
32
use TYPO3\CMS\Core\Utility\GeneralUtility;
33

34
/**
35
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
36
 * includes the index.php for frontend.
37
 * @internal since v12.0.0
38
 */
39
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface
40
{
41
    use LoggerAwareTrait;
42

43
    protected array $extensionSettings;
44

45
    /**
     * Stores the crawler extension configuration for later lookups.
     *
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the settings from;
     *        when omitted, an instance is obtained via GeneralUtility::makeInstance()
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        if ($configurationProvider === null) {
            $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        }

        $this->extensionSettings = $configurationProvider->getExtensionConfiguration();
    }
50

51
    /**
     * Fetches a URL by calling a shell script.
     *
     * The URL is handed to cli/bootstrap.php in a separate PHP process, together with the
     * frontend base path and the serialized, base64-encoded request headers.
     *
     * @param UriInterface $url URL to fetch; its scheme must be one of '', 'http' or 'https'
     * @param string $crawlerId Crawler identifier, added to the request headers and log context
     * @return array|false See CrawlStrategyInterface::fetchUrlContents(). False is returned when the URL
     *                     cannot be parsed, its scheme is not accepted, or shell_exec() yields null/false.
     */
    #[\Override]
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        // parse_url() returns false for seriously malformed URLs; those are rejected by the same
        // guard as URLs with a missing or unsupported scheme.
        if (
            !is_array($parsedUrl)
            || !isset($parsedUrl['scheme'])
            || !in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)
        ) {
            $this->logger?->debug(sprintf('Scheme does not match for url "%s"', $url), [
                'crawlerId' => $crawlerId,
            ]);
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // Arguments for the bootstrap script: frontend base path, URL, encoded request headers.
        $commandParts = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ];
        $commandParts = CommandUtility::escapeShellArguments($commandParts);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
        $cmd .= ' ' . implode(' ', $commandParts);

        // Log the URL together with the time the sub process took.
        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->logger?->info($url . ' ' . (microtime(true) - $startTime));

        if ($content === null || $content === false) {
            return false;
        }

        // Output containing the "typo3-error-page" marker is reported as an error, with status code,
        // title and message extracted from the markup.
        if (str_contains($content, 'typo3-error-page')) {
            preg_match('#class="typo3-error-page-statuscode">(.+?)</#s', $content, $matchStatus);
            preg_match('#class="typo3-error-page-title">(.+?)</#s', $content, $matchTitle);
            preg_match('#class="typo3-error-page-message">(.+?)</#s', $content, $matchMessage);
            $message = trim($matchStatus[1] ?? '')
                . ' ' . trim($matchTitle[1] ?? '')
                . ' - ' . trim($matchMessage[1] ?? '');
            $this->logger?->debug(
                sprintf('Error while opening "%s" - %s', $url, $message),
                [
                    'crawlerId' => $crawlerId,
                ]
            );
            return [
                'errorlog' => [$message],
                'content' => $content,
            ];
        }

        return [
            'content' => $content,
        ];
    }
115

116
    /**
     * Builds the request line and header lines that are passed on to the sub process.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value of the "X-T3crawler" header
     * @return list<string> Request line followed by the header lines
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() omits "path" for URLs such as "https://example.com"; request the root then,
        // instead of emitting an invalid request line with an empty target.
        $path = $url['path'] ?? '';
        if ($path === '') {
            $path = '/';
        }

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . (isset($url['query']) ? '?' . $url['query'] : '') . ' HTTP/1.0';
        // "host" can be absent as well (e.g. "http:/path"); avoid an undefined array key access.
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        $reqHeaders[] = 'Connection: close';
        // Basic auth is only sent when both user and password are present and non-empty.
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }
129

130
    /**
     * Executes a shell command and returns the outputted result.
     *
     * Thin wrapper around shell_exec(); its return value is passed through unchanged.
     *
     * @param string $command Shell command to be executed
     * @return string|false|null Outputted result of the command execution
     */
    private function executeShellCommand(string $command): string|false|null
    {
        return shell_exec($command);
    }
140

141
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources are tried in this order: the "frontendBasePath" extension setting, the
     * absRefPrefix of $GLOBALS['TSFE'], and (outside of CLI) the site path of the current
     * request's normalized params. If none of them yields a value, "/" is used.
     *
     * @return string Base path of the website frontend, always starting and ending with "/"
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        // Get the path from the extension settings:
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
            // If empty, try to use config.absRefPrefix:
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
            // If not in CLI mode, the base path can be determined from the request
        } elseif (!Environment::isCli()) {
            $normalizedParams = $this->getRequest()->getAttribute('normalizedParams');
            // The attribute may be missing on the request; keep the "/" default in that case
            // instead of calling a method on null.
            if ($normalizedParams instanceof NormalizedParams) {
                $frontendBasePath = $normalizedParams->getSitePath();
            }
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim((string) $frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
173

174
    /**
     * Returns the request stored in $GLOBALS['TYPO3_REQUEST'].
     */
    private function getRequest(): ServerRequestInterface
    {
        $currentRequest = $GLOBALS['TYPO3_REQUEST'];

        return $currentRequest;
    }
178
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc