• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 18322339174

07 Oct 2025 06:24PM UTC coverage: 69.284% (-0.01%) from 69.295%
18322339174

push

github

web-flow
[TASK] Cleanup SubProcessExecutionStrategy (#1195)

* [TASK] Cleanup SubProcessExecutionStrategy

* fix phpstan baseline

1 of 2 new or added lines in 1 file covered. (50.0%)

1897 of 2738 relevant lines covered (69.28%)

3.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

13.43
/Classes/CrawlStrategy/SubProcessExecutionStrategy.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\CrawlStrategy;
6

7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\UriInterface;
25
use Psr\Log\LoggerAwareInterface;
26
use Psr\Log\LoggerAwareTrait;
27
use TYPO3\CMS\Core\Core\Environment;
28
use TYPO3\CMS\Core\Utility\CommandUtility;
29
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
30
use TYPO3\CMS\Core\Utility\GeneralUtility;
31

32
/**
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
 * includes the index.php for frontend.
 * @internal since v12.0.0
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by ExtensionConfigurationProvider::getExtensionConfiguration().
     */
    protected array $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Optional provider; when omitted an
     *        instance is created through GeneralUtility::makeInstance().
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider ??= GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $this->extensionSettings = $configurationProvider->getExtensionConfiguration();
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * Returns false when the URL cannot be parsed, when its scheme is not '', 'http' or 'https',
     * or when the sub process yields no string output. Otherwise an array with the key 'content'
     * is returned; if the output contains a TYPO3 error page, the key 'errorlog' is added.
     *
     * @return array|false See CrawlStrategyInterface::fetchUrlContents()
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        // parse_url() returns false for seriously malformed URLs; such URLs have no usable scheme
        // either, so they are rejected the same way as URLs with an unsupported scheme.
        if (
            !is_array($parsedUrl)
            || !isset($parsedUrl['scheme'])
            || !in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)
        ) {
            $this->logger?->debug(sprintf('Scheme does not match for url "%s"', $url), [
                'crawlerId' => $crawlerId,
            ]);
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // NOTE(review): the headers are handed over as base64-encoded serialize() output; presumably
        // cli/bootstrap.php decodes them again - verify that side before changing the format.
        $commandParts = CommandUtility::escapeShellArguments([
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ]);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary()) . ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->logger?->info($url . ' ' . (microtime(true) - $startTime));

        // shell_exec() yields null or false instead of a string when there is nothing to return.
        if (!is_string($content)) {
            return false;
        }

        if (str_contains($content, 'typo3-error-page')) {
            $message = $this->extractErrorPageMessage($content);
            $this->logger?->debug(
                sprintf('Error while opening "%s" - %s', $url, $message),
                [
                    'crawlerId' => $crawlerId,
                ]
            );
            return [
                'errorlog' => [$message],
                'content' => $content,
            ];
        }

        return [
            'content' => $content,
        ];
    }

    /**
     * Builds the "<status> <title> - <message>" summary from the markup of a TYPO3 error page.
     * Parts that cannot be found in the markup are left empty.
     */
    private function extractErrorPageMessage(string $content): string
    {
        preg_match('#class="typo3-error-page-statuscode">(.+?)</#s', $content, $matchStatus);
        preg_match('#class="typo3-error-page-title">(.+?)</#s', $content, $matchTitle);
        preg_match('#class="typo3-error-page-message">(.+?)</#s', $content, $matchMessage);

        return trim($matchStatus[1] ?? '')
            . ' ' . trim($matchTitle[1] ?? '')
            . ' - ' . trim($matchMessage[1] ?? '');
    }

    /**
     * Builds the raw request header lines passed on to the sub process.
     *
     * @param array $url Result of parse_url() for the URL to fetch
     * @return string[] Header lines, starting with the request line
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() omits 'path' for URLs like "http://example.com"; the request target
        // must then be "/" instead of an empty string.
        $requestTarget = ($url['path'] ?? '') !== '' ? $url['path'] : '/';
        if (isset($url['query'])) {
            $requestTarget .= '?' . $url['query'];
        }

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $requestTarget . ' HTTP/1.0';
        // 'host' can be missing as well (e.g. "http:/path"); avoid an undefined-key warning.
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        $reqHeaders[] = 'Connection: close';
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|false|null Outputted result of the command execution
     */
    private function executeShellCommand(string $command): string|false|null
    {
        return shell_exec($command);
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $frontendBasePath = (string) $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $frontendBasePath = (string) $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $frontendBasePath = (string) GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc