• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 17269487828

27 Aug 2025 02:19PM UTC coverage: 68.647% (-0.4%) from 69.077%
17269487828

push

github

web-flow
[FEATURE] Extract error information in SubProcess crawl strategy (#1161)

Up to now, the sub process execution strategy did not detect errors.
The HTML of error pages was provided as content, and the backend showed
an "OK" status.

Since cli/bootstrap.php does not set a non-zero exit code on errors,
we have to parse the HTML ourselves to detect if an error occurred.

This patch supports TYPO3 error pages and extracts HTTP status code,
status message and error message from it.

Error information is logged to the debug log as well as
made visible in the backend site crawler log view.

0 of 17 new or added lines in 1 file covered. (0.0%)

1872 of 2727 relevant lines covered (68.65%)

3.27 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

14.71
/Classes/CrawlStrategy/SubProcessExecutionStrategy.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\CrawlStrategy;
6

7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\UriInterface;
25
use Psr\Log\LoggerAwareInterface;
26
use Psr\Log\LoggerAwareTrait;
27
use TYPO3\CMS\Core\Core\Environment;
28
use TYPO3\CMS\Core\Utility\CommandUtility;
29
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
30
use TYPO3\CMS\Core\Utility\GeneralUtility;
31

32
/**
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
 * includes the index.php for frontend.
 *
 * The sub process output is returned as page content. If the output contains
 * the error page marker, status code, title and message are extracted from the
 * HTML and returned as error log entry in addition to the content.
 *
 * @internal since v12.0.0
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface
{
    use LoggerAwareTrait;

    /**
     * Substring whose presence in the sub process output marks it as an error page.
     */
    private const ERROR_PAGE_MARKER = 'typo3-error-page';

    /**
     * Extension configuration as returned by the configuration provider
     * (an empty array if the provider did not return an array).
     */
    protected array $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider of the extension
     *        configuration; when omitted, an instance is created via GeneralUtility::makeInstance().
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider ??= GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * Returns false when the URL cannot be parsed, has no scheme, has a scheme
     * other than "", "http" or "https", or when the sub process produced no output.
     * Otherwise the output is returned under the key "content"; if it is
     * recognised as an error page, the key "errorlog" additionally holds a
     * one-element list with the extracted error description.
     *
     * @return array{content: string, errorlog?: list<string>}|false
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        // parse_url() yields false for seriously malformed URLs; those are rejected
        // together with URLs lacking a supported scheme.
        if (
            !is_array($parsedUrl)
            || !isset($parsedUrl['scheme'])
            || !in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)
        ) {
            $this->logger?->debug(sprintf('Scheme does not match for url "%s"', $url), [
                'crawlerId' => $crawlerId,
            ]);
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // Arguments handed to cli/bootstrap.php: frontend base path, URL and serialized request headers.
        $commandParts = CommandUtility::escapeShellArguments([
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ]);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary()) . ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->logger?->info($url . ' ' . (microtime(true) - $startTime));

        if ($content === null) {
            return false;
        }

        if (!str_contains($content, self::ERROR_PAGE_MARKER)) {
            return [
                'content' => $content,
            ];
        }

        // The output has to be inspected because the HTML of error pages is
        // otherwise indistinguishable from regular page content for the caller.
        $message = $this->extractErrorMessage($content);
        $this->logger?->debug(
            sprintf('Error while opening "%s" - %s', $url, $message),
            [
                'crawlerId' => $crawlerId,
            ]
        );

        return [
            'errorlog' => [$message],
            'content' => $content,
        ];
    }

    /**
     * Builds the error description "<status code> <title> - <message>" from error page HTML.
     *
     * Each part is the trimmed text following the element's class attribute up
     * to the next closing tag; a part that cannot be found is left empty.
     *
     * @param string $content HTML output of the sub process
     */
    private function extractErrorMessage(string $content): string
    {
        $parts = [];
        foreach (['statuscode', 'title', 'message'] as $name) {
            $pattern = '#class="typo3-error-page-' . $name . '">(.+?)</#s';
            $parts[$name] = preg_match($pattern, $content, $match) === 1 ? trim($match[1]) : '';
        }

        return $parts['statuscode'] . ' ' . $parts['title'] . ' - ' . $parts['message'];
    }

    /**
     * Builds the request header lines passed to the sub process.
     *
     * @param array $url Result of parse_url() for the URL to fetch
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     * @return list<string> Request line followed by the header lines
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() omits "path" for URLs such as "http://example.com"; request the root in that case.
        $requestTarget = $url['path'] ?? '/';
        if (isset($url['query'])) {
            $requestTarget .= '?' . $url['query'];
        }

        $reqHeaders = [
            'GET ' . $requestTarget . ' HTTP/1.0',
            'Host: ' . ($url['host'] ?? ''),
            'Connection: close',
        ];

        // Credentials are only forwarded when both user and password are non-empty.
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }

        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|null Outputted result of the command execution, or null
     *         when shell_exec() did not return a string (null or false)
     */
    private function executeShellCommand(string $command): ?string
    {
        $output = shell_exec($command);

        // shell_exec() may return false as well as null; both are reported as
        // "no output" so callers never pass a non-string on to string functions.
        return is_string($output) ? $output : null;
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources are tried in this order: the extension setting "frontendBasePath",
     * $GLOBALS['TSFE']->absRefPrefix, and - outside of CLI mode - the
     * TYPO3_SITE_PATH environment value. If none applies, "/" is used.
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $frontendBasePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim((string) $frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc