• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 17290669502

28 Aug 2025 08:44AM UTC coverage: 74.685%. First build
17290669502

Pull #1162

github

web-flow
Merge b636485a6 into cdedf1aeb
Pull Request #1162: [FEATURE] Extract error information in SubProcess crawl strategy (TYPO3v11)

0 of 17 new or added lines in 1 file covered. (0.0%)

1953 of 2615 relevant lines covered (74.68%)

3.72 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

16.44
/Classes/CrawlStrategy/SubProcessExecutionStrategy.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\CrawlStrategy;
6

7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\UriInterface;
25
use Psr\Log\LoggerAwareInterface;
26
use Psr\Log\LoggerAwareTrait;
27
use TYPO3\CMS\Core\Core\Environment;
28
use TYPO3\CMS\Core\Utility\CommandUtility;
29
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
30
use TYPO3\CMS\Core\Utility\GeneralUtility;
31

32
/**
 * Crawl strategy that fetches a URL by running a separate PHP process.
 *
 * The sub process is started via shell_exec() and executes cli/bootstrap.php of the
 * crawler extension, which in turn includes the index.php for frontend.
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategy
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by the ExtensionConfigurationProvider
     * (an empty array if the provider did not return an array).
     *
     * @var array
     */
    protected $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Provider to read the settings from;
     *                                                                   created via makeInstance() when omitted
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider = $configurationProvider ?? GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by calling a shell script.
     *
     * Returns false when the URL cannot be parsed, when its scheme is neither
     * "http" nor "https" (or empty), or when the sub process yields no output.
     * Otherwise an array with the key "content" is returned; if the output looks
     * like a TYPO3 error page, the array additionally carries an "errorlog" entry.
     *
     * @return array|bool|mixed
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        if ($parsedUrl === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $url),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        if (! isset($parsedUrl['scheme']) || ! in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)) {
            $this->logger->debug(
                sprintf('Scheme does not match for url "%s"', $url),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // Every argument is escaped individually before the command line is assembled.
        $commandParts = CommandUtility::escapeShellArguments([
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ]);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary()) . ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->logger->info($url . ' ' . (microtime(true) - $startTime));

        // shell_exec() may yield null or false; neither may reach str_contains() under strict_types.
        if (! is_string($content)) {
            return false;
        }

        if (str_contains($content, 'typo3-error-page')) {
            $message = $this->extractErrorMessage($content);
            $this->logger->debug(
                sprintf('Error while opening "%s" - %s', $url, $message),
                [
                    'crawlerId' => $crawlerId,
                ]
            );
            return [
                'errorlog' => [$message],
                'content' => $content,
            ];
        }

        return ['content' => $content];
    }

    /**
     * Builds the "<status> <title> - <message>" summary from the markup of an error page.
     *
     * Parts that cannot be found in the content are left empty.
     */
    private function extractErrorMessage(string $content): string
    {
        preg_match('#class="typo3-error-page-statuscode">(.+?)</#s', $content, $matchStatus);
        preg_match('#class="typo3-error-page-title">(.+?)</#s', $content, $matchTitle);
        preg_match('#class="typo3-error-page-message">(.+?)</#s', $content, $matchMessage);

        return trim($matchStatus[1] ?? '')
            . ' ' . trim($matchTitle[1] ?? '')
            . ' - ' . trim($matchMessage[1] ?? '');
    }

    /**
     * Builds the list of raw request header lines handed over to the sub process.
     *
     * @param array $url Components of the URL as returned by parse_url()
     * @return string[]
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() omits "path" for URLs like "https://example.com"; request the root in that case.
        $requestTarget = ($url['path'] ?? '/') . (isset($url['query']) ? '?' . $url['query'] : '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $requestTarget . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        $reqHeaders[] = 'Connection: close';
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     * @return string|null Outputted result of the command execution, null if there is none
     */
    private function executeShellCommand(string $command): ?string
    {
        $output = shell_exec($command);

        // Normalise the "false" failure value of shell_exec() to the documented null.
        return is_string($output) ? $output : null;
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        if (! empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
        } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (! Environment::isCli()) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $frontendBasePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc