• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

tomasnorre / crawler / 3696698329

pending completion
3696698329

Pull #988

github

GitHub
Merge d1d32bd2d into 3f94d6a4e
Pull Request #988: [WIP][FEATURE] Setup new Backend Module

417 of 417 new or added lines in 9 files covered. (100.0%)

1601 of 2523 relevant lines covered (63.46%)

3.23 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

16.98
/Classes/CrawlStrategy/SubProcessExecutionStrategy.php
1
<?php
2

3
declare(strict_types=1);
4

5
namespace AOE\Crawler\CrawlStrategy;
6

7
/*
8
 * (c) 2020 AOE GmbH <dev@aoe.com>
9
 *
10
 * This file is part of the TYPO3 Crawler Extension.
11
 *
12
 * It is free software; you can redistribute it and/or modify it under
13
 * the terms of the GNU General Public License, either version 2
14
 * of the License, or any later version.
15
 *
16
 * For the full copyright and license information, please read the
17
 * LICENSE.txt file that was distributed with this source code.
18
 *
19
 * The TYPO3 project - inspiring people to share!
20
 */
21

22
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
23
use AOE\Crawler\Utility\PhpBinaryUtility;
24
use Psr\Http\Message\UriInterface;
25
use Psr\Log\LoggerAwareInterface;
26
use Psr\Log\LoggerAwareTrait;
27
use TYPO3\CMS\Core\Core\Environment;
28
use TYPO3\CMS\Core\Utility\CommandUtility;
29
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
30
use TYPO3\CMS\Core\Utility\GeneralUtility;
31

32
/**
 * Executes another process via shell_exec() to include cli/bootstrap.php which in turn
 * includes the index.php for frontend.
 */
class SubProcessExecutionStrategy implements LoggerAwareInterface, CrawlStrategyInterface
{
    use LoggerAwareTrait;

    /**
     * Extension configuration as returned by the configuration provider
     * (an empty array when the provider did not return an array).
     */
    protected array $extensionSettings;

    /**
     * @param ExtensionConfigurationProvider|null $configurationProvider Optional provider; when omitted,
     *                                                                   one is created via GeneralUtility::makeInstance()
     */
    public function __construct(?ExtensionConfigurationProvider $configurationProvider = null)
    {
        $configurationProvider ??= GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];
    }

    /**
     * Fetches a URL by running cli/bootstrap.php in a separate PHP process.
     *
     * @param UriInterface $url URL to fetch; only "http" and "https" schemes are accepted
     * @param string $crawlerId Identifier sent along in the "X-T3crawler" request header
     *
     * @return array{content: string}|false The sub process output wrapped in an array, or false when the URL
     *                                      cannot be parsed, its scheme is not accepted, or the sub process
     *                                      produced no output / could not be executed
     */
    public function fetchUrlContents(UriInterface $url, string $crawlerId)
    {
        $url = (string) $url;
        $parsedUrl = parse_url($url);

        if ($parsedUrl === false) {
            $this->logger->debug(
                sprintf('Could not parse_url() for string "%s"', $url),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        if (! isset($parsedUrl['scheme']) || ! in_array($parsedUrl['scheme'], ['', 'http', 'https'], true)) {
            $this->logger->debug(sprintf('Scheme does not match for url "%s"', $url), ['crawlerId' => $crawlerId]);
            return false;
        }

        // parse_url() without a component argument returns either false (handled above)
        // or an array, so $parsedUrl is guaranteed to be an array from here on.
        $requestHeaders = $this->buildRequestHeaders($parsedUrl, $crawlerId);

        // The headers are serialized and base64-encoded so they travel as one shell argument.
        $commandParts = [
            ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php',
            $this->getFrontendBasePath(),
            $url,
            base64_encode(serialize($requestHeaders)),
        ];
        $commandParts = CommandUtility::escapeShellArguments($commandParts);
        $cmd = escapeshellcmd(PhpBinaryUtility::getPhpBinary());
        $cmd .= ' ' . implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->logger->info($url . ' ' . (microtime(true) - $startTime));

        if ($content === null) {
            return false;
        }

        return ['content' => $content];
    }

    /**
     * Builds the raw HTTP request lines handed over to the sub process.
     *
     * @param array $url Components as returned by parse_url()
     * @param string $crawlerId Value of the "X-T3crawler" header
     *
     * @return string[] Request line followed by the header lines
     */
    private function buildRequestHeaders(array $url, string $crawlerId): array
    {
        // parse_url() omits components that are absent from the URL (e.g. "http://example.com"
        // has no "path"), so fall back to defaults instead of reading undefined array keys.
        $path = $url['path'] ?? '/';
        $host = $url['host'] ?? '';
        $query = isset($url['query']) ? '?' . $url['query'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . $query . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        $reqHeaders[] = 'Connection: close';
        if (isset($url['user'], $url['pass']) && $url['user'] !== '' && $url['pass'] !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }

    /**
     * Executes a shell command and returns the outputted result.
     *
     * @param string $command Shell command to be executed
     *
     * @return string|null Output of the command, or null when there is none
     */
    private function executeShellCommand(string $command): ?string
    {
        $output = shell_exec($command);

        // shell_exec() returns null when the command produced no output or failed, and false
        // when the pipe could not be established; report both as "no output" so callers
        // never receive false as content.
        return is_string($output) ? $output : null;
    }

    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * @return string Base path of the website frontend
     */
    private function getFrontendBasePath(): string
    {
        $frontendBasePath = '/';

        // Get the path from the extension settings:
        if (! empty($this->extensionSettings['frontendBasePath'])) {
            $frontendBasePath = (string) $this->extensionSettings['frontendBasePath'];
        // If empty, try to use config.absRefPrefix:
        } elseif (! empty($GLOBALS['TSFE']->absRefPrefix)) {
            $frontendBasePath = (string) $GLOBALS['TSFE']->absRefPrefix;
        // If not in CLI mode the base path can be determined from $_SERVER environment:
        } elseif (! Environment::isCli()) {
            $frontendBasePath = (string) GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegments>/':
        if ($frontendBasePath !== '/') {
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
        }

        return $frontendBasePath;
    }
}
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc