• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

MenSeb / python-playground / 6685844579

29 Oct 2023 09:17PM UTC coverage: 1.484%. First build
6685844579

Pull #4

github

web-flow
Merge 39b7e6666 into 398800e7c
Pull Request #4: build(html-tracing): implement methods to clone a website

0 of 48 branches covered (0.0%)

Branch coverage included in aggregate %.

137 of 137 new or added lines in 5 files covered. (100.0%)

5 of 289 relevant lines covered (1.73%)

0.02 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0
/src/html_tracing/utilities/clone.py
1
"""Module Clone."""
2

3
from __future__ import annotations
×
4

5
import re
×
6
from pathlib import Path
×
7
from typing import TYPE_CHECKING
×
8

9
from bs4 import BeautifulSoup, ResultSet, Tag
×
10
from utilities.logger import logger
×
11

12
if TYPE_CHECKING:
×
13
    from utilities.session import Session
×
14

15

16
class Clone:
    """Interface representing clone utilities.

    The markup is parsed once on construction and the folder
    ``<directory>/<folder>/<site>/assets`` is created on disk. Each ``sync_*``
    method downloads the same-site assets of one kind through the given
    session, stores them in the assets folder and rewrites the markup (or
    stylesheet) to reference the local copy.
    """

    # Sources with one of these prefixes do not live under the cloned domain
    # (absolute or protocol-relative URLs) or carry their payload inline
    # (data URIs); they are never downloaded.
    _EXTERNAL_PREFIXES = ("http://", "https://", "//", "data:")

    def __init__(
        self: Clone,
        domain: str,
        markup: str | bytes,
        folder: str = "temp",
        directory: Path = Path(__file__).parent,
    ) -> None:
        """Interface representing clone utilities.

        Args:
        ----
            domain (str):
                The website domain to clone.
            markup (str | bytes):
                The website markup to clone.
            folder (str, optional):
                The clone folder. Defaults to "temp".
            directory (Path, optional):
                The clone directory. Defaults to Path(__file__).parent.
        """
        self.assets: list[str] = []
        self.domain = domain
        self.soup = BeautifulSoup(markup=markup, features="html5lib")

        # Drop the scheme (if any) and surrounding slashes, so the folder name
        # is the same with or without a trailing slash on the domain.
        _, separator, remainder = domain.partition("//")
        site = (remainder if separator else domain).strip("/")

        self.path = directory / folder / site
        self.path_assets = Path("assets")
        self.setup()

    @staticmethod
    def _is_external(source: str) -> bool:
        """Tell whether a source must be left alone instead of downloaded."""
        return source.startswith(Clone._EXTERNAL_PREFIXES)

    @staticmethod
    def _asset_filename(source: str) -> str:
        """Return the file name of a source, without its query or fragment."""
        from urllib.parse import urlsplit

        # Splitting the URL first keeps dots and slashes inside the query
        # string (e.g. "?v=1.2") from being mistaken for part of the name.
        return Path(urlsplit(source).path).name

    def _fetch_asset(
        self: Clone,
        session: Session,
        source: str,
        filename: str,
    ) -> bool:
        """Download a source relative to the domain and save it as an asset.

        Returns True when the session returned a response and it was saved.
        """
        from urllib.parse import urljoin

        # urljoin copes with a missing or doubled slash between both parts.
        response = session.request(url=urljoin(self.domain, source))

        if response is None:
            return False

        self.save_asset(data=response.content, filename=filename)
        return True

    def setup(
        self: Clone,
    ) -> None:
        """Create the directory and folders for the cloned website."""
        logger.trace_()

        self.path.mkdir(exist_ok=True, parents=True)
        (self.path / self.path_assets).mkdir(exist_ok=True)

    def save_html(
        self: Clone,
    ) -> int:
        """Save the HTML clone.

        Returns
        -------
            int:
                The number of characters written to ``index.html``.
        """
        logger.trace_()

        return (self.path / "index.html").write_text(
            data=self.soup.prettify(),
            encoding="utf-8",
        )

    def save_asset(
        self: Clone,
        data: bytes,
        filename: str,
    ) -> None:
        """Save an asset file.

        Args:
        ----
            data (bytes):
                The asset data.
            filename (str):
                The asset filename.
        """
        logger.trace_(msg=filename)

        (self.path / self.path_assets / filename).write_bytes(data=data)

    def sync_images(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the images from the cloned website.

        Images without a source, or with an external one, are left untouched.

        Args:
        ----
            session (Session):
                The requests session.
        """
        for image in self.soup.find_all(name="img"):
            # "src" wins; "data-cfsrc" is the fallback (presumably set by a
            # lazy-loading script that moved the real source there).
            source = image.attrs.get("src") or image.attrs.get("data-cfsrc")

            if not source or self._is_external(source):
                continue

            filename = self._asset_filename(source)

            if not filename:
                continue

            # Stored as a POSIX string so the markup is identical on any OS.
            image["src"] = (self.path_assets / filename).as_posix()
            self.assets.append(filename)
            self._fetch_asset(session=session, source=source, filename=filename)

    def sync_links(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the links from the cloned website.

        Links without an href, or with an external one, are left untouched.

        Args:
        ----
            session (Session):
                The requests session.
        """
        for stylesheet in self.soup.find_all(name="link"):
            source = stylesheet.attrs.get("href")

            if not source or self._is_external(source):
                continue

            filename = self._asset_filename(source)

            if not filename:
                continue

            self.assets.append(filename)
            stylesheet["href"] = (self.path_assets / filename).as_posix()
            self._fetch_asset(session=session, source=source, filename=filename)

    def sync_scripts(
        self: Clone,
        session: Session,
        *,
        nosync: bool = True,
    ) -> None:
        """Sync the scripts from the cloned website.

        All ``<noscript>`` tags are removed. Scripts that are inline, external
        or cannot be given a local filename are removed as well.

        Args:
        ----
            session (Session):
                The requests session.
            nosync (bool, optional):
                If true, remove the scripts. Defaults to True.
        """
        for noscript in self.soup.find_all(name="noscript"):
            noscript.extract()

        for script in self.soup.find_all(name="script"):
            source = script.attrs.get("src")

            if nosync or not source or self._is_external(source):
                script.extract()
                continue

            filename = self._asset_filename(source)

            if not filename:
                script.extract()
                continue

            self.assets.append(filename)
            script["src"] = (self.path_assets / filename).as_posix()
            self._fetch_asset(session=session, source=source, filename=filename)

    def sync_fonts(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the fonts from the cloned website.

        Every saved stylesheet is scanned for ``src: url(...)`` declarations;
        each font that downloads successfully is stored next to the stylesheet
        and its URL in the stylesheet is replaced by the bare filename.

        Args:
        ----
            session (Session):
                The requests session.
        """
        # Copy first: self.assets grows while fonts are being downloaded.
        stylesheets = [name for name in self.assets if name.endswith(".css")]

        for stylesheet in stylesheets:
            path_stylesheet = self.path / self.path_assets / stylesheet

            # A stylesheet is listed even when its download failed.
            if not path_stylesheet.is_file():
                continue

            content = path_stylesheet.read_text(encoding="utf-8")
            updated = content
            sources: list[str] = re.findall(r"src: ?url\(([^)]+)\)", string=content)

            # dict.fromkeys drops duplicates while keeping the original order.
            for source in dict.fromkeys(sources):
                url = source.strip().strip("'\"")

                if not url or self._is_external(url):
                    continue

                filename = self._asset_filename(url)

                if not filename:
                    continue

                # NOTE(review): the URL is resolved against the domain, not the
                # stylesheet's own URL, so "../"-style paths may miss.
                if self._fetch_asset(session=session, source=url, filename=filename):
                    self.assets.append(filename)
                    # Accumulate on the running text so every font rewrite
                    # survives, not only the last one.
                    updated = updated.replace(url, filename)

            if updated != content:
                path_stylesheet.write_text(data=updated, encoding="utf-8")
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc