6685844579

Committed 29 Oct 2023 09:17PM UTC coverage: 1.484%. First build

Build # 6685844579

Build Type

Pull #4

github

Committed by

web-flow

Commit Message

Merge 39b7e6666 into 398800e7c

Pull Request #4: build(html-tracing): implement methods to clone a website

Run Details

0 of 48 branches covered (0.0%)

Branch coverage included in aggregate %.

137 of 137 new or added lines in 5 files covered. (100.0%)

5 of 289 relevant lines covered (1.73%)

0.02 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/src/html_tracing/utilities/clone.py

"""Module Clone."""

from __future__ import annotations

import re
from pathlib import Path
from typing import TYPE_CHECKING

from bs4 import BeautifulSoup, ResultSet, Tag
from utilities.logger import logger

if TYPE_CHECKING:
    from utilities.session import Session


class Clone:
    """Interface representing clone utilities.

    A clone is a local copy of one web page: the parsed markup is kept in
    ``self.soup`` and the files it references (images, linked resources,
    scripts, fonts) are downloaded next to it in an ``assets`` folder, with
    the markup rewritten to point at the local copies.

    Attributes
    ----------
        assets (list[str]):
            Filenames of every asset referenced so far, in sync order.
        domain (str):
            The website domain, used as prefix to build the asset URLs.
        soup (BeautifulSoup):
            The parsed markup that the ``sync_*`` methods rewrite in place.
        path (Path):
            The folder receiving ``index.html``.
        path_assets (Path):
            The assets folder, relative to ``path``.
    """

    # Sources with one of these prefixes are not located under ``domain``
    # (absolute, protocol-relative or inline data), so they are never
    # concatenated to it nor downloaded.
    _EXTERNAL_PREFIXES = ("http://", "https://", "//", "data:")

    # Captures the content of the first ``url(...)`` of a CSS ``src:`` rule.
    _FONT_SOURCE = re.compile(r"src: ?url\(([^)]+)\)")

    def __init__(
        self: Clone,
        domain: str,
        markup: str | bytes,
        folder: str = "temp",
        directory: Path = Path(__file__).parent,
    ) -> None:
        """Interface representing clone utilities.

        Args:
        ----
            domain (str):
                The website domain to clone.
            markup (str | bytes):
                The website markup to clone.
            folder (str, optional):
                The clone folder. Defaults to "temp".
            directory (Path, optional):
                The clone directory. Defaults to Path(__file__).parent.
        """
        self.assets: list[str] = []
        self.domain = domain
        self.soup = BeautifulSoup(markup=markup, features="html5lib")
        self.path = directory / folder / self._site_folder(domain)
        self.path_assets = Path("assets")
        self.setup()

    @staticmethod
    def _site_folder(domain: str) -> str:
        """Return the folder name derived from the domain.

        The scheme (everything up to and including ``//``) and any trailing
        slash are dropped. A domain without trailing slash keeps its last
        character, and a domain without scheme is used as is.

        Args:
        ----
            domain (str):
                The website domain, e.g. "https://example.com/".

        Returns:
        -------
            str: The domain without scheme and trailing slash.
        """
        _, separator, location = domain.partition("//")

        if not separator:
            location = domain

        return location.rstrip("/")

    @staticmethod
    def _asset_filename(source: str) -> str:
        """Return the local filename for an asset source.

        The query string and the fragment are removed from the whole source
        before taking the last path component, so a query containing a dot
        ("style.css?v=1.2") cannot leak into the filename.

        Args:
        ----
            source (str):
                The asset source as found in the markup or stylesheet.

        Returns:
        -------
            str: The filename, empty when the source has no path component.
        """
        location = source.split("?", 1)[0].split("#", 1)[0]

        return Path(location).name

    @classmethod
    def _is_external(cls: type[Clone], source: str) -> bool:
        """Tell whether a source points outside of the cloned domain."""
        return source.startswith(cls._EXTERNAL_PREFIXES)

    def _sync_asset(
        self: Clone,
        tag: Tag,
        attribute: str,
        source: str,
        session: Session,
    ) -> None:
        """Point a tag to its local asset and download that asset.

        Args:
        ----
            tag (Tag):
                The tag referencing the asset.
            attribute (str):
                The tag attribute receiving the local asset path.
            source (str):
                The original asset source, relative to the domain.
            session (Session):
                The requests session.
        """
        filename = self._asset_filename(source)

        if not filename:
            return

        # Stored as a POSIX string: the value ends up in the HTML, where the
        # separator must be "/" whatever the operating system.
        tag.attrs[attribute] = (self.path_assets / filename).as_posix()
        self.assets.append(filename)
        response = session.request(url=self.domain + source)

        if response is not None:
            self.save_asset(
                data=response.content,
                filename=filename,
            )

    def setup(
        self: Clone,
    ) -> None:
        """Create the directory and folders for the cloned website."""
        logger.trace_()

        self.path.mkdir(exist_ok=True, parents=True)
        (self.path / self.path_assets).mkdir(exist_ok=True)

    def save_html(
        self: Clone,
    ) -> int:
        """Save the HTML clone.

        Returns
        -------
            int: The value returned by ``Path.write_text`` for "index.html".
        """
        logger.trace_()

        return (self.path / "index.html").write_text(
            data=self.soup.prettify(),
            encoding="utf-8",
        )

    def save_asset(
        self: Clone,
        data: bytes,
        filename: str,
    ) -> None:
        """Save an asset file.

        Args:
        ----
            data (bytes):
                The asset data.
            filename (str):
                The asset filename.
        """
        logger.trace_(msg=filename)

        (self.path / self.path_assets / filename).write_bytes(data=data)

    def sync_images(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the images from the cloned website.

        The source is read from "src", falling back to "data-cfsrc" when
        "src" is missing or empty. Images without source or with an external
        source are left untouched.

        Args:
        ----
            session (Session):
                The requests session.
        """
        images: ResultSet[Tag] = self.soup.find_all(name="img")

        for image in images:
            source = image.attrs.get("src") or image.attrs.get("data-cfsrc")

            if not source or self._is_external(source):
                continue

            self._sync_asset(
                tag=image,
                attribute="src",
                source=source,
                session=session,
            )

    def sync_links(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the links from the cloned website.

        Every "link" tag with a non external "href" is processed, whatever
        its "rel" attribute.

        Args:
        ----
            session (Session):
                The requests session.
        """
        stylesheets: ResultSet[Tag] = self.soup.find_all(name="link")

        for stylesheet in stylesheets:
            source = stylesheet.attrs.get("href")

            if not source or self._is_external(source):
                continue

            self._sync_asset(
                tag=stylesheet,
                attribute="href",
                source=source,
                session=session,
            )

    def sync_scripts(
        self: Clone,
        session: Session,
        *,
        nosync: bool = True,
    ) -> None:
        """Sync the scripts from the cloned website.

        The "noscript" tags are always removed. Inline and external scripts
        are removed as well, even when ``nosync`` is false.

        Args:
        ----
            session (Session):
                The requests session.
            nosync (bool, optional):
                If true, remove the scripts. Defaults to True.
        """
        noscripts: ResultSet[Tag] = self.soup.find_all(name="noscript")
        for noscript in noscripts:
            noscript.extract()

        scripts: ResultSet[Tag] = self.soup.find_all(name="script")
        for script in scripts:
            source = script.attrs.get("src")

            if nosync or not source or self._is_external(source):
                script.extract()
                continue

            self._sync_asset(
                tag=script,
                attribute="src",
                source=source,
                session=session,
            )

    def sync_fonts(
        self: Clone,
        session: Session,
    ) -> None:
        """Sync the fonts from the cloned website.

        Scan the stylesheets already saved in the assets folder, download
        the fonts they reference and rewrite each stylesheet so that it
        points to the local font files.

        Args:
        ----
            session (Session):
                The requests session.
        """
        # Snapshot without duplicates: ``self.assets`` grows while fonts are
        # synced and a stylesheet must not be processed twice.
        stylesheets = list(
            dict.fromkeys(
                filename for filename in self.assets if filename.endswith("css")
            ),
        )

        for stylesheet in stylesheets:
            path_stylesheet = self.path / self.path_assets / stylesheet

            # The filename is registered even when its download failed, in
            # which case there is no file to read.
            if not path_stylesheet.is_file():
                continue

            content = path_stylesheet.read_text(encoding="utf-8")
            updated = content

            sources: list[str] = self._FONT_SOURCE.findall(content)

            for source in sources:
                # CSS accepts unquoted, single and double quoted urls.
                url = source.strip().strip("\"'")

                if not url or self._is_external(url):
                    continue

                filename = self._asset_filename(url)

                if not filename:
                    continue

                response = session.request(self.domain + url)

                if response is None:
                    continue

                self.assets.append(filename)
                # Accumulate the replacements: starting over from ``content``
                # for each font would keep only the last replacement.
                updated = updated.replace(url, filename)
                self.save_asset(
                    data=response.content,
                    filename=filename,
                )

            if updated != content:
                path_stylesheet.write_text(data=updated, encoding="utf-8")

1	"""Module Clone."""
2
3	from __future__ import annotations	×
4
5	import re	×
6	from pathlib import Path	×
7	from typing import TYPE_CHECKING	×
8
9	from bs4 import BeautifulSoup, ResultSet, Tag	×
10	from utilities.logger import logger	×
11
12	if TYPE_CHECKING:	×
13	from utilities.session import Session	×
14
15
16	class Clone:	×
17	"""Interface representing clone utilities."""
18
19	def __init__(	×
20	self: Clone,
21	domain: str,
22	markup: str | bytes,
23	folder: str = "temp",
24	directory: Path = Path(__file__).parent,
25	) -> None:
26	"""Interface representing clone utilities.
27
28	Args:
29	----
30	domain (str):
31	The website domain to clone.
32	markup (str | bytes):
33	The website markup to clone.
34	folder (str, optional):
35	The clone folder. Defaults to "temp".
36	directory (Path, optional):
37	The clone directory. Defaults to Path(__file__).parent.
38	"""
39	self.assets: list[str] = []	×
40	self.domain = domain	×
41	self.soup = BeautifulSoup(markup=markup, features="html5lib")	×
42	self.path = directory / folder / domain[domain.index("//") + 2 : -1]	×
43	self.path_assets = Path("assets")	×
44	self.setup()	×
45
46	def setup(	×
47	self: Clone,
48	) -> None:
49	"""Create the directory and folders for the cloned website."""
50	logger.trace_()	×
51
52	self.path.mkdir(exist_ok=True, parents=True)	×
53	(self.path / self.path_assets).mkdir(exist_ok=True)	×
54
55	def save_html(	×
56	self: Clone,
57	) -> int:
58	"""Save the HTML clone."""
59	logger.trace_()	×
60
61	return (self.path / "index.html").write_text(	×
62	data=self.soup.prettify(),
63	encoding="utf-8",
64	)
65
66	def save_asset(	×
67	self: Clone,
68	data: bytes,
69	filename: str,
70	) -> None:
71	"""Save an asset file.
72
73	Args:
74	----
75	data (bytes):
76	The asset data.
77	filename (str):
78	The asset filename.
79	"""
80	logger.trace_(msg=filename)	×
81
82	(self.path / self.path_assets / filename).write_bytes(data=data)	×
83
84	def sync_images(	×
85	self: Clone,
86	session: Session,
87	) -> None:
88	"""Sync the images from the cloned website.
89
90	Args:
91	----
92	session (Session):
93	The requests session.
94	"""
95	images: ResultSet[Tag] = self.soup.find_all(name="img")	×
96
97	for image in images:	×
98	if ("src" not in image.attrs and "data-cfsrc" not in image.attrs) or image[	×
99	"src"
100	].startswith("https"):
101	continue	×
102
103	source = image["src"] if "src" in image.attrs else image["data-cfsrc"]	×
104	path = Path(source)	×
105	index = path.suffix.find("?")	×
106	filename = path.name if index < 0 else path.stem + path.suffix[0:index]	×
107	image["src"] = self.path_assets / filename	×
108	self.assets.append(filename)	×
109	response = session.request(url=self.domain + source)	×
110
111	if response is not None:	×
112	self.save_asset(	×
113	data=response.content,
114	filename=filename,
115	)
116
117	def sync_links(	×
118	self: Clone,
119	session: Session,
120	) -> None:
121	"""Sync the links from the cloned website.
122
123	Args:
124	----
125	session (Session):
126	The requests session.
127	"""
128	stylesheets: ResultSet[Tag] = self.soup.find_all(name="link")	×
129
130	for stylesheet in stylesheets:	×
131	if "href" not in stylesheet.attrs or stylesheet["href"].startswith("https"):	×
132	continue	×
133
134	source: str = stylesheet["href"]	×
135	path = Path(source)	×
136	index = path.suffix.find("?")	×
137	filename = path.name if index < 0 else path.stem + path.suffix[0:index]	×
138	self.assets.append(filename)	×
139	stylesheet["href"] = self.path_assets / filename	×
140	response = session.request(url=self.domain + source)	×
141
142	if response is not None:	×
143	self.save_asset(	×
144	data=response.content,
145	filename=filename,
146	)
147
148	def sync_scripts(	×
149	self: Clone,
150	session: Session,
151	*,
152	nosync: bool = True,
153	) -> None:
154	"""Sync the scripts from the cloned website.
155
156	Args:
157	----
158	session (Session):
159	The requests session.
160	nosync (bool, optional):
161	If true, remove the scripts. Defaults to True.
162	"""
163	noscripts: ResultSet[Tag] = self.soup.find_all(name="noscript")	×
164	for noscript in noscripts:	×
165	noscript.extract()	×
166
167	scripts: ResultSet[Tag] = self.soup.find_all(name="script")	×
168	for script in scripts:	×
169	if nosync or "src" not in script.attrs or script["src"].startswith("https"):	×
170	script.extract()	×
171	continue	×
172
173	source: str = script["src"]	×
174	path = Path(source)	×
175	index = path.suffix.find("?")	×
176	filename = path.name if index < 0 else path.stem + path.suffix[0:index]	×
177	self.assets.append(filename)	×
178	script["src"] = self.path_assets / filename	×
179	response = session.request(url=self.domain + source)	×
180
181	if response is not None:	×
182	self.save_asset(	×
183	data=response.content,
184	filename=filename,
185	)
186
187	def sync_fonts(	×
188	self: Clone,
189	session: Session,
190	) -> None:
191	"""Sync the fonts from the cloned website.
192
193	Args:
194	session (Session):
195	The requests session.
196	"""
197	stylesheets = list(	×
198	filter(lambda filename: filename.endswith("css"), self.assets),
199	)
200
201	for stylesheet in stylesheets:	×
202	path_stylesheet = self.path / self.path_assets / stylesheet	×
203	content = path_stylesheet.read_text()	×
204
205	sources: list[str] = re.findall(r"src: ?url\(([^)]+)\)", string=content)	×
206
207	for source in sources:	×
208	url = source.replace('"', "")	×
209	response = session.request(self.domain + url)	×
210
211	if response is not None:	×
212	path = Path(url)	×
213	index = path.suffix.find("?")	×
214	filename = (	×
215	path.name if index < 0 else path.stem + path.suffix[0:index]
216	)
217	self.assets.append(filename)	×
218	path_stylesheet.write_text(data=content.replace(url, filename))	×
219	self.save_asset(	×
220	data=response.content,
221	filename=filename,
222	)

MenSeb / python-playground / 6685844579

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous