14430722668

Committed 13 Apr 2025 03:12PM UTC coverage: 87.512%. First build

Build # 14430722668

Build Type

Pull #94

github

Committed by

web-flow

Commit Message

Merge e3289599b into 265e9c2b5

Pull Request Pull Request #94: Ingestion v2.0.0

Run Details

512 of 605 new or added lines in 13 files covered. (84.63%)

890 of 1017 relevant lines covered (87.51%)

0.88 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.88

/src/utils.py

from datetime import date, datetime
import json
import logging
import os
import re

import awswrangler as wr
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np
import pandas as pd
import requests


def filter_spread(value: str) -> str:
    """
    Filter out 3-digit values from the `spread` column
    in the Scrape Odds Function such as `-108` or `-112`

    Parameters:
        value (str): The original value from the spread column.

    Returns:
        The spread value without any 3-digit values present
    """
    parts = value.split()
    filtered_parts = [
        (
            part
            if (part[0] in ["+", "-"] and float(part[1:]) <= 25)
            or (part.isdigit() and int(part) <= 25)
            else ""
        )
        for part in parts
    ]
    result = " ".join(filtered_parts).strip()

    # this last part strips out a couple extra white spaces
    return re.sub(r"\s+", " ", result)


def get_season_type(todays_date: date | None = None) -> str:
    """
    Function to generate Season Type for a given Date.
    **2025-03-16 NOTE** this has been deprecated as this logic
    belongs in the dbt project

    Args:
        todays_date (date): The Date to generate a Season Type for.  Defaults to
            today's date.

    Returns:
        The Season Type for Given Date
    """
    if todays_date is None:
        todays_date = datetime.now().date()

    if todays_date < datetime(2025, 4, 15).date():
        season_type = "Regular Season"
    elif (todays_date >= datetime(2025, 4, 16).date()) & (
        todays_date < datetime(2025, 4, 21).date()
    ):
        season_type = "Play-In"
    else:
        season_type = "Playoffs"

    return season_type


def check_schedule(date: datetime.date) -> bool:
    """
    Small Function used in Boxscores + PBP Functions to check if
    there are any games scheduled for a given date.

    Args:
        date (datetime.date): The Date to check for games on.

    Returns:
        Boolean: True if there are games scheduled, False if not.
    """
    schedule_endpoint = f"https://api.jyablonski.dev/schedule?date={date}"
    schedule_data = requests.get(schedule_endpoint).json()

    return True if len(schedule_data) > 0 else False


def add_sentiment_analysis(df: pd.DataFrame, sentiment_col: str) -> pd.DataFrame:
    """
    Function to add Sentiment Analysis columns to a DataFrame via nltk Vader Lexicon.

    Args:
        df (pd.DataFrame): The Pandas DataFrame

        sentiment_col (str): The Column in the DataFrame to run Sentiment Analysis on
            (comments / tweets etc).

    Returns:
        The same DataFrame but with the Sentiment Analysis columns attached.
    """
    try:
        analyzer = SentimentIntensityAnalyzer()
        df["compound"] = [
            analyzer.polarity_scores(x)["compound"] for x in df[sentiment_col]
        ]
        df["neg"] = [analyzer.polarity_scores(x)["neg"] for x in df[sentiment_col]]
        df["neu"] = [analyzer.polarity_scores(x)["neu"] for x in df[sentiment_col]]
        df["pos"] = [analyzer.polarity_scores(x)["pos"] for x in df[sentiment_col]]
        df["sentiment"] = np.where(df["compound"] > 0, 1, 0)
        return df
    except Exception as e:
        logging.error(f"Error Occurred while adding Sentiment Analysis, {e}")
        raise


def get_leading_zeroes(value: int) -> str:
    """
    Function to add leading zeroes to a month (1 (January) -> 01).
    Used in the the `write_to_s3` function.

    Args:
        value (int): The value integer (created from `datetime.now().month`)

    Returns:
        The same value integer with a leading 0 if it is less than 10
            (Nov/Dec aka 11/12 unaffected).
    """
    if len(str(value)) > 1:
        return str(value)
    else:
        return f"0{value}"


def clean_player_names(name: str) -> str:
    """
    Function to remove suffixes from a player name.

    Args:
        name (str): The raw player name you wish to alter.

    Returns:
        str: Cleaned Name w/ no suffix bs
    """
    try:
        cleaned_name = (
            name.replace(" Jr.", "")
            .replace(" Sr.", "")
            .replace(" III", "")  # III HAS TO GO FIRST, OVER II
            .replace(" II", "")  # or else Robert Williams III -> Robert WilliamsI
            .replace(" IV", "")
        )
        return cleaned_name
    except Exception as e:
        logging.error(f"Error Occurred with Clean Player Names, {e}")
        raise


def write_to_s3(
    file_name: str,
    df: pd.DataFrame,
    date: datetime.date = datetime.now().date(),
    bucket: str = os.environ.get("S3_BUCKET"),
) -> None:
    """
    S3 Function using awswrangler to write file.  Only supports parquet right now.

    Args:
        file_name (str): The base name of the file (boxscores, opp_stats)

        df (pd.DataFrame): The Pandas DataFrame to write to S3

        bucket (str): The Bucket to write to.  Defaults to `os.environ.get('S3_BUCKET')`

        date (datetime.date): Date to partition the data by.
            Defaults to `datetime.now().date()`

    Returns:
        None, but writes the Pandas DataFrame to an S3 File.

    """
    year_partition = date.year
    month_partition = get_leading_zeroes(value=date.month)
    file_name_jn = f"{file_name}-{date}"
    try:
        if len(df) == 0:
            logging.info(f"Not storing {file_name} to s3 because it's empty.")
            pass
        else:
            wr.s3.to_parquet(
                df=df,
                path=f"s3://{bucket}/{file_name}/validated/year={year_partition}/month={month_partition}/{file_name_jn}.parquet",
                index=False,
            )
            logging.info(
                f"Storing {len(df)} {file_name} rows to S3 (s3://{bucket}/{file_name}/validated/{year_partition}/{file_name_jn}.parquet)"
            )
            return None
    except Exception as error:
        logging.error(f"S3 Storage Function Failed {file_name}, {error}")
        return None


def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:
    """
    Simple Wrapper Function to write a Pandas DataFrame to SQL

    Args:
        con (SQL Connection): The connection to the SQL DB.

        table_name (str): The Table name to write to SQL as.

        df (DataFrame): The Pandas DataFrame to store in SQL

        table_type (str): Whether the table should replace or append to an
            existing SQL Table under that name

    Returns:
        Writes the Pandas DataFrame to a Table in the Schema we connected to.

    """
    try:
        if len(df) == 0:
            logging.info(f"{table_name} is empty, not writing to SQL")
        else:
            df.to_sql(
                con=con,
                name=table_name,
                index=False,
                if_exists=table_type,
            )
            logging.info(
                f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
            )

        return None
    except Exception as error:
        logging.error(f"SQL Write Script Failed, {error}")
        return None


def query_logs(log_file: str = "logs/example.log") -> list[str]:
    """
    Small Function to read Logs CSV File and grab Errors

    Args:
        log_file (str): Optional String of the Log File Name

    Returns:
        list of Error Messages to be passed into Slack Function
    """
    logs = pd.read_csv(log_file, sep=r"\\t", engine="python", header=None)
    logs = logs.rename(columns={0: "errors"})
    logs = logs.query("errors.str.contains('Failed')", engine="python")
    logs = logs["errors"].to_list()

    logging.info(f"Returning {len(logs)} Failed Logs")
    return logs


def write_to_slack(
    errors: list, webhook_url: str = os.environ.get("WEBHOOK_URL", default="default")
) -> int | None:
    """ "
    Function to write Errors out to Slack.  Requires a pre-configured `webhook_url`
    to be setup.

    Args:
        errors (list): The list of Failed Tasks + their associated errors

        webhook_url (str): Optional Parameter to specify the Webhook to send the
            errors to.  Defaults to `os.environ.get("WEBHOOK_URL")`

    Returns:
        None, but writes the Errors to Slack if there are any
    """
    try:
        date = datetime.now().date()
        num_errors = len(errors)
        str_dump = "\n".join(errors)

        if num_errors > 0:
            response = requests.post(
                webhook_url,
                data=json.dumps(
                    {
                        "text": (
                            f"\U0001f6d1 {num_errors} Errors during NBA ELT "
                            f"Ingestion on {date}: \n {str_dump}"
                        )
                    }
                ),
                headers={"Content-Type": "application/json"},
            )
            logging.info(
                f"Wrote Errors to Slack, Reponse Code {response.status_code}. "
                "Exiting ..."
            )
            return response.status_code
        else:
            logging.info("No Error Logs, not writing to Slack.  Exiting out ...")
            return None
    except Exception as e:
        logging.error(f"Error Writing to Slack, {e}")
        raise

1	from datetime import date, datetime	1✔
2	import json	1✔
3	import logging	1✔
4	import os	1✔
5	import re	1✔
6
7	import awswrangler as wr	1✔
8	from nltk.sentiment import SentimentIntensityAnalyzer	1✔
9	import numpy as np	1✔
10	import pandas as pd	1✔
11	import requests	1✔
12
13
14	def filter_spread(value: str) -> str:	1✔
15	"""
16	Filter out 3-digit values from the `spread` column
17	in the Scrape Odds Function such as `-108` or `-112`
18
19	Parameters:
20	value (str): The original value from the spread column.
21
22	Returns:
23	The spread value without any 3-digit values present
24	"""
25	parts = value.split()	1✔
26	filtered_parts = [	1✔
27	(
28	part
29	if (part[0] in ["+", "-"] and float(part[1:]) <= 25)
30	or (part.isdigit() and int(part) <= 25)
31	else ""
32	)
33	for part in parts
34	]
35	result = " ".join(filtered_parts).strip()	1✔
36
37	# this last part strips out a couple extra white spaces
38	return re.sub(r"\s+", " ", result)	1✔
39
40
41	def get_season_type(todays_date: date \| None = None) -> str:	1✔
42	"""
43	Function to generate Season Type for a given Date.
44	2025-03-16 NOTE this has been deprecated as this logic
45	belongs in the dbt project
46
47	Args:
48	todays_date (date): The Date to generate a Season Type for. Defaults to
49	today's date.
50
51	Returns:
52	The Season Type for Given Date
53	"""
54	if todays_date is None:	1✔
55	todays_date = datetime.now().date()	×
56
57	if todays_date < datetime(2025, 4, 15).date():	1✔
58	season_type = "Regular Season"	1✔
59	elif (todays_date >= datetime(2025, 4, 16).date()) & (	1✔
60	todays_date < datetime(2025, 4, 21).date()
61	):
62	season_type = "Play-In"	1✔
63	else:
64	season_type = "Playoffs"	1✔
65
66	return season_type	1✔
67
68
69	def check_schedule(date: datetime.date) -> bool:	1✔
70	"""
71	Small Function used in Boxscores + PBP Functions to check if
72	there are any games scheduled for a given date.
73
74	Args:
75	date (datetime.date): The Date to check for games on.
76
77	Returns:
78	Boolean: True if there are games scheduled, False if not.
79	"""
80	schedule_endpoint = f"https://api.jyablonski.dev/schedule?date={date}"	1✔
81	schedule_data = requests.get(schedule_endpoint).json()	1✔
82
83	return True if len(schedule_data) > 0 else False	1✔
84
85
86	def add_sentiment_analysis(df: pd.DataFrame, sentiment_col: str) -> pd.DataFrame:	1✔
87	"""
88	Function to add Sentiment Analysis columns to a DataFrame via nltk Vader Lexicon.
89
90	Args:
91	df (pd.DataFrame): The Pandas DataFrame
92
93	sentiment_col (str): The Column in the DataFrame to run Sentiment Analysis on
94	(comments / tweets etc).
95
96	Returns:
97	The same DataFrame but with the Sentiment Analysis columns attached.
98	"""
99	try:	1✔
100	analyzer = SentimentIntensityAnalyzer()	1✔
101	df["compound"] = [	1✔
102	analyzer.polarity_scores(x)["compound"] for x in df[sentiment_col]
103	]
104	df["neg"] = [analyzer.polarity_scores(x)["neg"] for x in df[sentiment_col]]	1✔
105	df["neu"] = [analyzer.polarity_scores(x)["neu"] for x in df[sentiment_col]]	1✔
106	df["pos"] = [analyzer.polarity_scores(x)["pos"] for x in df[sentiment_col]]	1✔
107	df["sentiment"] = np.where(df["compound"] > 0, 1, 0)	1✔
108	return df	1✔
109	except Exception as e:	×
110	logging.error(f"Error Occurred while adding Sentiment Analysis, {e}")	×
111	raise	×
112
113
114	def get_leading_zeroes(value: int) -> str:	1✔
115	"""
116	Function to add leading zeroes to a month (1 (January) -> 01).
117	Used in the the `write_to_s3` function.
118
119	Args:
120	value (int): The value integer (created from `datetime.now().month`)
121
122	Returns:
123	The same value integer with a leading 0 if it is less than 10
124	(Nov/Dec aka 11/12 unaffected).
125	"""
126	if len(str(value)) > 1:	1✔
127	return str(value)	1✔
128	else:
129	return f"0{value}"	1✔
130
131
132	def clean_player_names(name: str) -> str:	1✔
133	"""
134	Function to remove suffixes from a player name.
135
136	Args:
137	name (str): The raw player name you wish to alter.
138
139	Returns:
140	str: Cleaned Name w/ no suffix bs
141	"""
142	try:	1✔
143	cleaned_name = (	1✔
144	name.replace(" Jr.", "")
145	.replace(" Sr.", "")
146	.replace(" III", "") # III HAS TO GO FIRST, OVER II
147	.replace(" II", "") # or else Robert Williams III -> Robert WilliamsI
148	.replace(" IV", "")
149	)
150	return cleaned_name	1✔
151	except Exception as e:	×
152	logging.error(f"Error Occurred with Clean Player Names, {e}")	×
153	raise	×
154
155
156	def write_to_s3(	1✔
157	file_name: str,
158	df: pd.DataFrame,
159	date: datetime.date = datetime.now().date(),
160	bucket: str = os.environ.get("S3_BUCKET"),
161	) -> None:
162	"""
163	S3 Function using awswrangler to write file. Only supports parquet right now.
164
165	Args:
166	file_name (str): The base name of the file (boxscores, opp_stats)
167
168	df (pd.DataFrame): The Pandas DataFrame to write to S3
169
170	bucket (str): The Bucket to write to. Defaults to `os.environ.get('S3_BUCKET')`
171
172	date (datetime.date): Date to partition the data by.
173	Defaults to `datetime.now().date()`
174
175	Returns:
176	None, but writes the Pandas DataFrame to an S3 File.
177
178	"""
179	year_partition = date.year	×
180	month_partition = get_leading_zeroes(value=date.month)	×
181	file_name_jn = f"{file_name}-{date}"	×
182	try:	×
183	if len(df) == 0:	×
184	logging.info(f"Not storing {file_name} to s3 because it's empty.")	×
185	pass	×
186	else:
187	wr.s3.to_parquet(	×
188	df=df,
189	path=f"s3://{bucket}/{file_name}/validated/year={year_partition}/month={month_partition}/{file_name_jn}.parquet",
190	index=False,
191	)
192	logging.info(	×
193	f"Storing {len(df)} {file_name} rows to S3 (s3://{bucket}/{file_name}/validated/{year_partition}/{file_name_jn}.parquet)"
194	)
NEW 195	return None	×
196	except Exception as error:	×
197	logging.error(f"S3 Storage Function Failed {file_name}, {error}")	×
NEW 198	return None	×
199
200
201	def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:	1✔
202	"""
203	Simple Wrapper Function to write a Pandas DataFrame to SQL
204
205	Args:
206	con (SQL Connection): The connection to the SQL DB.
207
208	table_name (str): The Table name to write to SQL as.
209
210	df (DataFrame): The Pandas DataFrame to store in SQL
211
212	table_type (str): Whether the table should replace or append to an
213	existing SQL Table under that name
214
215	Returns:
216	Writes the Pandas DataFrame to a Table in the Schema we connected to.
217
218	"""
219	try:	1✔
220	if len(df) == 0:	1✔
221	logging.info(f"{table_name} is empty, not writing to SQL")	×
222	else:
223	df.to_sql(	1✔
224	con=con,
225	name=table_name,
226	index=False,
227	if_exists=table_type,
228	)
229	logging.info(	1✔
230	f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
231	)
232
233	return None	1✔
234	except Exception as error:	×
235	logging.error(f"SQL Write Script Failed, {error}")	×
NEW 236	return None	×
237
238
239	def query_logs(log_file: str = "logs/example.log") -> list[str]:	1✔
240	"""
241	Small Function to read Logs CSV File and grab Errors
242
243	Args:
244	log_file (str): Optional String of the Log File Name
245
246	Returns:
247	list of Error Messages to be passed into Slack Function
248	"""
249	logs = pd.read_csv(log_file, sep=r"\\t", engine="python", header=None)	1✔
250	logs = logs.rename(columns={0: "errors"})	1✔
251	logs = logs.query("errors.str.contains('Failed')", engine="python")	1✔
252	logs = logs["errors"].to_list()	1✔
253
254	logging.info(f"Returning {len(logs)} Failed Logs")	1✔
255	return logs	1✔
256
257
258	def write_to_slack(	1✔
259	errors: list, webhook_url: str = os.environ.get("WEBHOOK_URL", default="default")
260	) -> int \| None:
261	""" "
262	Function to write Errors out to Slack. Requires a pre-configured `webhook_url`
263	to be setup.
264
265	Args:
266	errors (list): The list of Failed Tasks + their associated errors
267
268	webhook_url (str): Optional Parameter to specify the Webhook to send the
269	errors to. Defaults to `os.environ.get("WEBHOOK_URL")`
270
271	Returns:
272	None, but writes the Errors to Slack if there are any
273	"""
274	try:	1✔
275	date = datetime.now().date()	1✔
276	num_errors = len(errors)	1✔
277	str_dump = "\n".join(errors)	1✔
278
279	if num_errors > 0:	1✔
280	response = requests.post(	1✔
281	webhook_url,
282	data=json.dumps(
283	{
284	"text": (
285	f"\U0001f6d1 {num_errors} Errors during NBA ELT "
286	f"Ingestion on {date}: \n {str_dump}"
287	)
288	}
289	),
290	headers={"Content-Type": "application/json"},
291	)
292	logging.info(	1✔
293	f"Wrote Errors to Slack, Reponse Code {response.status_code}. "
294	"Exiting ..."
295	)
296	return response.status_code	1✔
297	else:
298	logging.info("No Error Logs, not writing to Slack. Exiting out ...")	1✔
299	return None	1✔
300	except Exception as e:	×
301	logging.error(f"Error Writing to Slack, {e}")	×
302	raise	×

jyablonski / python_docker / 14430722668

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous