• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

jyablonski / python_docker / 12846384455

18 Jan 2025 06:20PM UTC coverage: 83.181%. First build
12846384455

Pull #88

github

web-flow
Merge 5dc50a9b2 into 961f0fcad
Pull Request #88: Ingestion v1.13.5

27 of 28 new or added lines in 12 files covered. (96.43%)

821 of 987 relevant lines covered (83.18%)

0.83 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

74.87
/src/utils.py
1
import functools
import hashlib
import json
import logging
import os
import re
import time
from datetime import date, datetime, timedelta
from typing import Any, Callable

import awswrangler as wr
import numpy as np
import pandas as pd
import praw
import requests
import sentry_sdk
from bs4 import BeautifulSoup
from nltk.sentiment import SentimentIntensityAnalyzer
from sqlalchemy.engine.base import Connection, Engine
19

20
# import tweepy
21

22
# Initialize Sentry error reporting at import time.  The DSN comes from the
# SENTRY_TOKEN env var (None disables reporting); traces_sample_rate=1.0
# sends every transaction trace to Sentry.
sentry_sdk.init(os.environ.get("SENTRY_TOKEN"), traces_sample_rate=1.0)
sentry_sdk.set_user({"email": "jyablonski9@gmail.com"})
24

25

26
def time_function(func: Callable[..., Any]) -> Callable[..., Any]:
    """
    Decorator function used to record the execution time of any
    function it's applied to.

    Args:
        func (Callable): Function to track the execution time on.

    Returns:
        Callable[..., Any]: The wrapped function that records
            the execution time.
    """

    # functools.wraps preserves the wrapped function's __name__ / __doc__ /
    # signature; without it every decorated function reported as "wrapper"
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        total_func_time = round(time.time() - start_time, 2)
        logging.info(f"{func.__name__} took {total_func_time} seconds")

        return result

    return wrapper
48

49

50
def filter_spread(value: str) -> str:
    """
    Filter out 3-digit values from the `spread` column
    in the Scrape Odds Function such as `-108` or `-112`

    Parameters:
        value (str): The original value from the spread column.

    Returns:
        The spread value without any 3-digit values present
    """

    def _keep(token: str) -> bool:
        # signed tokens like "+3.5" / "-7" are kept when their magnitude <= 25
        if token[0] in ("+", "-") and float(token[1:]) <= 25:
            return True
        # unsigned whole numbers are kept when <= 25
        return token.isdigit() and int(token) <= 25

    # dropped tokens become empty strings so relative positions survive
    kept = [token if _keep(token) else "" for token in value.split()]

    # collapse the extra whitespace left behind by dropped tokens
    return re.sub(r"\s+", " ", " ".join(kept).strip())
75

76

77
def get_season_type(todays_date: date | None = None) -> str:
1✔
78
    """
79
    Function to generate Season Type for a given Date.
80

81
    Args:
82
        todays_date (date): The Date to generate a Season Type for.  Defaults to
83
            today's date.
84

85
    Returns:
86
        The Season Type for Given Date
87
    """
88
    if todays_date is None:
1✔
89
        todays_date = datetime.now().date()
1✔
90

91
    if todays_date < datetime(2025, 4, 15).date():
1✔
92
        season_type = "Regular Season"
1✔
93
    elif (todays_date >= datetime(2025, 4, 16).date()) & (
1✔
94
        todays_date < datetime(2025, 4, 21).date()
95
    ):
96
        season_type = "Play-In"
1✔
97
    else:
98
        season_type = "Playoffs"
1✔
99

100
    return season_type
1✔
101

102

103
def add_sentiment_analysis(df: pd.DataFrame, sentiment_col: str) -> pd.DataFrame:
    """
    Function to add Sentiment Analysis columns to a DataFrame via nltk Vader Lexicon.

    Adds `compound`, `neg`, `neu`, and `pos` score columns, plus a binary
    `sentiment` column (1 when the compound score is positive, else 0).

    Args:
        df (pd.DataFrame): The Pandas DataFrame

        sentiment_col (str): The Column in the DataFrame to run Sentiment Analysis on
            (comments / tweets etc).

    Returns:
        The same DataFrame but with the Sentiment Analysis columns attached.

    Raises:
        BaseException: re-raised after logging and Sentry capture if
            scoring fails (e.g. the Vader lexicon is not downloaded).
    """
    try:
        analyzer = SentimentIntensityAnalyzer()
        # score each row exactly once; the previous version re-ran
        # polarity_scores four times per row (once per output column)
        scores = [analyzer.polarity_scores(x) for x in df[sentiment_col]]
        df["compound"] = [s["compound"] for s in scores]
        df["neg"] = [s["neg"] for s in scores]
        df["neu"] = [s["neu"] for s in scores]
        df["pos"] = [s["pos"] for s in scores]
        df["sentiment"] = np.where(df["compound"] > 0, 1, 0)
        return df
    except BaseException as e:
        logging.error(f"Error Occurred while adding Sentiment Analysis, {e}")
        sentry_sdk.capture_exception(e)
        raise e
130

131

132
def get_leading_zeroes(value: int) -> str:
    """
    Function to add leading zeroes to a month (1 (January) -> 01).
    Used in the the `write_to_s3` function.

    Args:
        value (int): The value integer (created from `datetime.now().month`)

    Returns:
        The same value as a string with a leading 0 if it is less than 10
            (Nov/Dec aka 11/12 unaffected).
    """
    # str.zfill left-pads with zeroes up to width 2; multi-digit
    # values pass through unchanged
    return str(value).zfill(2)
148

149

150
def clean_player_names(name: str) -> str:
    """
    Function to remove suffixes from a player name.

    Args:
        name (str): The raw player name you wish to alter.

    Returns:
        str: Cleaned Name w/ no suffix bs
    """
    try:
        # " III" must come before " II", or else
        # "Robert Williams III" -> "Robert WilliamsI"
        suffixes = (" Jr.", " Sr.", " III", " II", " IV")
        cleaned_name = name
        for suffix in suffixes:
            cleaned_name = cleaned_name.replace(suffix, "")
        return cleaned_name
    except BaseException as e:
        logging.error(f"Error Occurred with Clean Player Names, {e}")
        sentry_sdk.capture_exception(e)
        raise e
173

174

175
@time_function
def get_player_stats_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ BS4 that grabs aggregate season stats

    Scrapes the basketball-reference per-game table for the hard-coded
    season year, lowercases the column names, and stamps a `scrape_date`.
    Returns an empty DataFrame when the "stats" feature flag is off or on
    any scraping failure.

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to
            check whether to run this function or not

    Returns:
        DataFrame of Player Aggregate Season stats
    """
    feature_flag = "stats"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    # stats = stats.rename(columns={"fg%": "fg_pct", "3p%": "3p_pct",
    # "2p%": "2p_pct", "efg%": "efg_pct", "ft%": "ft_pct"})
    try:
        year_stats = 2025
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_per_game.html"
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # first <tr> holds the header row; drop its leading rank ("Rk") cell
        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]
        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]
        stats = pd.DataFrame(player_stats, columns=headers)
        stats["PTS"] = pd.to_numeric(stats["PTS"])
        # NaN != NaN, so this self-comparison drops rows with a null Player
        # (the repeated header/separator rows in the scraped table)
        stats = stats.query("Player == Player").reset_index()
        stats["Player"] = (
            stats["Player"]
            .str.normalize("NFKD")  # strip accented characters down to ascii
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )
        stats.columns = stats.columns.str.lower()
        stats["scrape_date"] = datetime.now().date()
        # drop the old index column from reset_index plus the awards column
        stats = stats.drop(columns=["index", "awards"], axis=1)
        logging.info(
            "General Stats Transformation Function Successful, "
            f"retrieving {len(stats)} updated rows"
        )
        return stats
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"General Stats Extraction Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
232

233

234
@time_function
def get_boxscores_data(
    feature_flags_df: pd.DataFrame,
    month: int | None = None,
    day: int | None = None,
    year: int | None = None,
) -> pd.DataFrame:
    """
    Function that grabs box scores from a given date in mmddyyyy
    format - defaults to yesterday.  values can be ex. 1 or 01.
    Can't use `read_html` for this so this is raw web scraping baby.

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to
            check whether to run this function or not

        month (int): month value of the game played (0 - 12),
            defaults to yesterday's month

        day (int): day value of the game played (1 - 31),
            defaults to yesterday's day

        year (int): year value of the game played (2021),
            defaults to yesterday's year

    Returns:
        DataFrame of Player Aggregate Season stats
    """
    # resolve "yesterday" at call time.  The previous computed parameter
    # defaults were evaluated once at module import, so a long-running
    # process would keep scraping the same stale date forever.
    yesterday = datetime.now() - timedelta(1)
    month = yesterday.month if month is None else month
    day = yesterday.day if day is None else day
    year = yesterday.year if year is None else year

    day = get_leading_zeroes(value=day)
    month = get_leading_zeroes(value=month)

    feature_flag = "boxscores"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    url = f"https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={month}&day={day}&year={year}&type=all"
    season_type = get_season_type()
    date = f"{year}-{month}-{day}"

    try:
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # first <tr> is the header row; drop its leading rank cell, then
        # rename the positional site headers to stable column names
        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]
        headers[1] = "Team"
        headers[2] = "Location"
        headers[3] = "Opponent"
        headers[4] = "Outcome"
        headers[6] = "FGM"
        headers[8] = "FGPercent"
        headers[9] = "threePFGMade"
        headers[10] = "threePAttempted"
        headers[11] = "threePointPercent"
        headers[14] = "FTPercent"
        headers[15] = "OREB"
        headers[16] = "DREB"
        headers[24] = "PlusMinus"

        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]

        df = pd.DataFrame(player_stats, columns=headers)

        # cast every numeric stat column in one pass (this list was
        # previously duplicated inline on both sides of the assignment)
        stat_cols = [
            "FGM",
            "FGA",
            "FGPercent",
            "threePFGMade",
            "threePAttempted",
            "threePointPercent",
            "OREB",
            "DREB",
            "TRB",
            "AST",
            "STL",
            "BLK",
            "TOV",
            "PF",
            "PTS",
            "PlusMinus",
            "GmSc",
        ]
        df[stat_cols] = df[stat_cols].apply(pd.to_numeric)
        df["date"] = str(year) + "-" + str(month) + "-" + str(day)
        df["date"] = pd.to_datetime(df["date"])
        df["Type"] = season_type
        df["Location"] = df["Location"].apply(lambda x: "A" if x == "@" else "H")
        # normalize basketball-reference team codes to the standard ones
        df["Team"] = df["Team"].str.replace("PHO", "PHX")
        df["Team"] = df["Team"].str.replace("CHO", "CHA")
        df["Team"] = df["Team"].str.replace("BRK", "BKN")
        df["Opponent"] = df["Opponent"].str.replace("PHO", "PHX")
        df["Opponent"] = df["Opponent"].str.replace("CHO", "CHA")
        df["Opponent"] = df["Opponent"].str.replace("BRK", "BKN")
        # NaN != NaN, so this drops rows with a null Player
        df = df.query("Player == Player").reset_index(drop=True)
        df["Player"] = (
            df["Player"]
            .str.normalize("NFKD")  # this is removing all accented characters
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )
        df["scrape_date"] = datetime.now().date()
        df.columns = df.columns.str.lower()
        logging.info(
            "Box Score Transformation Function Successful, "
            f"retrieving {len(df)} rows for {date}"
        )
        return df
    except IndexError as error:
        # an IndexError while parsing usually means the page had no table;
        # hit the schedule API to distinguish "data not posted yet"
        # from "no games were played that day"
        schedule_endpoint = f"https://api.jyablonski.dev/schedule?date={date}"
        schedule_data = requests.get(schedule_endpoint).json()

        if len(schedule_data) > 0:
            logging.error(
                f"""Box Scores Function Failed, no Data Available yet for {date}"""
            )
            df = pd.DataFrame()
            return df
        else:
            logging.info(f"No Games were played on {date}; no Box Scores to pull")
            df = pd.DataFrame()
            return df

    except BaseException as error:
        logging.error(f"Box Scores Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
389

390

391
@time_function
def get_opp_stats_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ pandas read_html that grabs all
        regular season opponent team stats

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame
            to check whether to run this function or not

    Returns:
        Pandas DataFrame of all current team opponent stats
    """
    feature_flag = "opp_stats"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    # yesterday's date components are only used in the success log message
    year = (datetime.now() - timedelta(1)).year
    month = (datetime.now() - timedelta(1)).month
    day = (datetime.now() - timedelta(1)).day
    year_stats = 2025

    try:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}.html"
        # table index 5 on this page is the opponent per-game stats table
        df = pd.read_html(url)[5]
        df = df[["Team", "FG%", "3P%", "3P", "PTS"]]
        # rename positionally to snake_case *_opp column names
        df = df.rename(
            columns={
                df.columns[0]: "team",
                df.columns[1]: "fg_percent_opp",
                df.columns[2]: "threep_percent_opp",
                df.columns[3]: "threep_made_opp",
                df.columns[4]: "ppg_opp",
            }
        )
        # drop the aggregate summary row so only the 30 teams remain
        df = df.query('team != "League Average"')
        df = df.reset_index(drop=True)
        df["scrape_date"] = datetime.now().date()
        logging.info(
            "Opp Stats Transformation Function Successful, "
            f"retrieving {len(df)} rows for {year}-{month}-{day}"
        )
        return df
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"Opp Stats Web Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
445

446

447
@time_function
def get_injuries_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ pandas read_html that grabs all current injuries

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check
            whether to run this function or not

    Returns:
        Pandas DataFrame of all current player injuries & their associated team
    """
    feature_flag = "injuries"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    try:
        url = "https://www.basketball-reference.com/friv/injuries.fcgi"
        # the first table on the page is the injuries table
        df = pd.read_html(url)[0]
        df = df.rename(columns={"Update": "Date"})
        df.columns = df.columns.str.lower()
        df["scrape_date"] = datetime.now().date()
        df["player"] = (
            df["player"]
            .str.normalize("NFKD")  # this is removing all accented characters
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )
        # strip Jr. / Sr. / roman-numeral suffixes so names join cleanly
        # against other scraped sources
        df["player"] = df["player"].apply(clean_player_names)
        df = df.drop_duplicates()
        logging.info(
            f"Injury Transformation Function Successful, retrieving {len(df)} rows"
        )
        return df
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"Injury Web Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
492

493

494
@time_function
def get_transactions_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ BS4 that retrieves NBA Trades, signings, waivers etc.

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

    Returns:
        Pandas DataFrame of all season transactions, trades, player waives etc.
    """
    feature_flag = "transactions"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    try:
        url = "https://www.basketball-reference.com/leagues/NBA_2025_transactions.html"
        html = requests.get(url).content
        soup = BeautifulSoup(html, "html.parser")
        # theres a bunch of garbage in the first 50 rows - no matter what
        trs = soup.findAll("li")[70:]
        rows = []
        mylist = []
        for tr in trs:
            # each <li> begins with a <span> holding that day's date
            date = tr.find("span")
            # needed bc span can be null (multi <p> elements per span)
            if date is not None:
                date = date.text
            data = tr.findAll("p")
            for p in data:
                mylist.append(p.text)
            # pair the date with the list of transaction blurbs for that day
            data3 = [date] + [mylist]
            rows.append(data3)
            mylist = []

        transactions = pd.DataFrame(rows)
        transactions.columns = ["Date", "Transaction"]
        transactions = transactions.query(
            'Date == Date & Date != ""'
        ).reset_index()  # filters out nulls and empty values
        # fan the per-day lists out to one row per individual transaction
        transactions = transactions.explode("Transaction")
        transactions["Date"] = transactions["Date"].str.replace(
            "\\?", "October 1, 2024", regex=True  # bad data 10-14-21
        )
        transactions["Date"] = pd.to_datetime(transactions["Date"])
        transactions.columns = transactions.columns.str.lower()
        transactions = transactions[["date", "transaction"]]
        transactions["scrape_date"] = datetime.now().date()
        transactions = transactions.drop_duplicates()
        logging.info(
            "Transactions Transformation Function Successful, "
            f"retrieving {len(transactions)} rows"
        )
        return transactions
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"Transaction Web Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
560

561

562
@time_function
def get_advanced_stats_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ pandas read_html that grabs all team advanced stats

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check
            whether to run this function or not

    Returns:
        DataFrame of all current Team Advanced Stats
    """
    feature_flag = "adv_stats"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    year_stats = 2025
    try:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}.html"
        df = pd.read_html(url)
        # table index 10 on this page is the advanced team stats table
        df = pd.DataFrame(df[10])
        # drop the leading rank column before applying the fixed header list
        df.drop(columns=df.columns[0], axis=1, inplace=True)
        df.columns = [
            "Team",
            "Age",
            "W",
            "L",
            "PW",
            "PL",
            "MOV",
            "SOS",
            "SRS",
            "ORTG",
            "DRTG",
            "NRTG",
            "Pace",
            "FTr",
            "3PAr",
            "TS%",
            "bby1",  # the bby columns are because of hierarchical html formatting
            "eFG%",
            "TOV%",
            "ORB%",
            "FT/FGA",
            "bby2",
            "eFG%_opp",
            "TOV%_opp",
            "DRB%_opp",
            "FT/FGA_opp",
            "bby3",
            "Arena",
            "Attendance",
            "Att/Game",
        ]
        df.drop(["bby1", "bby2", "bby3"], axis=1, inplace=True)
        df = df.query('Team != "League Average"').reset_index()
        # Playoff teams get a * next to them ??  fkn stupid, filter it out.
        df["Team"] = df["Team"].str.replace("\\*", "", regex=True)
        df["scrape_date"] = datetime.now().date()
        df.columns = df.columns.str.lower()
        logging.info(
            """
            Advanced Stats Transformation Function Successful,
            retrieving updated data for 30 Teams
            """
        )
        return df
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"Advanced Stats Web Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
640

641

642
@time_function
def get_shooting_stats_data(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ pandas read_html that grabs all raw shooting stats

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

    Returns:
        DataFrame of raw shooting stats
    """
    feature_flag = "shooting_stats"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    year_stats = 2025
    try:
        url = f"https://www.basketball-reference.com/leagues/NBA_{year_stats}_shooting.html"
        df = pd.read_html(url)[0]
        # the page uses a two-level header; flatten it so columns can be
        # addressed positionally below
        df.columns = df.columns.to_flat_index()
        # rename the positional columns to stable snake_case names, then
        # select only those renamed columns
        df = df.rename(
            columns={
                df.columns[1]: "player",
                df.columns[6]: "mp",
                df.columns[8]: "avg_shot_distance",
                df.columns[10]: "pct_fga_2p",
                df.columns[11]: "pct_fga_0_3",
                df.columns[12]: "pct_fga_3_10",
                df.columns[13]: "pct_fga_10_16",
                df.columns[14]: "pct_fga_16_3p",
                df.columns[15]: "pct_fga_3p",
                df.columns[17]: "fg_pct_0_3",
                df.columns[18]: "fg_pct_3_10",
                df.columns[19]: "fg_pct_10_16",
                df.columns[20]: "fg_pct_16_3p",
                df.columns[22]: "pct_2pfg_ast",
                df.columns[23]: "pct_3pfg_ast",
                df.columns[24]: "dunk_pct_tot_fg",
                df.columns[25]: "dunks",
                df.columns[26]: "corner_3_ast_pct",
                df.columns[27]: "corner_3pm_pct",
                df.columns[28]: "heaves_att",
                df.columns[29]: "heaves_makes",
            }
        )[
            [
                "player",
                "mp",
                "avg_shot_distance",
                "pct_fga_2p",
                "pct_fga_0_3",
                "pct_fga_3_10",
                "pct_fga_10_16",
                "pct_fga_16_3p",
                "pct_fga_3p",
                "fg_pct_0_3",
                "fg_pct_3_10",
                "fg_pct_10_16",
                "fg_pct_16_3p",
                "pct_2pfg_ast",
                "pct_3pfg_ast",
                "dunk_pct_tot_fg",
                "dunks",
                "corner_3_ast_pct",
                "corner_3pm_pct",
                "heaves_att",
                "heaves_makes",
            ]
        ]
        # drop the repeated in-table header rows
        df = df.query('player != "Player"').copy()
        df["mp"] = pd.to_numeric(df["mp"])
        # players can appear multiple times (e.g. mid-season trades);
        # keep the row with the most minutes played, then drop the mp column
        df = (
            df.sort_values(["mp"], ascending=False)
            .groupby("player")
            .first()
            .reset_index()
            .drop("mp", axis=1)
        )
        df["player"] = (
            df["player"]
            .str.normalize("NFKD")  # this is removing all accented characters
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )
        df["player"] = df["player"].apply(clean_player_names)
        df["scrape_date"] = datetime.now().date()
        df["scrape_ts"] = datetime.now()
        logging.info(
            "Shooting Stats Transformation Function Successful, "
            f"retrieving {len(df)} rows"
        )
        return df
    except BaseException as error:
        # best-effort scrape: log + report to Sentry, return empty frame
        logging.error(f"Shooting Stats Web Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        df = pd.DataFrame()
        return df
746

747

748
@time_function
def scrape_odds(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
    """
    Function to web scrape Gambling Odds from covers.com

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

    Returns:
        DataFrame of Gambling Odds for Today's Games.  Always returns a
        DataFrame: empty when the feature flag is disabled, when no games
        are on today, or when the scrape fails.
    """
    feature_flag = "odds"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    try:
        url = "https://www.covers.com/sport/basketball/nba/odds"
        df = pd.read_html(url)
        odds = df[0]
        odds["spread"] = df[3].iloc[:, 4]  # 5th column in df[3]
        # Select columns by index: First column (index 0), 5th column (index 4), and 'spread'
        odds = odds.iloc[:, [0, 4, -1]]
        # Rename the selected columns
        odds = odds.rename(
            columns={
                odds.columns[0]: "datetime1",  # Rename first column
                odds.columns[1]: "moneyline",  # Rename second column
            }
        )
        # filter out any records not from today
        # (`datetime1 == datetime1` drops NaN rows, since NaN != NaN)
        odds = odds.query(
            "datetime1 != 'FINAL' and datetime1 == datetime1 and datetime1.str.contains('Today')",
            engine="python",
        ).copy()
        # PK is a pick em game, so we'll set the spread to -1.0
        odds["spread"] = odds["spread"].str.replace("PK", "-1.0")
        if len(odds) == 0:
            logging.info("No Odds Records available for today's games")
            # BUG FIX: previously returned a bare list (`[]`), breaking the
            # documented pd.DataFrame return contract for callers.
            return pd.DataFrame()

        odds["spread"] = odds["spread"].apply(filter_spread)
        odds["spread"] = odds["spread"].apply(lambda x: " ".join(x.split()))
        odds["datetime1"] = odds["datetime1"].str.replace("Today, ", "")
        odds_final = odds[["datetime1", "spread", "moneyline"]].copy()

        # \b: Word boundary anchor, ensures that the match occurs at a word boundary.
        # (: Start of a capturing group.
        # [A-Z]: Character class matching any uppercase letter from 'A' to 'Z'.
        # {2,3}: Quantifier specifying that the preceding character class [A-Z] should appear 2 to 3 times.
        # ): End of the capturing group.
        # \b: Word boundary anchor, again ensuring that the match occurs at a word boundary.

        pattern = r"\b([A-Z]{2,3})\b"

        # pull out the 2- or 3-letter team abbreviations embedded in datetime1
        odds_final["team"] = (
            odds_final["datetime1"]
            .str.extractall(pattern)
            .unstack()
            .apply(lambda x: " ".join(x.dropna()), axis=1)
        )

        # turning the space separated elements in a list, then exploding that list
        # so each game row becomes one row per team
        odds_final["team"] = odds_final["team"].str.split(" ", n=1, expand=False)
        odds_final["spread"] = odds_final["spread"].str.split(" ", n=1, expand=False)
        odds_final["moneyline"] = odds_final["moneyline"].str.split(
            " ", n=1, expand=False
        )
        odds_final = odds_final.explode(["team", "spread", "moneyline"]).reset_index()
        odds_final = odds_final.drop("index", axis=1)
        odds_final["date"] = datetime.now().date()
        odds_final["spread"] = odds_final[
            "spread"
        ].str.strip()  # strip trailing and leading spaces
        odds_final["moneyline"] = odds_final["moneyline"].str.strip()
        odds_final["time"] = odds_final["datetime1"].str.split().str[1]
        odds_final["datetime1"] = pd.to_datetime(
            (datetime.now().date().strftime("%Y-%m-%d") + " " + odds_final["time"]),
            format="%Y-%m-%d %H:%M",
        )

        # NOTE(review): total is hard-coded to 200 here — presumably a
        # placeholder until real over/under data is scraped; confirm intent.
        odds_final["total"] = 200
        odds_final["team"] = odds_final["team"].str.replace("BK", "BKN")
        odds_final["moneyline"] = odds_final["moneyline"].str.replace(
            "\\+", "", regex=True
        )
        odds_final["moneyline"] = odds_final["moneyline"].astype("int")
        odds_final = odds_final[
            ["team", "spread", "total", "moneyline", "date", "datetime1"]
        ]
        logging.info(
            f"Odds Scrape Successful, returning {len(odds_final)} records "
            f"from {len(odds_final) // 2} games Today"
        )
        return odds_final
    except BaseException as e:
        logging.error(f"Odds Function Web Scrape Failed, {e}")
        sentry_sdk.capture_exception(e)
        df = pd.DataFrame()
        return df
854

855

856
# def get_odds_data() -> pd.DataFrame:
857
#     """
858
#     *********** DEPRECATED AS OF 2022-10-19 ***********
859

860
#     Web Scrape function w/ pandas read_html that grabs current day's
861
#         nba odds in raw format. There are 2 objects [0], [1] if the days
862
#         are split into 2.  AWS ECS operates in UTC time so the game start
863
#         times are actually 5-6+ hours ahead of what they actually are, so
864
#         there are 2 html tables.
865

866
#     Args:
867
#         None
868

869
#     Returns:
870
#         Pandas DataFrame of NBA moneyline + spread odds for upcoming games
871
#    for that day
872
#     """
873
#     year = (datetime.now() - timedelta(1)).year
874

875
#     try:
876
#         url = "https://sportsbook.draftkings.com/leagues/basketball/nba"
877
#         df = pd.read_html(url)
878
#         if len(df) == 0:
879
#             logging.info("Odds Transformation Failed, no Odds Data available.")
880
#             df = pd.DataFrame()
881
#             return df
882
#         else:
883
#             try:
884
#                 data1 = df[0].copy()
885
#                 data1.columns.values[0] = "Tomorrow"
886
#                 date_try = str(year) + " " + data1.columns[0]
887
#                 data1["date"] = np.where(
888
#                     date_try == "2022 Tomorrow",
889
#                     datetime.now().date(),  # if the above is true, then return this
890
#                     str(year) + " " + data1.columns[0],  # if false then return this
891
#                 )
892
#                 # )
893
#                 date_try = data1["date"].iloc[0]
894
#                 data1.reset_index(drop=True)
895
#                 data1["Tomorrow"] = data1["Tomorrow"].str.replace(
896
#                     "LA Clippers", "LAC Clippers", regex=True
897
#                 )
898

899
#                 data1["Tomorrow"] = data1["Tomorrow"].str.replace(
900
#                     "AM", "AM ", regex=True
901
#                 )
902
#                 data1["Tomorrow"] = data1["Tomorrow"].str.replace(
903
#                     "PM", "PM ", regex=True
904
#                 )
905
#                 data1["Time"] = data1["Tomorrow"].str.split().str[0]
906
#                 data1["datetime1"] = (
907
#                     pd.to_datetime(date_try.strftime("%Y-%m-%d") + " "
908
#                                   + data1["Time"])
909
#                     - timedelta(hours=6)
910
#                     + timedelta(days=1)
911
#                 )
912
#                 if len(df) > 1:  # if more than 1 day's data appears then do this
913
#                     data2 = df[1].copy()
914
#                     data2.columns.values[0] = "Tomorrow"
915
#                     data2.reset_index(drop=True)
916
#                     data2["Tomorrow"] = data2["Tomorrow"].str.replace(
917
#                         "LA Clippers", "LAC Clippers", regex=True
918
#                     )
919
#                     data2["Tomorrow"] = data2["Tomorrow"].str.replace(
920
#                         "AM", "AM ", regex=True
921
#                     )
922
#                     data2["Tomorrow"] = data2["Tomorrow"].str.replace(
923
#                         "PM", "PM ", regex=True
924
#                     )
925
#                     data2["Time"] = data2["Tomorrow"].str.split().str[0]
926
#                     data2["datetime1"] = (
927
#                         pd.to_datetime(
928
#                             date_try.strftime("%Y-%m-%d") + " " + data2["Time"]
929
#                         )
930
#                         - timedelta(hours=6)
931
#                         + timedelta(days=1)
932
#                     )
933
#                     data2["date"] = data2["datetime1"].dt.date
934

935
#                     data = pd.concat([data1, data2])
936
#                     data["SPREAD"] = data["SPREAD"].str[:-4]
937
#                     data["TOTAL"] = data["TOTAL"].str[:-4]
938
#                     data["TOTAL"] = data["TOTAL"].str[2:]
939
#                     data["Tomorrow"] = data["Tomorrow"].str.split().str[1:2]
940
#                     data["Tomorrow"] = pd.DataFrame(
941
#                         [
942
#                             str(line).strip("[").strip("]").replace("'", "")
943
#                             for line in data["Tomorrow"]
944
#                         ]
945
#                     )
946
#                     data["SPREAD"] = data["SPREAD"].str.replace("pk", "-1",
947
#                           regex=True)
948
#                     data["SPREAD"] = data["SPREAD"].str.replace("+", "", regex=True)
949
#                     data.columns = data.columns.str.lower()
950
#                     data = data[
951
#                         [
952
#                             "tomorrow",
953
#                             "spread",
954
#                             "total",
955
#                             "moneyline",
956
#                             "date",
957
#                             "datetime1",
958
#                         ]
959
#                     ]
960
#                     data = data.rename(columns={data.columns[0]: "team"})
961
#                     data = data.query(
962
#                         "date == date.min()"
963
#                     )  # only grab games from upcoming day
964
#                     logging.info(
965
#                         f"""Odds Transformation Function Successful {len(df)} day, \
966
#                         retrieving {len(data)} rows"""
967
#                     )
968
#                     return data
969
#                 else:  # if there's only 1 day of data then just use that
970
#                     data = data1.reset_index(drop=True)
971
#                     data["SPREAD"] = data["SPREAD"].str[:-4]
972
#                     data["TOTAL"] = data["TOTAL"].str[:-4]
973
#                     data["TOTAL"] = data["TOTAL"].str[2:]
974
#                     data["Tomorrow"] = data["Tomorrow"].str.split().str[1:2]
975
#                     data["Tomorrow"] = pd.DataFrame(
976
#                         [
977
#                             str(line).strip("[").strip("]").replace("'", "")
978
#                             for line in data["Tomorrow"]
979
#                         ]
980
#                     )
981
#                     data["SPREAD"] = data["SPREAD"].str.replace("pk", "-1",
982
#                        regex=True)
983
#                     data["SPREAD"] = data["SPREAD"].str.replace("+", "", regex=True)
984
#                     data.columns = data.columns.str.lower()
985
#                     data = data[
986
#                         [
987
#                             "tomorrow",
988
#                             "spread",
989
#                             "total",
990
#                             "moneyline",
991
#                             "date",
992
#                             "datetime1",
993
#                         ]
994
#                     ]
995
#                     data = data.rename(columns={data.columns[0]: "team"})
996
#                     data = data.query(
997
#                         "date == date.min()"
998
#                     )  # only grab games from upcoming day
999
#                     logging.info(
1000
#                         f"""Odds Transformation Successful {len(df)} day, \
1001
#                         retrieving {len(data)} rows"""
1002
#                     )
1003
#                     return data
1004
#             except BaseException as error:
1005
#                 logging.error(
1006
#                     f"Odds Transformation Failed for {len(df)} day objects, {error}"
1007
#                 )
1008
#                 sentry_sdk.capture_exception(error)
1009
#                 data = pd.DataFrame()
1010
#                 return data
1011
#     except (
1012
#         BaseException,
1013
#         ValueError,
1014
#     ) as error:  # ValueError was occasionally raised by read_html, so it is caught explicitly
1015
#         logging.error(f"Odds Function Web Scrape Failed, {error}")
1016
#         sentry_sdk.capture_exception(error)
1017
#         df = pd.DataFrame()
1018
#         return df
1019

1020

1021
@time_function
def get_reddit_data(
    feature_flags_df: pd.DataFrame, sub: str = "nba", limit: int = 27
) -> pd.DataFrame:
    """
    Web Scrape function w/ PRAW that grabs the top hot posts from a given subreddit.
    Left sub as an argument in case I want to scrape multi subreddits in the future
    (r/nba, r/nbadiscussion, r/sportsbook etc)

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

        sub (str): subreddit to query

        limit (int): number of hot posts to grab (default 27, matching the
            previously hard-coded behavior)

    Returns:
        Pandas DataFrame of current top posts on the requested subreddit;
        empty DataFrame when the feature flag is disabled or the scrape fails
    """
    feature_flag = "reddit_posts"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    reddit = praw.Reddit(
        client_id=os.environ.get("reddit_accesskey"),
        client_secret=os.environ.get("reddit_secretkey"),
        user_agent="praw-app",
        username=os.environ.get("reddit_user"),
        password=os.environ.get("reddit_pw"),
    )
    try:
        subreddit = reddit.subreddit(sub)
        rows = []
        for post in subreddit.hot(limit=limit):
            rows.append(
                [
                    post.title,
                    post.score,
                    post.id,
                    post.url,
                    # f-string already yields a str; redundant str() removed
                    f"https://www.reddit.com{post.permalink}",
                    post.num_comments,
                    post.selftext,
                    datetime.now().date(),
                    datetime.now(),
                ]
            )
        posts = pd.DataFrame(
            rows,
            columns=[
                "title",
                "score",
                "id",
                "url",
                "reddit_url",
                "num_comments",
                "body",
                "scrape_date",
                "scrape_time",
            ],
        )
        posts.columns = posts.columns.str.lower()

        logging.info(
            f"Reddit Scrape Successful, grabbing {limit} Recent "
            f"popular posts from r/{sub} subreddit"
        )
        return posts
    except BaseException as error:
        logging.error(f"Reddit Scrape Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        data = pd.DataFrame()
        return data
1097

1098

1099
@time_function
def get_reddit_comments(
    feature_flags_df: pd.DataFrame, urls: pd.Series
) -> pd.DataFrame:
    """
    Web Scrape function w/ PRAW that iteratively extracts comments from provided
    reddit post urls.

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

        urls (Series): The (reddit) urls to extract comments from

    Returns:
        Pandas DataFrame of all comments from the provided reddit urls;
        empty DataFrame when the feature flag is disabled or extraction fails
    """
    feature_flag = "reddit_comments"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    reddit = praw.Reddit(
        client_id=os.environ.get("reddit_accesskey"),
        client_secret=os.environ.get("reddit_secretkey"),
        user_agent="praw-app",
        username=os.environ.get("reddit_user"),
        password=os.environ.get("reddit_pw"),
    )
    author_list = []
    comment_list = []
    score_list = []
    flair_list1 = []
    flair_list2 = []
    edited_list = []
    url_list = []

    # pre-bind the loop variable so the error log below cannot hit an
    # unbound name if the loop itself fails to start
    i = None
    try:
        for i in urls:
            submission = reddit.submission(url=i)
            submission.comments.replace_more(limit=0)
            # this removes all the "more comment" stubs
            # to grab ALL comments use limit=None, but it will take 100x longer
            for comment in submission.comments.list():
                author_list.append(comment.author)
                comment_list.append(comment.body)
                score_list.append(comment.score)
                flair_list1.append(comment.author_flair_css_class)
                flair_list2.append(comment.author_flair_text)
                edited_list.append(comment.edited)
                url_list.append(i)

        df = pd.DataFrame(
            {
                "author": author_list,
                "comment": comment_list,
                "score": score_list,
                "url": url_list,
                "flair1": flair_list1,
                "flair2": flair_list2,
                "edited": edited_list,
                "scrape_date": datetime.now().date(),
                "scrape_ts": datetime.now(),
            }
        )

        df = df.query('author != "None"')  # remove deleted comments rip
        df["author"] = df["author"].astype(str)
        # keep only the highest-scoring duplicate per (author, comment, url)
        df = df.sort_values("score").groupby(["author", "comment", "url"]).tail(1)
        df = add_sentiment_analysis(df, "comment")

        # BUG FIX: the old `np.where(df["edited"] is False, 0, 1)` compared the
        # Series *object* identity to False, which is always falsy, so every
        # row was flagged as edited (1).  PRAW's `edited` attribute is False
        # for unedited comments and an edit timestamp (truthy) otherwise, so a
        # truthiness cast yields the intended 0/1 flag.
        df["edited"] = df["edited"].astype(bool).astype(int)
        df["md5_pk"] = df.apply(
            lambda x: hashlib.md5(
                (str(x["author"]) + str(x["comment"]) + str(x["url"])).encode("utf8")
            ).hexdigest(),
            axis=1,
        )
        # this hash function lines up with the md5 function in postgres
        # this is needed for the upsert to work on it.
        logging.info(
            f"Reddit Comment Extraction Success, retrieving {len(df)} "
            f"total comments from {len(urls)} total urls"
        )
        return df
    except BaseException as e:
        logging.error(f"Reddit Comment Extraction Failed for url {i}, {e}")
        sentry_sdk.capture_exception(e)
        df = pd.DataFrame()
        return df
1196

1197

1198
# def scrape_tweets_tweepy(
1199
#     search_parameter: str, count: int, result_type: str
1200
# ) -> pd.DataFrame:
1201
#     """
1202
#     Web Scrape function w/ Tweepy to scrape Tweets made within last ~ 7 days
1203

1204
#     Args:
1205
#         search_parameter (str): The string you're interested in finding Tweets for
1206

1207
#         count (int): Number of tweets to grab
1208

1209
#         result_type (str): Either mixed, recent, or popular.
1210

1211
#     Returns:
1212
#         Pandas DataFrame of recent Tweets
1213
#     """
1214
#     auth = tweepy.OAuthHandler(
1215
#         os.environ.get("twitter_consumer_api_key"),
1216
#         os.environ.get("twitter_consumer_api_secret"),
1217
#     )
1218

1219
#     api = tweepy.API(auth, wait_on_rate_limit=True)
1220

1221
#     full_tweet_df = pd.DataFrame()
1222
#     try:
1223
#         for tweet in tweepy.Cursor(  # result_type can be mixed, recent, or popular.
1224
#             api.search_tweets, search_parameter, count=count, result_type=result_type
1225
#         ).items(count):
1226
#             df = {
1227
#                 "api_created_at": tweet._json["created_at"],
1228
#                 "tweet_id": tweet._json["id_str"],
1229
#                 "username": tweet._json["user"]["screen_name"],
1230
#                 "user_id": tweet._json["user"]["id"],
1231
#                 "tweet": tweet._json["text"],
1232
#                 "likes": tweet._json["favorite_count"],
1233
#                 "retweets": tweet._json["retweet_count"],
1234
#                 "language": tweet._json["lang"],
1235
#                 "scrape_ts": datetime.now(),
1236
#                 "profile_img": tweet._json["user"]["profile_image_url"],
1237
#                 "url": f"https://twitter.com/twitter/statuses/{tweet._json['id']}",
1238
#             }
1239
#             full_tweet_df = pd.concat([df, full_tweet_df])
1240

1241
#         df = add_sentiment_analysis(df, "tweet")
1242
#         logging.info(f"Twitter Scrape Successful, retrieving {len(df)} Tweets")
1243
#         return df
1244
#     except BaseException as e:
1245
#         logging.error(f"Error Occurred for Scrape Tweets Tweepy, {e}")
1246
#         sentry_sdk.capture_exception(e)
1247
#         df = pd.DataFrame()
1248
#         return df
1249

1250

1251
# @time_function
1252
# def scrape_tweets_combo(feature_flags_df: pd.DataFrame) -> pd.DataFrame:
1253
#     """
1254
#     Web Scrape function to scrape Tweepy Tweets for both popular & mixed tweets
1255

1256
#     Args:
1257
#         feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
1258
#             to run this function or not
1259

1260
#     Returns:
1261
#         Pandas DataFrame of both popular and mixed tweets.
1262
#     """
1263
#     feature_flag = "twitter"
1264
#     feature_flag_check = check_feature_flag(
1265
#         flag=feature_flag, flags_df=feature_flags_df
1266
#     )
1267

1268
#     if feature_flag_check is False:
1269
#         logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
1270
#         df = pd.DataFrame()
1271
#         return df
1272

1273
#     try:
1274
#         df1 = scrape_tweets_tweepy("nba", 1000, "popular")
1275
#         df2 = scrape_tweets_tweepy("nba", 5000, "mixed")
1276

1277
#         # so the scrape_ts column screws up with filtering duplicates out so
1278
#         # this code ignores that column to correctly drop the duplicates
1279
#         df_combo = pd.concat([df1, df2])
1280
#         df_combo = df_combo.drop_duplicates(
1281
#             subset=df_combo.columns.difference(
1282
#                 ["scrape_ts", "likes", "retweets", "tweet"]
1283
#             )
1284
#         )
1285

1286
#         logging.info(
1287
#             f"Grabbing {len(df1)} Popular Tweets and {len(df2)} Mixed Tweets "
1288
#             f"for {len(df_combo)} Total, {(len(df1) + len(df2) - len(df_combo))} "
1289
#             "were duplicates"
1290
#         )
1291
#         return df_combo
1292
#     except BaseException as e:
1293
#         logging.error(f"Error Occurred for Scrape Tweets Combo, {e}")
1294
#         sentry_sdk.capture_exception(e)
1295
#         df = pd.DataFrame()
1296
#         return df
1297

1298

1299
@time_function
def get_pbp_data(feature_flags_df: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Web Scrape function w/ pandas read_html that uses aliases via boxscores function
    to scrape the pbp data iteratively for each game played the previous day.
    It assumes there is a location column in the df being passed in.

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

        df (DataFrame) - The Boxscores DataFrame

    Returns:
        All PBP Data for the games in the input df

    """
    feature_flag = "pbp"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    # grab the game date up front before `df` gets reassigned in the scrape
    # loop below; assumes all boxscore rows share one date
    if len(df) > 0:
        game_date = df["date"][0]
    else:
        df = pd.DataFrame()
        logging.warning(
            "PBP Transformation Function Failed, "
            f"no data available for {datetime.now().date()}"
        )
        return df
    try:
        if len(df) > 0:
            # home teams are the key for the pbp URLs; basketball-reference
            # uses PHO/CHO/BRK abbreviations, so translate from PHX/CHA/BKN
            yesterday_hometeams = (
                df.query('location == "H"')[["team"]].drop_duplicates().dropna()
            )
            yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
                "PHX", "PHO"
            )
            yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
                "CHA", "CHO"
            )
            yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
                "BKN", "BRK"
            )

            # away rows give the (AwayTeam, HomeTeam) pairing merged onto
            # each game's pbp rows later
            away_teams = (
                df.query('location == "A"')[["team", "opponent"]]
                .drop_duplicates()
                .dropna()
            )
            away_teams = away_teams.rename(
                columns={
                    away_teams.columns[0]: "AwayTeam",
                    away_teams.columns[1]: "HomeTeam",
                }
            )
        else:
            yesterday_hometeams = []

        if len(yesterday_hometeams) > 0:
            try:
                newdate = str(
                    df["date"].drop_duplicates()[0].date()
                )  # this assumes all games in the boxscores df are 1 date
                newdate = pd.to_datetime(newdate).strftime(
                    "%Y%m%d"
                )  # formatting into url format.
                pbp_list = pd.DataFrame()
                # one basketball-reference pbp page per home team per date
                for i in yesterday_hometeams["team"]:
                    url = f"https://www.basketball-reference.com/boxscores/pbp/{newdate}0{i}.html"
                    df = pd.read_html(url)[0]
                    # flatten the scraped MultiIndex header into single strings
                    df.columns = df.columns.map("".join)
                    # rename strictly by column position — the scraped header
                    # text is not stable enough to rename by name
                    df = df.rename(
                        columns={
                            df.columns[0]: "Time",
                            df.columns[1]: "descriptionPlayVisitor",
                            df.columns[2]: "AwayScore",
                            df.columns[3]: "Score",
                            df.columns[4]: "HomeScore",
                            df.columns[5]: "descriptionPlayHome",
                        }
                    )
                    # period boundaries appear as marker rows in the pbp feed;
                    # tag them so the period can be forward-filled below
                    conditions = [
                        (
                            df["HomeScore"].str.contains("Jump ball:", na=False)
                            & df["Time"].str.contains("12:00.0")
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 2nd quarter", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 3rd quarter", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 4th quarter", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 1st overtime", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 2nd overtime", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 3rd overtime", na=False
                            )
                        ),
                        (
                            df["HomeScore"].str.contains(
                                "Start of 4th overtime", na=False
                            )
                        ),  # if more than 4 ots then rip
                    ]
                    values = [
                        "1st Quarter",
                        "2nd Quarter",
                        "3rd Quarter",
                        "4th Quarter",
                        "1st OT",
                        "2nd OT",
                        "3rd OT",
                        "4th OT",
                    ]
                    df["Quarter"] = np.select(conditions, values, default=None)
                    # forward-fill so every play row carries its period label
                    df["Quarter"] = df["Quarter"].ffill()
                    # drop the repeated header / period-divider rows
                    df = df.query(
                        'Time != "Time" & '
                        'Time != "2nd Q" & '
                        'Time != "3rd Q" & '
                        'Time != "4th Q" & '
                        'Time != "1st OT" & '
                        'Time != "2nd OT" & '
                        'Time != "3rd OT" & '
                        'Time != "4th OT"'
                    ).copy()
                    # use COPY to get rid of the fucking goddamn warning
                    # translate back to the app's PHX/CHA/BKN abbreviations
                    df["HomeTeam"] = i
                    df["HomeTeam"] = df["HomeTeam"].str.replace("PHO", "PHX")
                    df["HomeTeam"] = df["HomeTeam"].str.replace("CHO", "CHA")
                    df["HomeTeam"] = df["HomeTeam"].str.replace("BRK", "BKN")
                    # inner-merge on HomeTeam attaches the matching AwayTeam
                    df = df.merge(away_teams)
                    # "Score" is formatted "away-home"; split into numerics
                    df[["scoreAway", "scoreHome"]] = df["Score"].str.split(
                        "-", expand=True, n=1
                    )
                    df["scoreAway"] = pd.to_numeric(df["scoreAway"], errors="coerce")
                    df["scoreAway"] = df["scoreAway"].ffill()
                    df["scoreAway"] = df["scoreAway"].fillna(0)
                    df["scoreHome"] = pd.to_numeric(df["scoreHome"], errors="coerce")
                    df["scoreHome"] = df["scoreHome"].ffill()

                    df["scoreHome"] = df["scoreHome"].fillna(0)
                    # positive margin = home team leading
                    df["marginScore"] = df["scoreHome"] - df["scoreAway"]
                    df["Date"] = game_date
                    df["scrape_date"] = datetime.now().date()
                    df = df.rename(
                        columns={
                            df.columns[0]: "timeQuarter",
                            df.columns[6]: "numberPeriod",
                        }
                    )
                    pbp_list = pd.concat([df, pbp_list])
                    # reset so a failure mid-loop never reuses stale game data
                    df = pd.DataFrame()
                pbp_list.columns = pbp_list.columns.str.lower()
                pbp_list = pbp_list.query(
                    "(awayscore.notnull()) | (homescore.notnull())", engine="python"
                )
                logging.info(
                    "PBP Data Transformation Function Successful, "
                    f"retrieving {len(pbp_list)} rows for {datetime.now().date()}"
                )
                # filtering only scoring plays here, keep other all other rows in future
                # for lineups stuff etc.
                return pbp_list
            except BaseException as error:
                logging.error(f"PBP Transformation Function Logic Failed, {error}")
                sentry_sdk.capture_exception(error)
                df = pd.DataFrame()
                return df
        else:
            df = pd.DataFrame()
            logging.warning(
                "PBP Transformation Function Failed, no data available "
                f"for {datetime.now().date()}"
            )
            return df
    except BaseException as error:
        logging.error(f"PBP Data Transformation Function Failed, {error}")
        sentry_sdk.capture_exception(error)
        data = pd.DataFrame()
        return data
1505

1506

1507
@time_function
def schedule_scraper(
    feature_flags_df: pd.DataFrame,
    year: str,
    month_list: list[str] | None = None,
) -> pd.DataFrame:
    """
    Web Scrape Function to scrape Schedule data by iterating through a list of months

    Args:
        feature_flags_df (pd.DataFrame): Feature Flags DataFrame to check whether
            to run this function or not

        year (str) - The year to scrape

        month_list (list) - List of full-month names to scrape.  Defaults to
            October through April when not provided.

    Returns:
        DataFrame of Schedule Data to be stored.

    """
    # build the default at call time; a mutable default argument (`=[...]`)
    # would be one shared list across every call
    if month_list is None:
        month_list = [
            "october",
            "november",
            "december",
            "january",
            "february",
            "march",
            "april",
        ]

    current_date = (
        datetime.now().date()
    )  # DO NOT REMOVE, used in df.query function later
    feature_flag = "schedule"
    feature_flag_check = check_feature_flag(
        flag=feature_flag, flags_df=feature_flags_df
    )

    if feature_flag_check is False:
        logging.info(f"Feature Flag {feature_flag} is disabled, skipping function")
        df = pd.DataFrame()
        return df

    schedule_df = pd.DataFrame()
    completed_months = []
    try:
        for month in month_list:
            url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html"
            html = requests.get(url).content
            soup = BeautifulSoup(html, "html.parser")

            headers = [th.getText() for th in soup.findAll("tr")[0].findAll("th")]
            headers[6] = "boxScoreLink"
            headers[7] = "isOT"
            headers = headers[1:]

            rows = soup.findAll("tr")[1:]
            date_info = [
                [th.getText() for th in rows[i].findAll("th")] for i in range(len(rows))
            ]

            game_info = [
                [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
            ]
            date_info = [i[0] for i in date_info]

            schedule = pd.DataFrame(game_info, columns=headers)
            schedule["Date"] = date_info

            logging.info(
                f"Schedule Function Completed for {month}, retrieving {len(schedule)} rows"
            )
            completed_months.append(month)
            schedule_df = pd.concat([schedule, schedule_df])

    except IndexError:
        # basketball-reference has no table for this month yet; keep whatever
        # months were already scraped successfully and fall through
        logging.info(
            f"{month} currently has no data in basketball-reference, "
            f"stopping the function and returning data for {' '.join(completed_months)}"
        )

    # NOTE: this post-processing previously lived in a `finally:` block whose
    # `return` silently swallowed ANY in-flight exception (not just the
    # IndexError handled above).  Running it after the try/except keeps the
    # IndexError fallback while letting unexpected errors propagate.
    if schedule_df.empty:
        return pd.DataFrame()

    schedule_df = schedule_df[
        ["Start (ET)", "Visitor/Neutral", "Home/Neutral", "Date"]
    ]
    schedule_df["proper_date"] = pd.to_datetime(
        schedule_df["Date"], format="%a, %b %d, %Y"
    ).dt.date
    schedule_df.columns = schedule_df.columns.str.lower()
    schedule_df = schedule_df.rename(
        columns={
            "start (et)": "start_time",
            "visitor/neutral": "away_team",
            "home/neutral": "home_team",
        }
    )
    # filtering the data to only rows beyond the current date because we already have
    # the historical records
    schedule_df = schedule_df.query("proper_date >= @current_date")
    return schedule_df
1608

1609

1610
def write_to_s3(
1✔
1611
    file_name: str,
1612
    df: pd.DataFrame,
1613
    date: datetime.date = datetime.now().date(),
1614
    bucket: str = os.environ.get("S3_BUCKET"),
1615
) -> None:
1616
    """
1617
    S3 Function using awswrangler to write file.  Only supports parquet right now.
1618

1619
    Args:
1620
        file_name (str): The base name of the file (boxscores, opp_stats)
1621

1622
        df (pd.DataFrame): The Pandas DataFrame to write to S3
1623

1624
        bucket (str): The Bucket to write to.  Defaults to `os.environ.get('S3_BUCKET')`
1625

1626
        date (datetime.date): Date to partition the data by.  Defaults to `datetime.now().date()`
1627

1628
    Returns:
1629
        Writes the Pandas DataFrame to an S3 File.
1630

1631
    """
1632
    year_partition = date.year
1✔
1633
    month_partition = get_leading_zeroes(value=date.month)
1✔
1634
    file_name_jn = f"{file_name}-{date}"
1✔
1635
    try:
1✔
1636
        if len(df) == 0:
1✔
1637
            logging.info(f"Not storing {file_name} to s3 because it's empty.")
×
1638
            pass
×
1639
        else:
1640
            wr.s3.to_parquet(
1✔
1641
                df=df,
1642
                path=f"s3://{bucket}/{file_name}/validated/year={year_partition}/month={month_partition}/{file_name_jn}.parquet",
1643
                index=False,
1644
            )
1645
            logging.info(
1✔
1646
                f"Storing {len(df)} {file_name} rows to S3 (s3://{bucket}/{file_name}/validated/{year_partition}/{file_name_jn}.parquet)"
1647
            )
1648
            pass
1✔
1649
    except BaseException as error:
×
1650
        logging.error(f"S3 Storage Function Failed {file_name}, {error}")
×
1651
        sentry_sdk.capture_exception(error)
×
1652
        pass
×
1653

1654

1655
def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:
    """
    Simple Wrapper Function to write a Pandas DataFrame to SQL

    Empty DataFrames are skipped entirely, and any write failure is logged and
    reported to Sentry rather than raised.

    Args:
        con (SQL Connection): The connection to the SQL DB.

        table_name (str): The Table name to write to SQL as.

        df (DataFrame): The Pandas DataFrame to store in SQL

        table_type (str): Whether the table should replace or append to an
            existing SQL Table under that name

    Returns:
        Writes the Pandas DataFrame to a Table in the Schema we connected to.

    """
    try:
        # nothing to do for an empty frame
        if len(df) == 0:
            logging.info(f"{table_name} is empty, not writing to SQL")
            return None

        df.to_sql(
            name=table_name,
            con=con,
            if_exists=table_type,
            index=False,
        )
        logging.info(
            f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
        )
        return None
    except BaseException as error:
        # best-effort write: log + report, never raise to the caller
        logging.error(f"SQL Write Script Failed, {error}")
        sentry_sdk.capture_exception(error)
1691

1692

1693
# deprecated as of 2023-10-17 rip
1694
# def send_aws_email(logs: pd.DataFrame) -> None:
1695
#     """
1696
#     Email function utilizing boto3, has to be set up with SES in AWS
1697
#     and env variables passed in via Terraform.
1698

1699
#     The actual email code is copied from aws/boto3 and the subject &
1700
#     message should go in the subject / body_html variables.
1701

1702
#     Args:
1703
#         logs (DataFrame): The log file name generated by the script.
1704

1705
#     Returns:
1706
#         Sends an email out upon every script execution, including errors (if any)
1707
#     """
1708
#     sender = os.environ.get("USER_EMAIL")
1709
#     recipient = os.environ.get("USER_EMAIL")
1710
#     aws_region = "us-east-1"
1711
#     subject = f"""
1712
#     NBA ELT PIPELINE - {str(len(logs))} Alert Fails for {str(datetime.now().date())}
1713
#     """
1714
#     body_html = f"""\
1715
# <h3>Errors:</h3>
1716
#                    {logs.to_html()}"""
1717

1718
#     charset = "UTF-8"
1719
#     client = boto3.client("ses", region_name=aws_region)
1720
#     try:
1721
#         response = client.send_email(
1722
#             Destination={
1723
#                 "ToAddresses": [
1724
#                     recipient,
1725
#                 ],
1726
#             },
1727
#             Message={
1728
#                 "Body": {
1729
#                     "Html": {
1730
#                         "Charset": charset,
1731
#                         "Data": body_html,
1732
#                     },
1733
#                     "Text": {
1734
#                         "Charset": charset,
1735
#                         "Data": body_html,
1736
#                     },
1737
#                 },
1738
#                 "Subject": {
1739
#                     "Charset": charset,
1740
#                     "Data": subject,
1741
#                 },
1742
#             },
1743
#             Source=sender,
1744
#         )
1745
#     except ClientError as e:
1746
#         logging.error(e.response["Error"]["Message"])
1747
#         raise e
1748
#     else:
1749
#         logging.info(f"Email sent! Message ID: {response['MessageId']}")
1750
#         return None
1751

1752

1753
# DEPRECATING this as of 2022-04-25 - i send emails everyday now regardless
1754
# of pass or fail
1755
# def execute_email_function(logs: pd.DataFrame) -> None:
1756
#     """
1757
#     Email function that executes the email function upon script finishing.
1758
#     This is really not necessary; originally thought i wouldn't email
1759
#     if no errors would found but now i send it everyday regardless.
1760

1761
#     Args:
1762
#         logs (DataFrame): The log file name generated by the script.
1763

1764
#     Returns:
1765
#         Holds the actual send_email logic and executes if invoked as a
1766
#             script (aka on ECS)
1767
#     """
1768
#     try:
1769
#         if len(logs) > 0:
1770
#             logging.info("Sending Email")
1771
#             send_aws_email(logs)
1772
#         elif len(logs) == 0:
1773
#             logging.info("No Errors!")
1774
#             send_aws_email(logs)
1775
#     except BaseException as error:
1776
#         logging.error(f"Failed Email Alert, {error}")
1777
#         sentry_sdk.capture_exception(error)
1778

1779

1780
def get_feature_flags(connection: Connection | Engine) -> pd.DataFrame:
    """
    Pull every row from the `marts.feature_flags` table.

    Args:
        connection (Connection | Engine): Open SQLAlchemy connection or engine
            to read from.

    Returns:
        pd.DataFrame of all feature flags, for use with `check_feature_flag`.
    """
    feature_flags = pd.read_sql_query(
        sql="select * from marts.feature_flags;", con=connection
    )
    logging.info(f"Retrieving {len(feature_flags)} Feature Flags")

    return feature_flags
1785

1786

1787
def check_feature_flag(flag: str, flags_df: pd.DataFrame) -> bool:
    """
    Check whether a named feature flag is enabled.

    Args:
        flag (str): Name of the flag to look up in the `flag` column.

        flags_df (pd.DataFrame): Feature Flags DataFrame with `flag` and
            `is_enabled` columns (as returned by `get_feature_flags`).

    Returns:
        bool: True when the flag exists and `is_enabled == 1`, False otherwise.
    """
    # `@flag` references the local variable instead of f-string interpolation,
    # which previously produced a malformed query whenever the flag name
    # contained a quote character
    matches = flags_df.query("flag == @flag")

    # wrap in bool(): callers compare the result with `is False`, so a
    # numpy bool from the comparison would silently break them
    return bool(len(matches) > 0 and matches["is_enabled"].iloc[0] == 1)
1794

1795

1796
def query_logs(log_file: str = "logs/example.log") -> list:
    """
    Small Function to read Logs CSV File and grab Errors

    Args:
        log_file (str): Optional String of the Log File Name

    Returns:
        list of Error Messages to be passed into Slack Function
    """
    # NOTE(review): sep=r"\\t" is a regex matching a literal backslash + 't',
    # not a tab character — presumably intentional for this log format, but
    # verify against the logging config that writes these files
    raw_logs = pd.read_csv(log_file, sep=r"\\t", engine="python", header=None)
    failed_logs = (
        raw_logs.rename(columns={0: "errors"})
        .query("errors.str.contains('Failed')", engine="python")["errors"]
        .to_list()
    )

    logging.info(f"Returning {len(failed_logs)} Failed Logs")
    return failed_logs
1813

1814

1815
def write_to_slack(
1✔
1816
    errors: list, webhook_url: str = os.environ.get("WEBHOOK_URL", default="default")
1817
) -> int | None:
1818
    """ "
1819
    Function to write Errors out to Slack.  Requires a pre-configured `webhook_url`
1820
    to be setup.
1821

1822
    Args:
1823
        errors (list): The list of Failed Tasks + their associated errors
1824

1825
        webhook_url (str): Optional Parameter to specify the Webhook to send the
1826
            errors to.  Defaults to `os.environ.get("WEBHOOK_URL")`
1827

1828
    Returns:
1829
        None, but writes the Errors to Slack if there are any
1830
    """
1831
    try:
1✔
1832
        date = datetime.now().date()
1✔
1833
        num_errors = len(errors)
1✔
1834
        str_dump = "\n".join(errors)
1✔
1835

1836
        if num_errors > 0:
1✔
1837
            response = requests.post(
1✔
1838
                webhook_url,
1839
                data=json.dumps(
1840
                    {
1841
                        "text": (
1842
                            f"\U0001F6D1 {num_errors} Errors during NBA ELT "
1843
                            f"Ingestion on {date}: \n {str_dump}"
1844
                        )
1845
                    }
1846
                ),
1847
                headers={"Content-Type": "application/json"},
1848
            )
1849
            logging.info(
1✔
1850
                f"Wrote Errors to Slack, Reponse Code {response.status_code}. "
1851
                "Exiting ..."
1852
            )
1853
            return response.status_code
1✔
1854
        else:
1855
            logging.info("No Error Logs, not writing to Slack.  Exiting out ...")
1✔
1856
            return None
1✔
1857
    except BaseException as e:
×
1858
        raise e
×
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc