17895240058

Committed 21 Sep 2025 03:02PM UTC coverage: 91.564% (-2.6%) from 94.155%

Build # 17895240058

Build Type

Pull #53

github

Committed by

web-flow

Commit Message

Merge 6bce7887b into ef14347f4

Pull Request #53: Extend Transactions example

Run Details

35 of 83 new or added lines in 5 files covered. (42.17%)

2 existing lines in 2 files now uncovered.

1487 of 1624 relevant lines covered (91.56%)

0.92 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

30.36

/examples/sample_data_generator.py

# This Python script generates 2 sample datasets: customers and transactions to demonstrate the examples.
import pandas as pd
import numpy as np
from datetime import datetime
import sqlite3
from pathlib import Path
import tempfile
import shutil

# Modify these constants to control data volume and location
# Default number of customer rows (default for generate_customers' num_rows and
# for generate_transactions' n_customers).
N_CUSTOMERS = 100
# Default number of transaction rows (default for generate_transactions' num_rows).
N_TRANSACTIONS = 100_000
# NOTE(review): the two paths below are not referenced by any function visible
# in this file; presumably they are consumed by the example scripts - confirm.
CUSTOMERS_PATH = '/tmp/sources/customers.csv'
TRANSACTIONS_PATH = '/tmp/sources/transactions_db.sqlite'
# Default target directory for save_csv_to_dbfs / save_sqlite_to_dbfs.
DBFS_PATH = '/dbfs/tmp/sources'  # For Databricks

def random_dates(start: datetime, end: datetime, n: int) -> list[datetime]:
    """Return ``n`` datetimes drawn uniformly at random between ``start`` and ``end``.

    Both bounds are converted to POSIX timestamps, one uniform sample is drawn
    per requested date with ``np.random.uniform``, and each sample is converted
    back with ``datetime.fromtimestamp``. An ``n`` of 0 yields an empty list.
    """
    lower, upper = start.timestamp(), end.timestamp()
    drawn = []
    for _ in range(n):
        # One draw per item, so the global NumPy random state advances once per date.
        sample = np.random.uniform(lower, upper)
        drawn.append(datetime.fromtimestamp(sample))
    return drawn

def generate_customers(num_rows: int = N_CUSTOMERS) -> pd.DataFrame:
    """Build a DataFrame of ``num_rows`` made-up customer records.

    Columns: ``customer_id`` (``c_1`` .. ``c_<num_rows>``), ``name``, ``email``
    (derived from the lower-cased name), ``age`` (drawn with
    ``np.random.randint(15, 80)``), ``city`` and ``registration_date`` (a date
    drawn between 2022-01-01 and 2022-06-01 via ``random_dates``).
    """
    given_pool = [
        "John", "Jane", "Michael", "Emily", "David", "Sarah", "Chris", "Jessica",
        "Daniel", "Laura", "James", "Olivia", "Matthew", "Emma", "Joshua", "Sophia"
    ]
    family_pool = [
        "Smith", "Johnson", "Brown", "Williams", "Jones", "Garcia", "Miller",
        "Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson"
    ]
    city_pool = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix",
                 "Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"]

    window_start = datetime(2022, 1, 1, 0, 0, 0)
    window_end = datetime(2022, 6, 1, 0, 0, 0)
    column_names = ["customer_id", "name", "email", "age", "city", "registration_date"]

    def _one_customer(number):
        # The order of the random draws (first name, last name, age, city, date)
        # is kept fixed so a seeded run produces the same rows.
        given = np.random.choice(given_pool)
        family = np.random.choice(family_pool)
        years = np.random.randint(15, 80)
        home = np.random.choice(city_pool)
        registered = random_dates(window_start, window_end, 1)[0].date()
        return (
            number,
            f"{given} {family}",
            f"{given.lower()}.{family.lower()}@mail.com",
            years,
            home,
            registered,
        )

    rows = [_one_customer(number) for number in range(1, num_rows + 1)]
    frame = pd.DataFrame(rows, columns=column_names)
    # Turn the numeric id into its prefixed string form.
    frame['customer_id'] = 'c_' + frame['customer_id'].astype(str)
    return frame

def generate_transactions(num_rows: int = N_TRANSACTIONS, n_customers: int = N_CUSTOMERS) -> pd.DataFrame:
    """Build a DataFrame of ``num_rows`` made-up transactions.

    Columns: ``transaction_id`` (``t_1`` .. ``t_<num_rows>``), ``customer_id``
    (``c_<k>`` with k in 1..``n_customers``), ``product_id`` (``p_1`` .. ``p_20``),
    ``quantity`` (1..100), ``price`` (uniform in [10.0, 100.0), rounded to 2
    decimals) and ``timestamp`` (drawn between 2022-01-01 and 2025-06-01 via
    ``random_dates``).
    """
    window_start = datetime(2022, 1, 1, 0, 0, 0)
    window_end = datetime(2025, 6, 1, 0, 0, 0)

    # Random columns are drawn in a fixed order (customer, product, quantity,
    # price, timestamp) so a seeded run produces the same data.
    customer_numbers = np.random.randint(1, n_customers + 1, size=num_rows)
    product_numbers = np.random.randint(1, 21, size=num_rows)
    quantities = np.random.randint(1, 101, size=num_rows)
    prices = np.round(np.random.uniform(10.0, 100.0, size=num_rows), 2)
    moments = random_dates(window_start, window_end, num_rows)

    frame = pd.DataFrame({
        "transaction_id": np.arange(1, num_rows + 1),
        "customer_id": customer_numbers,
        "product_id": product_numbers,
        "quantity": quantities,
        "price": prices,
        "timestamp": moments,
    })

    # Turn the numeric ids into their prefixed string form.
    for column, prefix in (("transaction_id", 't_'), ("customer_id", 'c_'), ("product_id", 'p_')):
        frame[column] = prefix + frame[column].astype(str)
    return frame

def save_csv_to_dbfs(df: pd.DataFrame, filename: str, dbfs_path: str = DBFS_PATH):
    """Write ``df`` as ``<dbfs_path>/<filename>.csv`` without the index column.

    The target directory (and any missing parents) is created first.
    """
    Path(dbfs_path).mkdir(parents=True, exist_ok=True)
    destination = f"{dbfs_path}/{filename}.csv"
    df.to_csv(destination, index=False)

def save_sqlite_to_dbfs(df: pd.DataFrame, filename: str, dbfs_path: str = DBFS_PATH):
    """Saves data in df (Pandas DataFrame) to DBFS path as SQLite database

    The frame is written to a table named ``transactions`` (replacing it if it
    exists) in ``<dbfs_path>/<filename>.sqlite``. The target directory (and any
    missing parents) is created first.
    """
    path = Path(dbfs_path)
    path.mkdir(parents=True, exist_ok=True)

    # sqlite connector cannot write directly to DBFS. Write to a temp file and copy.
    # delete=False: the handle is closed immediately so sqlite can reopen the
    # file by name; removal is done explicitly in the finally block below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".sqlite") as tmp_file:
        temp_path = Path(tmp_file.name)

    try:
        conn = sqlite3.connect(temp_path)
        try:
            # `with conn` only commits/rolls back the transaction; it does not
            # close the connection, hence the explicit close() below.
            with conn:
                df.to_sql('transactions', conn, if_exists='replace', index=False)
        finally:
            # Close before copying so the copied file is complete and no
            # connection is left open.
            conn.close()
        dest_path = f"{dbfs_path}/{filename}.sqlite"
        shutil.copy(temp_path, dest_path)
    finally:
        # Always remove the local temp file, even if writing or copying failed.
        temp_path.unlink(missing_ok=True)

1	# This Python script generates 2 sample datasets: customers and transactions to demonstrate the examples.
2	import pandas as pd	1✔
3	import numpy as np	1✔
4	from datetime import datetime	1✔
5	import sqlite3	1✔
6	from pathlib import Path	1✔
7	import tempfile	1✔
8	import shutil	1✔
9
10	# Modify these constants to control data volume and location
11	N_CUSTOMERS = 100	1✔
12	N_TRANSACTIONS = 100_000	1✔
13	CUSTOMERS_PATH = '/tmp/sources/customers.csv'	1✔
14	TRANSACTIONS_PATH = '/tmp/sources/transactions_db.sqlite'	1✔
15	DBFS_PATH = '/dbfs/tmp/sources' # For Databricks	1✔
16
17	def random_dates(start: datetime, end: datetime, n: int) -> list[datetime]:	1✔
18	"""Generates a list of random dates between start and end"""
NEW 19	start_u = start.timestamp()	×
NEW 20	end_u = end.timestamp()	×
NEW 21	return [datetime.fromtimestamp(np.random.uniform(start_u, end_u)) for _ in range(n)]	×
22
23	def generate_customers(num_rows: int = N_CUSTOMERS) -> pd.DataFrame:	1✔
24	"""Generates a DataFrame of dummy customer data"""
NEW 25	first_names = [	×
26	"John", "Jane", "Michael", "Emily", "David", "Sarah", "Chris", "Jessica",
27	"Daniel", "Laura", "James", "Olivia", "Matthew", "Emma", "Joshua", "Sophia"
28	]
NEW 29	last_names = [	×
30	"Smith", "Johnson", "Brown", "Williams", "Jones", "Garcia", "Miller",
31	"Davis", "Rodriguez", "Martinez", "Hernandez", "Lopez", "Gonzalez", "Wilson"
32	]
NEW 33	cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix",	×
34	"Philadelphia", "San Antonio", "San Diego", "Dallas", "San Jose"]
35
NEW 36	start_date = datetime(2022, 1, 1, 0, 0, 0)	×
NEW 37	end_date = datetime(2022, 6, 1, 0, 0, 0)	×
38
NEW 39	data = []	×
NEW 40	for customer_id in range(1, num_rows + 1):	×
NEW 41	first = np.random.choice(first_names)	×
NEW 42	last = np.random.choice(last_names)	×
NEW 43	email = f"{first.lower()}.{last.lower()}@mail.com"	×
NEW 44	age = np.random.randint(15, 80)	×
NEW 45	city = np.random.choice(cities)	×
NEW 46	reg_date = random_dates(start_date, end_date, 1)[0].date()	×
NEW 47	data.append((customer_id, f"{first} {last}", email, age, city, reg_date))	×
48
NEW 49	customers = pd.DataFrame(data, columns=["customer_id", "name", "email", "age", "city", "registration_date"])	×
NEW 50	customers['customer_id'] = 'c_' + customers['customer_id'].astype(str)	×
NEW 51	return customers	×
52
53	def generate_transactions(num_rows: int = N_TRANSACTIONS, n_customers: int = N_CUSTOMERS) -> pd.DataFrame:	1✔
54	"""Generates a DataFrame of dummy transaction data"""
NEW 55	start_date = datetime(2022, 1, 1, 0, 0, 0)	×
NEW 56	end_date = datetime(2025, 6, 1, 0, 0, 0)	×
57
NEW 58	data = {	×
59	"transaction_id": np.arange(1, num_rows + 1),
60	"customer_id": np.random.randint(1, n_customers + 1, size=num_rows),
61	"product_id": np.random.randint(1, 21, size=num_rows),
62	"quantity": np.random.randint(1, 101, size=num_rows),
63	"price": np.round(np.random.uniform(10.0, 100.0, size=num_rows), 2),
64	"timestamp": random_dates(start_date, end_date, num_rows)
65	}
66
NEW 67	transactions = pd.DataFrame(data)	×
NEW 68	transactions['transaction_id'] = 't_' + transactions['transaction_id'].astype(str)	×
NEW 69	transactions['customer_id'] = 'c_' + transactions['customer_id'].astype(str)	×
NEW 70	transactions['product_id'] = 'p_' + transactions['product_id'].astype(str)	×
NEW 71	return transactions	×
72
73	def save_csv_to_dbfs(df: pd.DataFrame, filename: str, dbfs_path: str = DBFS_PATH):	1✔
74	"""Saves data in df (Pandas DataFrame) to DBFS path as CSV"""
NEW 75	path = Path(dbfs_path)	×
NEW 76	path.mkdir(parents=True, exist_ok=True)	×
NEW 77	df.to_csv(f"{dbfs_path}/{filename}.csv", index=False)	×
78
79	def save_sqlite_to_dbfs(df: pd.DataFrame, filename: str, dbfs_path: str = DBFS_PATH):	1✔
80	"""Saves data in df (Pandas DataFrame) to DBFS path as SQLite database"""
NEW 81	path = Path(dbfs_path)	×
NEW 82	path.mkdir(parents=True, exist_ok=True)	×
83
NEW 84	with tempfile.NamedTemporaryFile(delete=False, suffix=".sqlite") as tmp_file:	×
NEW 85	temp_path = Path(tmp_file.name)	×
86
87	# sqlite connector cannot write directly to DBFS. Write to a temp file and copy.
NEW 88	with sqlite3.connect(temp_path) as conn:	×
NEW 89	df.to_sql('transactions', conn, if_exists='replace', index=False)	×
NEW 90	dest_path = f"{dbfs_path}/{filename}.sqlite"	×
NEW 91	shutil.copy(temp_path, dest_path)	×

jorvik-io / jorvik / 17895240058

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous