Big Data on Your Laptop? (cont.)
Here's the code for the Big Data on Your Laptop experiment. Please comment if you have any suggestions to improve the accuracy of this measurement. Thanks!
Code:
import pandas as pd
import numpy as np
import time
from datetime import datetime, timedelta
import random

def rand_str(length=10):
    """Return a random string of upper- and lowercase ASCII letters."""
    letters = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    return ''.join(random.choice(letters) for _ in range(length))

def rand_datetime(start, end):
    """Return a random datetime between start and end, at second resolution."""
    return start + timedelta(seconds=random.randint(0, int((end - start).total_seconds())))

def gen_dates(start, end, spec_date, num, frac=0.001):
    """Return num datetimes; a fraction frac falls inside the hour starting
    at spec_date, the rest are uniform over [start, end]."""
    spec_dates = [spec_date + timedelta(minutes=random.randint(0, 59),
                                        seconds=random.randint(0, 59))
                  for _ in range(int(num * frac))]
    other_dates = [rand_datetime(start, end) for _ in range(num - len(spec_dates))]
    return spec_dates + other_dates
# Experiment parameters: 10M rows, with 3 float, 5 string, and 2 datetime columns.
num_rows, num_float, num_string, num_date = 10_000_000, 3, 5, 2
start_date, end_date, spec_date_hour = datetime(2020, 1, 1), datetime(2023, 12, 31), datetime(2023, 12, 31, 13)

total_start = time.time()

# Build DataFrame 1 and time its construction.
start = time.time()
data1 = {f'float{i}': np.random.rand(num_rows) for i in range(num_float)}
data1.update({f'str{i}': [rand_str() for _ in range(num_rows)] for i in range(num_string)})
data1.update({f'date{i}': gen_dates(start_date, end_date, spec_date_hour, num_rows) for i in range(num_date)})
df1 = pd.DataFrame(data1)
time1 = time.time() - start
print(f"DataFrame 1: {time1:.2f} s")
print(f"Total rows in DataFrame 1: {df1.shape[0]}")

# Build DataFrame 2 the same way.
start = time.time()
data2 = {f'float{i}': np.random.rand(num_rows) for i in range(num_float)}
data2.update({f'str{i}': [rand_str() for _ in range(num_rows)] for i in range(num_string)})
data2.update({f'date{i}': gen_dates(start_date, end_date, spec_date_hour, num_rows) for i in range(num_date)})
df2 = pd.DataFrame(data2)
time2 = time.time() - start
print(f"DataFrame 2: {time2:.2f} s")
print(f"Total rows in DataFrame 2: {df2.shape[0]}")

# Count the rows each DataFrame has inside the special hour.
start_hour = spec_date_hour.replace(minute=0, second=0, microsecond=0)
end_hour = spec_date_hour.replace(minute=59, second=59, microsecond=999999)
df1_filtered = df1[(df1['date0'] >= start_hour) & (df1['date0'] <= end_hour)]
df2_filtered = df2[(df2['date0'] >= start_hour) & (df2['date0'] <= end_hour)]
print(f"DF1 rows in hour {spec_date_hour}: {df1_filtered.shape[0]}")
print(f"DF2 rows in hour {spec_date_hour}: {df2_filtered.shape[0]}")

# Join on date0, then keep only the joined rows inside the special hour.
start = time.time()
joined_df = (df1.merge(df2, on='date0', suffixes=('_df1', '_df2'))
                .query("date0 >= @start_hour & date0 <= @end_hour"))
join_time = time.time() - start
print(f"Join time: {join_time:.2f} s")
print(f"Joined rows: {joined_df.shape[0]}")

# Latency = total wall-clock time minus the explicitly timed stages.
total_exec_time = time1 + time2 + join_time
total_resp_time = time.time() - total_start
latency = total_resp_time - total_exec_time
print(f"Total response time: {total_resp_time:.2f} s")
print(f"Total execution time: {total_exec_time:.2f} s")
print(f"Latency: {latency:.2f} s")