import numpy as np
import scipy.stats as stats
# Convert Admit to numeric: Yes=1, No=0
df['Admit_num'] = df['Admit'].apply(lambda x: 1 if x == 'Yes' else 0)
def frequentist_ci_prop(k, n, confidence=0.90):
p_hat = k / n
z = stats.norm.ppf(1 - (1 - confidence) / 2)
se = np.sqrt(p_hat * (1 - p_hat) / n)
return p_hat, (p_hat - z * se, p_hat + z * se)
def frequentist_ci_diff(k1, n1, k2, n2, confidence=0.90):
p1 = k1 / n1
p2 = k2 / n2
z = stats.norm.ppf(1 - (1 - confidence) / 2)
se = np.sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
diff = p1 - p2
return diff, (diff - z * se, diff + z * se)
def bayesian_beta(k, n, confidence=0.90, prior_a=1, prior_b=1):
alpha_post = prior_a + k
beta_post = prior_b + (n - k)
mean = alpha_post / (alpha_post + beta_post)
ci = stats.beta.ppf([(1 - confidence) / 2, 1 - (1 - confidence) / 2], alpha_post, beta_post)
return mean, ci
def bayesian_diff(k1, n1, k2, n2, confidence=0.90, n_samples=10000, prior_a=1, prior_b=1):
post1 = stats.beta.rvs(prior_a + k1, prior_b + n1 - k1, size=n_samples)
post2 = stats.beta.rvs(prior_a + k2, prior_b + n2 - k2, size=n_samples)
diffs = post1 - post2
mean_diff = np.mean(diffs)
ci = np.percentile(diffs, [(1 - confidence) / 2 * 100, (1 - (1 - confidence) / 2) * 100])
return mean_diff, ci
def bootstrap_stats(data_frame, n_boots=5000, confidence=0.90):
def get_stats(d):
overall = d['Admit_num'].mean()
male = d[d['M.F'] == 'Male']['Admit_num']
female = d[d['M.F'] == 'Female']['Admit_num']
p_m = male.mean()
p_f = female.mean()
diff = p_m - p_f
dept_diffs = {}
for dept in ['A', 'B', 'C', 'D', 'E', 'F']:
dm = d[(d['M.F'] == 'Male') & (d['Dept'] == dept)]['Admit_num']
dfem = d[(d['M.F'] == 'Female') & (d['Dept'] == dept)]['Admit_num']
dept_diffs[dept] = dm.mean() - dfem.mean()
return overall, p_m, p_f, diff, dept_diffs
results = []
for _ in range(n_boots):
boot_df = data_frame.sample(frac=1, replace=True)
results.append(get_stats(boot_df))
# Process results
overalls = [r[0] for r in results]
p_ms = [r[1] for r in results]
p_fs = [r[2] for r in results]
diffs = [r[3] for r in results]
dept_diffs = {dept: [r[4][dept] for r in results] for dept in ['A', 'B', 'C', 'D', 'E', 'F']}
alpha = (1 - confidence) / 2
summary = {
'overall': (np.mean(overalls), np.percentile(overalls, [alpha*100, (1-alpha)*100])),
'p_m': (np.mean(p_ms), np.percentile(p_ms, [alpha*100, (1-alpha)*100])),
'p_f': (np.mean(p_fs), np.percentile(p_fs, [alpha*100, (1-alpha)*100])),
'diff': (np.mean(diffs), np.percentile(diffs, [alpha*100, (1-alpha)*100])),
}
summary['dept_diffs'] = {dept: (np.mean(dept_diffs[dept]), np.percentile(dept_diffs[dept], [alpha*100, (1-alpha)*100])) for dept in ['A', 'B', 'C', 'D', 'E', 'F']}
return summary
# Data counts
n_total = len(df)
k_total = df['Admit_num'].sum()
df_m = df[df['M.F'] == 'Male']
n_m = len(df_m)
k_m = df_m['Admit_num'].sum()
df_f = df[df['M.F'] == 'Female']
n_f = len(df_f)
k_f = df_f['Admit_num'].sum()
# Frequentist Results
freq_overall = frequentist_ci_prop(k_total, n_total)
freq_m = frequentist_ci_prop(k_m, n_m)
freq_f = frequentist_ci_prop(k_f, n_f)
freq_diff = frequentist_ci_diff(k_m, n_m, k_f, n_f)
freq_dept_diffs = {}
for dept in ['A', 'B', 'C', 'D', 'E', 'F']:
dm = df[(df['M.F'] == 'Male') & (df['Dept'] == dept)]
dfem = df[(df['M.F'] == 'Female') & (df['Dept'] == dept)]
freq_dept_diffs[dept] = frequentist_ci_diff(len(dm[dm['Admit_num']==1]), len(dm),
len(dfem[dfem['Admit_num']==1]), len(dfem))
# Bayesian Results
bayes_overall = bayesian_beta(k_total, n_total)
bayes_m = bayesian_beta(k_m, n_m)
bayes_f = bayesian_beta(k_f, n_f)
bayes_diff_res = bayesian_diff(k_m, n_m, k_f, n_f)
bayes_dept_diffs = {}
for dept in ['A', 'B', 'C', 'D', 'E', 'F']:
dm = df[(df['M.F'] == 'Male') & (df['Dept'] == dept)]
dfem = df[(df['M.F'] == 'Female') & (df['Dept'] == dept)]
bayes_dept_diffs[dept] = bayesian_diff(len(dm[dm['Admit_num']==1]), len(dm),
len(dfem[dfem['Admit_num']==1]), len(dfem))
# Simulation Results
sim_res = bootstrap_stats(df)
# Print formatting
print("--- FREQUENTIST ---")