๐Ÿ’ก ํ•ต์‹ฌ : CI๋‚˜ PI๋ฅผ ๊ณ„์‚ฐํ•ด์„œ ์ธก์ •

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes # ๋‹น๋‡จ๋ณ‘ ์˜ˆ์ธก
import scipy.stats as st
#from pathlib import Path

np.random.seed(42)

data = load_diabetes(as_frame=True)
df = data.frame.copy()
df

๐Ÿ”ธ ๋ชจ์ง‘๋‹จ ํ‰๊ท ์— ๋Œ€ํ•œ t-๋ถ„ํฌ ๊ธฐ๋ฐ˜์˜ ์‹ ๋ขฐ ๊ตฌ๊ฐ„์„ ๊ณ„์‚ฐ โญโญโญ

def ci_for_mean_t(sample, alpha=0.05):
    n = len(sample)
    xbar = np.mean(sample)      # ํ‘œ๋ณธ ํ‰๊ท 
    s = np.std(sample, ddof=1)  # ํ‘œ๋ณธ ํ‘œ์ค€ํŽธ์ฐจ๋ฅผ ๊ณ„์‚ฐํ•˜๊ธฐ ์œ„ํ•ด์„œ ์ž์œ ๋„๋ฅผ n-1๋กœ ๋‘์–ด์•ผ ํ•˜๊ธฐ ๋•Œ๋ฌธ์— ddof = 1
    se = s / np.sqrt(n)         # ํ‘œ์ค€์˜ค์ฐจ
    tcrit = st.t.ppf(1 - alpha / 2, df=n-1)            # t๋ถ„ํฌ์˜ ๊ฐ’ ์ฐพ๊ธฐ. ์ „์ฒด ๋ฉด์  ์ค‘ ํŠน์ • ๋น„์œจ์„ ์ฐจ์ง€ํ•˜๋Š” ์ง€์ 
    low, high = xbar - tcrit * se, xbar + tcrit * se   # ์‹ ๋ขฐ๊ตฌ๊ฐ„์˜ ํ•˜ํ•œ๊ณผ ์ƒํ•œ ๊ณ„์‚ฐ
    return low, high, xbar, s, se, tcrit

# ์ž‘์€ ํ‘œ๋ณธ ๋ฝ‘์•„๋ณด๊ธฐ
n = 20
sample = df["target"].sample(n=n, replace=True, random_state=7).to_numpy()  # replace : ๋™์ผํ•œ ๊ฐ’ ๋ฝ‘๊ฒŒ ํ• ๊ฑฐ๋ƒ?
low, high, xbar, s, se, tcrit = ci_for_mean_t(sample)

true_mu = df["target"].mean()
ci_summary = pd.DataFrame({
    "n": [n],
    "sample_mean": [xbar],
    "sample_sd": [s],
    "SE": [se],
    "t* (0.975, df=n-1)": [tcrit],
    "CI_low": [low],
    "CI_high": [high],
    "contains_true_mu?": [bool(low <= true_mu <= high)],
    "true_mu (from full data)": [true_mu]    
})

ci_summary

image.png

plt.figure()
plt.axvline(true_mu, linestyle="--", label="True ฮผ (full data)", color="#f69256")
plt.plot([low, high], [0, 0], marker="|", color="#00b3ca")
plt.xlabel("target")
plt.yticks([])
plt.title("95% t-CI for ฮผ (Diabetes target)")
plt.legend()
plt.show()

image.png

โ— ์‹ ๋ขฐ๊ตฌ๊ฐ„์€ ๋นˆ๋„ ์ฃผ์˜์ ์œผ๋กœ ํ•ด์„ํ•ด์•ผ ํ•จ

โŒ : ์ด๋ฒˆ ๊ตฌ๊ฐ„์ด ฮผ๋ฅผ 95% ํ™•๋ฅ ๋กœ ํฌํ•จํ•œ๋‹ค

โญ• : ์ด ์ ˆ์ฐจ๋กœ ๋ฐ˜๋ณตํ•ด ๋งŒ๋“  ๊ตฌ๊ฐ„๋“ค ์ค‘ ์•ฝ 95%๊ฐ€ ฮผ๋ฅผ ํฌํ•จํ•œ๋‹ค

๐Ÿ”Ž ํ†ต๊ณ„์  ์‹ ๋ขฐ๊ตฌ๊ฐ„์˜ ์‹ค์ œ ์ปค๋ฒ„๋ฆฌ์ง€(coverage)๋ฅผ ์‹คํ—˜์ ์œผ๋กœ ๊ฒ€์ฆํ•˜๋Š” ์‹œ๋ฎฌ๋ ˆ์ด์…˜ ํ•จ์ˆ˜

def coverage_experiment(series, n=25, reps=400, alpha=0.05, seed=7):
    # n = ํ‘œ๋ณธ ํฌ๊ธฐ(๊ฐœ์ˆ˜)
    # reps = ์‹คํ—˜์„ ๋ช‡ ๋ฒˆ ๋ฐ˜๋ณตํ• ์ง€?
    # alpha = ์œ ์˜์ˆ˜์ค€ (0.5๋ฉด 95% ์‹ ๋ขฐ๊ตฌ๊ฐ„)
    rng = np.random.default_rng(seed)   # PCG64 ๋ผ๋Š” ์•Œ๊ณ ๋ฆฌ์ฆ˜์„ ์“ฐ๋Š” ๋‚œ์ˆ˜ ์ƒ์„ฑ๊ธฐ, ์ตœ๊ทผ ๋” ์•ˆ์ •์ ์ด๊ณ  ๋น ๋ฅด๋‹ค๊ณ  ์ธ์ •๋ฐ›๋Š” ์ตœ์‹  ์•Œ๊ณ ๋ฆฌ์ฆ˜
    mu_true = series.mean()
    intervals = []
    hits = 0
    for i in range(reps):
        idx = rng.integers(0, len(series), size=n)
        smp = series.to_numpy()[idx]
        low, high, *_ = ci_for_mean_t(smp, alpha=alpha)
        intervals.append((low, high))
        hits += int(low <= mu_true <= high)
    return mu_true, intervals, hits / reps

mu_true, intervals, cov = coverage_experiment(df["target"], n=25, reps=400, alpha=0.05, seed=11)

plt.figure()
k = 40
for i, (low_i, high_i) in enumerate(intervals[:k]):
    plt.plot([low_i, high_i], [i, i], marker="|")
plt.axvline(mu_true, linestyle="--")
plt.title(f"Coverage experiment (first {k} CIs), estimated coverageโ‰ˆ{cov:.3f}")
plt.xlabel("target")
plt.ylabel("sample index")
plt.show()

image.png

๐Ÿ”Ž ์‹ ๋ขฐ๊ตฌ๊ฐ„์˜ ํ‰๊ท  ๋„ˆ๋น„๋ฅผ ๊ณ„์‚ฐํ•˜์—ฌ, ํ‘œ๋ณธ ํฌ๊ธฐ์™€ ์‹ ๋ขฐ๊ตฌ๊ฐ„ ๋„ˆ๋น„ ์‚ฌ์ด์˜ ๊ด€๊ณ„ ๋ณด์—ฌ์คŒ - ์ด๋Š” ํ†ต๊ณ„ํ•™์˜ ์ค‘์š”ํ•œ ์›๋ฆฌ์ธ "ํ‘œ๋ณธ ํฌ๊ธฐ๊ฐ€ ์ปค์งˆ์ˆ˜๋ก ์‹ ๋ขฐ๊ตฌ๊ฐ„์˜ ๋„ˆ๋น„๊ฐ€ ์ข์•„์ง„๋‹ค"๋Š” ๊ฐœ๋…์„ ๊ฒ€์ฆ