[250921] 33일차 - i_hate_statistics 02 - 실습

🔸 def summary_stats → 수치형 데이터의 대표값 추출하는 함수 ⭐⭐⭐

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# 시퀀스(리스트, 문자열 등) 안에서 각 원소가 몇 번 등장했는지 세주는 빈도 계산 도구 -> 최빈값 계산할 때 사용
from collections import Counter

# Seed NumPy's global random generator so the randomly generated data below is reproducible.
# 42 is a conventional seed value -> a nod to the sci-fi novel "The Hitchhiker's Guide to the Galaxy", where 42 is the answer to life, the universe, and everything.
np.random.seed(42)

# 수치형 데이터의 대표값 추출하는 함수
def summary_stats(series, is_countinuous=True, bins=50):
    """Return the representative statistics of a numeric pandas Series.

    Args:
        series: pandas Series of numeric values.
        is_countinuous: When True (the default) the data is treated as
            continuous and the mode is estimated from a histogram; when
            False the data is treated as discrete and the most frequent
            value is used. (The misspelled name is kept so existing
            keyword callers keep working.)
        bins: Number of histogram bins used for the continuous mode
            estimate. Ignored when ``is_countinuous`` is False.

    Returns:
        dict with float values under the keys "mean", "median", "mode"
        and "std" (sample standard deviation, ddof=1).
    """
    mean = float(series.mean())
    median = float(series.median())
    std = float(series.std(ddof=1))  # ddof=1 -> divide by n-1 (sample standard deviation)

    if is_countinuous:
        # Real-valued data rarely repeats exactly, so an exact mode is not
        # meaningful: use the midpoint of the most populated histogram bin.
        counts, edges = np.histogram(series, bins=bins)
        idx = int(np.argmax(counts))
        mode_est = float((edges[idx] + edges[idx + 1]) / 2.0)
    else:
        # Discrete data (e.g. scores, grades): take the most frequent value.
        # most_common is a Counter method, so it must be called on the
        # Counter itself rather than on the list of values.
        most_frequent, _ = Counter(series.tolist()).most_common(1)[0]
        mode_est = float(most_frequent)
    return {"mean": mean, "median": median, "mode": mode_est, "std": std}

# 그래프 그리고 그림을 저장하고 싶을 때 사용
def savefig(name):
    """Save the current matplotlib figure as "<name>.png", then display it.

    The layout is tightened first, the file is written at 180 dpi with a
    tight bounding box, the figure is shown, and the output path is
    printed.

    Args:
        name: Output file name without the ".png" extension.
    """
    target = f"{name}.png"
    plt.tight_layout()
    plt.savefig(
        target,
        dpi=180,
        bbox_inches="tight",
    )
    plt.show()
    print(f"Saved figure to {target}")

# 금융에서 사용하는 연환산(annualized) 수익률·변동성 계산 -> <https://m.blog.naver.com/suyou111/222301932504>
def annualized_stats(monthly_returns):
    """Annualize a sequence of monthly returns.

    Args:
        monthly_returns: Monthly returns expressed as fractions (the code
            compounds ``1 + r``). NumPy arrays and pandas Series are used
            as-is; any other sequence (list, tuple, ...) is converted to a
            float array first.

    Returns:
        Tuple ``(ann_return, ann_std)`` of Python floats, where
        ``ann_return`` is the geometric-mean monthly return compounded
        over 12 months and ``ann_std`` is the sample standard deviation
        (ddof=1) of the monthly returns scaled by sqrt(12).
    """
    returns = monthly_returns
    if not isinstance(returns, (np.ndarray, pd.Series)):
        # `1 + [..]` raises TypeError for plain sequences, so convert them;
        # arrays and Series keep their original code path.
        returns = np.asarray(returns, dtype=float)

    # Geometric mean of monthly growth factors, then compound for 12 months.
    monthly_geom = np.prod(1 + returns) ** (1 / len(returns)) - 1
    ann_return = (1 + monthly_geom) ** 12 - 1
    # Scale monthly volatility to a yearly horizon with sqrt(12).
    ann_std = np.std(returns, ddof=1) * np.sqrt(12)
    return float(ann_return), float(ann_std)

.std(ddof=1) : 표준편차 계산. ddof=1이면 분모를 n-1로 나누는 표본표준편차를 계산한다는 의미. pandas의 .std()는 기본값이 ddof=1(분모 n-1)이고, np.std()는 기본값이 ddof=0(분모 n)이므로 np.std()에서는 ddof=1을 직접 지정해야 함
np.argmax() : 최대값의 인덱스를 반환
plt.savefig(fname, dpi, transparent, bbox_inches) : 이미지 파일 저장
- fname : 이미지가 저장될 파일 이름
- dpi : 이미지의 해상도 설정
- transparent : 기본값 False (rcParams["savefig.transparent"]). True로 지정하면 저장된 그림의 배경이 투명해짐
- bbox_inches : 이미지 주위 공백 제어. 일반적으로 여백 최소화하는데 사용.
np.prod() : 배열 요소의 곱 계산
np.sqrt() : 숫자의 양의 제곱근 계산

🔎 heavy tail(긴꼬리) 분포를 가진 매출 보여주는 시각화

### Synthetic data generation: 97% regular carts plus 3% VIP orders, both log-normal
n = 5000
regular = np.random.lognormal(mean=np.log(30000), sigma=0.5, size=int(n * 0.97))    # typical carts around 30k KRW
vip = np.random.lognormal(mean=np.log(300000), sigma=0.6, size=int(n * 0.03))   # VIP big orders
sales = pd.Series(np.concatenate([regular, vip]))
### End of synthetic data generation

# Summary statistics of the cart amounts; the mode is estimated from a 60-bin histogram.
stats_sales = summary_stats(sales, is_countinuous=True, bins=60)
stats_sales["scenario"] = "Sales (KRW)"

# Histogram of cart amounts with the mean and median marked as vertical lines.
plt.figure()
plt.hist(sales, bins=60, color="#9bc4d5")
plt.xlabel("Cart amount (KRW)")
plt.ylabel("Count")
plt.title("Sales distribution (heavy tail)")
# Log-scaled x-axis so the long right tail stays readable.
# NOTE(review): the 60 bins are linearly spaced, so bars will have uneven widths on the log axis.
plt.xscale("log")
plt.axvline(stats_sales["mean"], linestyle="--", label="Mean", color="#3e4e5f")
plt.axvline(stats_sales["median"], linestyle=":", label="Median", color="#34558b")
plt.legend()
plt.show()
# Alternative to plt.show(): save the figure to "sales_hist.png" via the savefig helper.
# savefig("sales_hist")

🔎 두 개의 누적 수익률 비교

### Synthetic data generation: 120 monthly returns per fund, same mean, different volatility
months = 120
fundA = np.random.normal(loc=0.006, scale=0.04, size=months)    # high vol
fundB = np.random.normal(loc=0.006, scale=0.01, size=months)    # low vol
### End of synthetic data generation

# Annualized return and annualized standard deviation for each fund.
ann_ret_A, ann_std_A = annualized_stats(fundA)
ann_ret_B, ann_std_B = annualized_stats(fundB)

# "mean" holds the annualized return; "median" is the monthly median multiplied by 12.
# NOTE(review): "mode" uses Series.mode() on continuous draws; if every value is distinct,
# mode() presumably returns all values sorted, so iloc[0] would be the smallest monthly
# return. It is also left unscaled although the scenario says "annualized" — confirm intent.
stats_fundA = {"mean" : ann_ret_A, "median" : float(np.median(fundA)) * 12, 
               "mode" : float(pd.Series(fundA).mode().iloc[0]), "std" : ann_std_A,
               "scenario" : "Fund A (annualized)"}
stats_fundB = {"mean" : ann_ret_B, "median" : float(np.median(fundB)) * 12, 
               "mode" : float(pd.Series(fundB).mode().iloc[0]), "std" : ann_std_B,
               "scenario" : "Fund B (annualized)"}

# Plot the cumulative return of both funds: running product of (1 + r) minus 1.
plt.figure()
cum_A = np.cumprod(1 + fundA) - 1
cum_B = np.cumprod(1 + fundB) - 1
plt.plot(cum_A, label="Fund A (high σ)", color="#ea435d")
plt.plot(cum_B, label="Fund B (low σ)", color="#6768ab")
plt.xlabel("Months")
plt.ylabel("Cumulative return")
plt.title("Cumulative returns: same mean, different risk")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.legend()
plt.show()
# Alternative to plt.show(): save the figure to "funds_cumulative.png" via the savefig helper.
# savefig("funds_cumulative")

fundA → 표준편차 0.04

fundB → 표준편차 0.01

⇒ 즉 펀드A가 펀드B보다 더 변동성이 큰, 위험한 투자 상품임