import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
def analyze_multicollinearity(df):
    """Summarise multicollinearity among the columns of ``df``.

    Two diagnostics are computed:

    * the variance inflation factor (VIF) of every column of the design
      matrix returned by ``add_constant(df)`` (any constant column it adds
      is therefore part of the report), and
    * the absolute pairwise correlation for every unordered pair of
      distinct columns of ``df``.

    Args:
        df: DataFrame whose columns are the candidate regressors.
            Presumably all-numeric and free of missing values; this
            function does not validate that -- confirm at the call site.

    Returns:
        A ``(vif_df, corr)`` tuple of DataFrames:

        * ``vif_df`` has columns ``'feature'`` and ``'VIF'``, sorted by
          ``'VIF'`` in descending order.
        * ``corr`` has columns ``'correlation'`` (absolute value) and
          ``'pair'`` (a tuple of the two column names in sorted order),
          one row per unordered pair, sorted by ``'correlation'`` in
          descending order.
    """
    X = add_constant(df)
    # Materialise the design matrix once rather than once per column.
    design = X.values
    vif_df = pd.DataFrame({
        'feature': X.columns,
        'VIF': [
            variance_inflation_factor(design, i)
            for i in range(design.shape[1])
        ],
    })

    stacked = df.corr().abs().stack()
    # Name both index levels explicitly before resetting: when df.columns
    # carries a name, the two levels would otherwise share it and
    # reset_index() would fail with "cannot insert ..., already exists".
    corr = stacked.rename_axis(['var1', 'var2']).reset_index(name='correlation')

    # Drop self-correlations; copy so the column assignment below operates
    # on an independent frame instead of a slice of the unfiltered one.
    corr = corr[corr['var1'] != corr['var2']].copy()

    # (a, b) and (b, a) describe the same pair: normalise the name order so
    # that drop_duplicates keeps exactly one row per unordered pair.
    corr['pair'] = [
        tuple(sorted(names)) for names in zip(corr['var1'], corr['var2'])
    ]
    corr = (
        corr.drop_duplicates('pair')
        .drop(columns=['var1', 'var2'])
        .sort_values(by='correlation', ascending=False)
    )
    return vif_df.sort_values(by='VIF', ascending=False), corr