import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Assume df_model is your working dataframe
num_cols = [c for c in df_model.columns if c.startswith('X_num')]
# Compute skewness
skews = df_model[num_cols].skew(numeric_only=True).sort_values(ascending=False)
print("Skewness per numeric feature:\n", skews, "\n")

# Create subplots
rows = int(np.ceil(len(num_cols) / 3))
fig, axes = plt.subplots(rows, 3, figsize=(16, 4 * rows))
axes = axes.flatten()

# Plot each numeric feature
for i, col in enumerate(num_cols):
    ax = axes[i]
    ax.hist(df_model[col], bins=30, color='steelblue', edgecolor='white',
            alpha=0.8, density=True)
    ax.set_title(f"{col}\nSkew: {skews[col]:.2f}")
    ax.set_xlabel("")
    ax.set_ylabel("Density")

# Hide empty subplots, if any
for j in range(len(num_cols), len(axes)):
    fig.delaxes(axes[j])

plt.suptitle("Distributions of Numeric Features (Raw)", fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
# Heuristic: apply log1p if |skew| > 0.75 and the feature is strictly positive
log_cols = [c for c in num_cols
            if abs(skews[c]) > 0.75 and (X_train[c] > 0).all()]
plain_cols = [c for c in num_cols if c not in log_cols]
# ---------- 4) Apply log1p to TRAIN numeric features (on a copy) ----------
X_train_log = X_train.copy()
for c in log_cols:
    X_train_log[c] = np.log1p(X_train_log[c])

# Apply the SAME transform to TEST
X_test_log = X_test.copy()
for c in log_cols:
    X_test_log[c] = np.log1p(X_test_log[c])

# ---------- 5) Standardize numeric features ----------
scaler = StandardScaler()
scaled_train = pd.DataFrame(
    scaler.fit_transform(X_train_log[num_cols]),
    columns=num_cols, index=X_train_log.index)
scaled_test = pd.DataFrame(
    scaler.transform(X_test_log[num_cols]),  # transform only: never refit on test
    columns=num_cols, index=X_test_log.index)

X_train_log[num_cols] = scaled_train
X_test_log[num_cols] = scaled_test
# Re-select numeric columns from the transformed training frame
num_cols = [c for c in X_train_log.columns if c.startswith('X_num')]

# Recompute skewness after the transform
skews = X_train_log[num_cols].skew(numeric_only=True).sort_values(ascending=False)
print("Skewness per numeric feature:\n", skews, "\n")

# Create subplots
rows = int(np.ceil(len(num_cols) / 3))
fig, axes = plt.subplots(rows, 3, figsize=(16, 4 * rows))
axes = axes.flatten()

# Plot each numeric feature
for i, col in enumerate(num_cols):
    ax = axes[i]
    ax.hist(X_train_log[col], bins=30, color='steelblue', edgecolor='white',
            alpha=0.8, density=True)
    ax.set_title(f"{col}\nSkew: {skews[col]:.2f}")
    ax.set_xlabel("")
    ax.set_ylabel("Density")

# Hide empty subplots, if any
for j in range(len(num_cols), len(axes)):
    fig.delaxes(axes[j])

plt.suptitle("Distributions of Numeric Features (Log-Transformed & Scaled)",
             fontsize=14, y=1.02)
plt.tight_layout()
plt.show()
# ---------- 6) Reassemble final frames (column order optional) ----------
ordered_cols = num_cols + oh_cols + ord_cols
ordered_cols = [c for c in ordered_cols if c in X_train_log.columns]

X_train_scaled = X_train_log[ordered_cols].copy()
X_test_scaled = X_test_log[ordered_cols].copy()

# ---------- 7) Sanity checks ----------
print("Skew on train numeric features:")
print(skews.sort_values(ascending=False), "\n")

print("Log-transformed numeric columns:", log_cols)
print("Plain-scaled numeric columns:", plain_cols, "\n")

print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("First 5 cols:", X_train_scaled.columns[:5].tolist())
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

# -------- Prepare data --------
X_train_sm = sm.add_constant(X_train_scaled)  # adds intercept term
X_test_sm = sm.add_constant(X_test_scaled)

# Fit OLS model
ols_model = sm.OLS(y_train, X_train_sm).fit()

# Predictions
y_pred = ols_model.predict(X_test_sm)

# -------- Model summary --------
print(ols_model.summary())

mse_train = mean_squared_error(y_train, ols_model.predict(X_train_sm))
mse_test = mean_squared_error(y_test, y_pred)
print(f"Train MSE: {mse_train:.3f}") print(f"Test MSE:...
(A) Linearity: Residuals should show no systematic pattern versus fitted values.
residuals = y_train - ols_model.fittedvalues
fitted = ols_model.fittedvalues

plt.figure(figsize=(6, 4))
plt.scatter(fitted, residuals, alpha=0.7, color='steelblue', edgecolor='white')
plt.axhline(0, color='red', linestyle='--')
plt.xlabel("Fitted Values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted Values (Linearity Check)")
plt.show()
(B) Normality of residuals: Residuals should follow a normal distribution. p > 0.05 → residuals not significantly different from normal.
plt.figure(figsize=(6, 4))
plt.hist(residuals, bins=30, color='steelblue', edgecolor='white',
         density=True, alpha=0.8)
plt.xlabel("Residuals")
plt.ylabel("Density")
plt.title("Residuals Distribution (Normality Check)")
plt.show()
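The p-value rule above comes from a formal normality test, which the histogram alone doesn't give you. A minimal sketch using SciPy's Shapiro-Wilk test plus a Q-Q plot, assuming the `residuals` series computed above (scipy is an assumed extra dependency here):

from scipy import stats

stat, p_value = stats.shapiro(residuals)       # Shapiro-Wilk test of normality
print(f"Shapiro-Wilk p-value: {p_value:.4f}")  # p > 0.05 -> consistent with normality

sm.qqplot(residuals, line='s')  # points near the line suggest normal residuals
plt.title("Q-Q Plot of Residuals")
plt.show()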
(C) Homoscedasticity (constant variance): p > 0.05 → homoscedasticity holds; p ≤ 0.05 → evidence of heteroscedasticity.
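No code accompanies this check in the excerpt, so here is a minimal sketch using the Breusch-Pagan test from statsmodels, assuming the `ols_model` and `X_train_sm` objects fitted above:

from statsmodels.stats.diagnostic import het_breuschpagan

# Regresses squared residuals on the regressors; a small p-value signals
# that residual variance depends on the predictors (heteroscedasticity)
lm_stat, lm_pvalue, f_stat, f_pvalue = het_breuschpagan(ols_model.resid, X_train_sm)
print(f"Breusch-Pagan LM p-value: {lm_pvalue:.4f}")  # p > 0.05 -> constant variance not rejected

The residuals-vs-fitted plot from (A) doubles as a quick visual check: a funnel shape suggests heteroscedasticity.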
import matplotlib.pyplot as plt
import seaborn as sns
df['Y'].describe()
# Clean data to remove infinities and NaNs
df = df.replace([np.inf, -np.inf], np.nan).dropna(subset=['Y'])
sns.displot(df['Y'], kde=True)

# Select your features
cols = [
    'X_num1', 'X_num2', 'X_num3', 'X_num4',
    'X_num5', 'X_num6', 'X_num7', 'X_num8',
    'X_oh1', 'X_oh2', 'X_ord1'
]

# --- Plot pairwise scatterplots + histograms (diagonal) ---
pd.plotting.scatter_matrix(
    df[cols],
    figsize=(14, 10),
    diagonal='hist',   # or 'kde' for a density on the diagonal
    alpha=0.6,
    color='steelblue',
    edgecolor='white'
)

# Adjust layout
plt.suptitle("Pairwise Feature Relationships", y=1.02, fontsize=14)
plt.tight_layout()
plt.show()

# --- 1) Encode ordinal variable X_ord1 ---
# Only map if it's still strings (object); if already numeric, this is skipped
if df['X_ord1'].dtype == 'O':
    ord_map = {'Bearish': 0, 'Neutral': 1, 'Bullish': 2}
    df['X_ord1'] = df['X_ord1'].map(ord_map)
# --- 2) One-hot encode nominal variables X_oh1 and X_oh2 ---
oh_source_cols = ['X_oh1', 'X_oh2']
df_oh = pd.get_dummies(df, columns=oh_source_cols, drop_first=True)
# Cast only the new dummy columns to 0/1 ints; astype(int) on the whole
# frame would truncate the float-valued numeric features
dummy_cols = [c for c in df_oh.columns if c.startswith(('X_oh1_', 'X_oh2_'))]
df_oh[dummy_cols] = df_oh[dummy_cols].astype(int)
# --- 3) Order columns neatly (optional) ---
num_cols = [f'X_num{i}' for i in range(1, 9)]
# Get all new dummy columns automatically
oh_cols = [c for c in df_oh.columns if c.startswith('X_oh1_') or c.startswith('X_oh2_')]
ord_cols = ['X_ord1']
target = ['Stock_Price']

ordered_cols = num_cols + oh_cols + ord_cols + target
ordered_cols = [c for c in ordered_cols if c in df_oh.columns]
df_final = df_oh[ordered_cols].copy()
# df_final is your preprocessed DataFrame; select just the X features
X_cols = [c for c in df_final.columns if c.startswith(('X_num', 'X_oh', 'X_ord'))]
corr_matrix = df_final[X_cols].corr(method='pearson')
# Plot
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)

# Add colorbar
cbar = plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("Correlation", rotation=270, labelpad=15)

# Label axes
ax.set_xticks(np.arange(len(X_cols)))
ax.set_yticks(np.arange(len(X_cols)))
ax.set_xticklabels(X_cols, rotation=90)
ax.set_yticklabels(X_cols)
# Annotate correlation values
for i in range(len(X_cols)):
    for j in range(len(X_cols)):
        value = corr_matrix.iloc[i, j]
        # Choose text color based on background brightness for readability
        color = "white" if abs(value) > 0.5 else "black"
        ax.text(j, i, f"{value:.2f}", ha="center", va="center",
                color=color, fontsize=8)
plt.title("Feature Correlation Heatmap", fontsize=14) plt.tight_layout() plt.show()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ---------- 0) Working copy ----------
df_model = df_final.copy()  # your encoded dataframe

# ---------- 1) Identify columns ----------
target_col = 'Stock_Price' if 'Stock_Price' in df_model.columns else 'Y'
num_cols = [c for c in df_model.columns if c.startswith('X_num')]
oh_cols = [c for c in df_model.columns if c.startswith('X_oh')]
ord_cols = ['X_ord1'] if 'X_ord1' in df_model.columns else []

# Ensure dummies are numeric 0/1
df_model[oh_cols] = df_model[oh_cols].astype(int)

# ---------- 2) Train / test split ----------
X = df_model.drop(columns=[target_col])
y = df_model[target_col].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    random_state=42)  # seed assumed for reproducibility; trailing arguments truncated in the source