import pandas as pd
import numpy as np
from scipy.stats import skew
# Sample DataFrame used to demonstrate the skewness check/correction workflow.
data = {
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'Numeric1': [100, 200, 300, 400, 500],
    'Numeric2': [10, 20, 30, 40, 50],
}
df = pd.DataFrame(data)

# Columns whose skewness exceeds this value are log-transformed. Only positive
# (right-tailed) skew is targeted, because log1p compresses the right tail.
SKEW_THRESHOLD = 0.75


def _column_skewness(frame):
    """Return the skewness of each column of *frame* as a Series.

    Missing values are dropped per column before measuring, so a single NaN
    does not turn the whole column's skewness into NaN (which would silently
    exclude that column from the threshold comparison below).
    """
    return frame.apply(lambda column: skew(column.dropna()))


# Select numerical columns for the skewness check and correction.
# np.number matches every integer/float width, not only int64/float64.
numerical_cols = df.select_dtypes(include=[np.number]).columns

# Check skewness of numerical columns
skewness = _column_skewness(df[numerical_cols])
print("Skewness of numerical columns:")
print(skewness)

# Correct skewness if necessary by applying a log transformation.
skewed_cols = skewness[skewness > SKEW_THRESHOLD].index
for col in skewed_cols:
    # log1p(x) = log(1 + x) handles zeros, but is undefined for x <= -1;
    # transforming such a column would inject NaN/-inf into the data.
    if (df[col] <= -1).any():
        print(f"Skipping log1p for '{col}': contains values <= -1")
        continue
    df[col] = np.log1p(df[col])

# Check skewness again after transformation
skewness_corrected = _column_skewness(df[numerical_cols])
print("\nSkewness after transformation:")
print(skewness_corrected)