Suppose there is a DataFrame with 10000 samples and 4 features, where the features are not guaranteed to be independent:
# Reproducible toy data: 10000 rows drawn by np.random.randn across 4 feature columns.
np.random.seed(42)
data = np.random.randn(10000, 4)
df = pd.DataFrame(data, columns=[f'Feature_{n}' for n in range(1, 5)])

# Choose 3000 distinct rows, then blank out one randomly picked feature in each of them.
missing_rows = np.random.choice(10000, 3000, replace=False)
for row in missing_rows:
    df.iloc[row, np.random.choice(4)] = np.nan
I want to find all the outliers in this DataFrame, and I also want to mark both the outliers and the NAs with a different color or font in the original DataFrame. Below is my attempt. I am trying to use Isolation Forest, but I don't know how to do it. Any hint or help would be great, thanks.
def IQR(series):
    """Return a boolean mask of the abnormal entries of ``series``.

    An entry is abnormal when it lies more than 1.5 interquartile ranges
    below the first quartile or above the third quartile, or when it is
    null.
    """
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    spread = q3 - q1
    too_low = series < q1 - 1.5 * spread
    too_high = series > q3 + 1.5 * spread
    return too_low | too_high | series.isnull()
# Add one boolean "abnormal <feature>" column per existing column of df.
# `data` is the raw NumPy array and has no `.columns`, so the loop must run on
# the DataFrame `df`. The column labels are snapshotted first so the flag
# columns added inside the loop are never visited themselves.
for col in list(df.columns):
    df[f'abnormal {col}'] = IQR(df[col])
The example below colours in the NaNs and computes an outlier statistic per column.
`IsolationForest` doesn't seem to support NaNs, so you must impute them or drop them. Rather than using `IsolationForest`, I've used an IQR function per column.
# Fixed seed so the example data (and therefore the styled output) is repeatable.
np.random.seed(0)
data = np.random.randn(10000, 4)
df = pd.DataFrame(data, columns=[f'Feature_{n}' for n in range(1, 5)])

# Knock out a single, randomly chosen feature in each of 3000 distinct rows.
missing_rows = np.random.choice(10000, 3000, replace=False)
for row in missing_rows:
    df.iloc[row, np.random.choice(4)] = np.nan
def is_outlier(df, k=1.5):
    """Flag the values that fall outside the per-column IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are screened independently of each other.
    k : float, default 1.5
        Fence multiplier: a value is flagged when it is below
        ``Q1 - k * IQR`` or above ``Q3 + k * IQR`` of its column.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as ``df``; True marks
        an outlier. NaN cells come out False, because the ``<`` / ``>``
        comparisons below are False for NaN.
    """
    q1 = df.quantile(0.25, axis=0)
    q3 = df.quantile(0.75, axis=0)
    iqr = q3 - q1
    # The fences are anchored at the quartiles themselves, not at the IQR.
    lower_thresh = q1 - k * iqr
    upper_thresh = q3 + k * iqr
    # Comparing a frame with a Series aligns the Series index on the columns.
    return (df < lower_thresh) | (df > upper_thresh)
# Boolean outlier mask for df, computed once up front; style_outlier looks
# rows up in it by index label.
df_outliers = is_outlier(df)
def style_outlier(v, props='color:white;background-color:royalblue'):
    """Return the CSS for one row: ``props`` for flagged cells, '' otherwise.

    Meant to be passed to ``Styler.apply(..., axis=1)``, which hands over one
    row at a time; ``v.name`` is that row's index label and selects the
    matching row of the module-level ``df_outliers`` mask.
    """
    row_flags = df_outliers.loc[v.name]
    return np.where(row_flags, props, '')
def style_nan(v, props='color:black;background-color:firebrick'):
    """Return ``props`` when the single cell value ``v`` is missing, else ''."""
    if pd.isna(v):
        return props
    return ''
# Styled preview, left as a bare expression so a notebook renders it directly.
(
    df.iloc[::1000]  # preview only: keep every 1000th row
    .style
    .apply(style_outlier, axis=1)  # row by row: outlier cells get the blue style
    .applymap(style_nan)  # cell by cell: NaNs get the red style
    # Styler.highlight_null() is the pandas built-in alternative for NaN highlighting.
)