My BART model is predicting NaN for some cases. Does anyone know why this happens? or how I can prevent this?
My data has missing data but to my knowledge, BART can handle this. My data are finite.
import pandas as pd
import numpy as np
import random
import bartpy
from bartpy.sklearnmodel import SklearnModel
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# simulate df with 46 features and 9000 rows
# create binary vars and make a df
label=np.random.randint(2, size=9000)
df = pd.DataFrame({'label':label})
df['a']=np.random.randint(2, size=9000)
# create integers
df['b'] = np.random.randint(low=50, high=96, size=9000)
df['b'] = np.random.randint(low=4, high=97, size=9000)
df['c'] = np.random.randint(low=0, high=1759.22, size=9000)
df['d'] = np.random.randint(low=0, high=5702.2, size=9000)
df['e'] = np.random.randint(low=0, high=7172.31, size=9000)
# create numerics
df['f'] = np.random.uniform(0, 908.56, 9000)
df['f'] = np.random.uniform(0,908.56, 9000)
df['g'] = np.random.uniform(0,2508.78, 9000)
df['h'] = np.random.uniform(0,3757.56, 9000)
df['i'] = np.random.uniform(0,560.18, 9000)
df['j'] = np.random.uniform(0,1362.71, 9000)
df['k'] = np.random.uniform(0,2578.26, 9000)
df['l'] = np.random.uniform(175.07,997, 9000)
df['m'] = np.random.uniform(992.39,3972.81, 9000)
df['n'] = np.random.uniform(1787.24,5823.21, 9000)
df['o'] = np.random.uniform(-56,53, 9000)
df['p'] = np.random.uniform(-47,46, 9000)
df['q'] = np.random.uniform(-1089.03,1546.87, 9000)
df['r'] = np.random.uniform(-1599.14,898.79, 9000)
df['s'] = np.random.uniform(-2871.02,5329, 9000)
df['t'] = np.random.uniform(-4231.44,2481.55, 9000)
df['u'] = np.random.uniform(-3435.9,5824.22, 9000)
df['v'] = np.random.uniform(-5086.6,4548.43, 9000)
df['w'] = np.random.uniform(-406.57,907.91, 9000)
df['x'] = np.random.uniform(-834.82,840.27, 9000)
df['y'] = np.random.uniform(-549.2,2506.29, 9000)
df['z'] = np.random.uniform(-1547.2,2434.18, 9000)
df['aa'] = np.random.uniform(-426.6,3636.17, 9000)
df['bb'] = np.random.uniform(-2819.8,3390, 9000)
df['cc'] = np.random.uniform(-266.75,527.81, 9000)
df['dd'] = np.random.uniform(-778.64,527.81, 9000)
df['ee'] = np.random.uniform(-476.09,1358.32, 9000)
df['ff'] = np.random.uniform(-1890.91,919.3, 9000)
df['gg'] = np.random.uniform(-1633.23,2577.01, 9000)
df['hh'] = np.random.uniform(-2427.93,2078.78, 9000)
df['ii'] = np.random.uniform(-339.67,518.32, 9000)
df['jj'] = np.random.uniform(-528.07,412, 9000)
df['kk'] = np.random.uniform(-1460.23,1610.58, 9000)
df['ll'] = np.random.uniform(-1984.08,1127.82, 9000)
df['mm'] = np.random.uniform(-2153.38,2402.24, 9000)
df['nn'] = np.random.uniform(-2311.27,1809.37, 9000)
df['oo'] = np.random.uniform(16,92, 9000)
df['pp'] = np.random.uniform(4,24, 9000)
df['qq'] = np.random.uniform(4,80, 9000)
df['rr'] = np.random.uniform(0,1, 9000)
# add missings to floats
# select only numeric columns to apply the missingness to
cols_list = df.select_dtypes('float64').columns.tolist()
# randomly remove cases from the dataframe
for col in df[cols_list]:
df.loc[df.sample(frac=0.02).index, col] = np.nan
# # 80/20 train test split
X_train, X_test, y_train, y_test = train_test_split(df.drop(['label'],axis=1), df['label'], train_size=0.7, random_state = 99)
# Modelling
model = SklearnModel(n_jobs = 30)
model.fit(X_train, y_train)
# Predictions
y_predictions = model.predict(X_test)
np.isnan(y_predictions).sum()
Hi,
Thank you for bart-py!
My BART model is predicting NaN for some cases. Does anyone know why this happens? or how I can prevent this?
My data has missing data but to my knowledge, BART can handle this. My data are finite.
Thank you!
Code:
(Sorry for the lengthy data generation)