Comparing ordinal with usual classification

In this notebook we use the scikit-learn diabetes dataset to compare the LGBMOrdinal, LGBMClassifier, and logistic regression models. We convert the continuous label to ordered classes by binning it using quantiles.

We then train and test the models several times with different train/test splits and evaluate their mean absolute deviation instead of accuracy. This metric penalises wrong predictions that are further from the true label more heavily than those which are closer.

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from ordinalgbt.lgb import LGBMOrdinal
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[1], line 3
      1 import numpy as np
      2 import pandas as pd
----> 3 from lightgbm import LGBMClassifier
      4 from sklearn.datasets import load_diabetes
      5 from sklearn.linear_model import LinearRegression, LogisticRegression
ModuleNotFoundError: No module named 'lightgbm'
# Load the diabetes data and discretise the continuous target into
# nq ordered quantile bins (q1 .. q10), then take the integer codes.
data = load_diabetes()
X = pd.DataFrame(data["data"], columns=data["feature_names"])
y = data["target"]

nq = 10
# Interior quantiles give the cut points; pad with min-1 / max+1 so that
# every observation falls strictly inside some bin.
inner_cuts = np.quantile(y, np.arange(0, 1, 1 / nq)[1:])
thresholds = np.concatenate(([y.min() - 1], inner_cuts, [y.max() + 1]))
bin_labels = ['q' + str(z + 1) for z in range(nq)]
yq = pd.cut(x=y, bins=thresholds, right=True, labels=bin_labels)
yord = yq.astype('category').codes
# Fix: `nsim` and `holder` were never defined, so this cell raised a
# NameError on a fresh kernel (Restart & Run All). Define both here.
nsim = 50    # number of random train/test splits to average over
holder = []  # one single-row DataFrame of MAEs per simulation
for ii in range(nsim):
  # Stratified 80/20 train/test split, seeded by the loop index so each
  # simulation is reproducible.
  ytrain, ytest, Xtrain, Xtest = train_test_split(yord, X, stratify=yord,
                                                  test_size=0.2,
                                                  random_state=ii)
  # Ordinal gradient-boosting model
  mdl_ord = LGBMOrdinal()
  mdl_ord.fit(Xtrain, ytrain)
  # Multinomial LGBM model
  mdl_class = LGBMClassifier()
  mdl_class.fit(Xtrain, ytrain)
  # Multinomial (logistic) regression model
  mdl_multi = LogisticRegression(penalty='l2', solver='lbfgs', max_iter=1000)
  mdl_multi.fit(Xtrain, ytrain)
  # Predict ordinal class codes on the held-out set
  yhat_ord = mdl_ord.predict(Xtest)
  yhat_multi = mdl_multi.predict(Xtest)
  yhat_class = mdl_class.predict(Xtest)
  # Mean absolute deviation between predicted and true class codes
  # (renamed from acc_* — this is an error, not an accuracy)
  mae_class = np.abs(yhat_class - ytest).mean()
  mae_multi = np.abs(yhat_multi - ytest).mean()
  mae_ord = np.abs(yhat_ord - ytest).mean()
  holder.append(pd.DataFrame({'ord': mae_ord, 'multi': mae_multi,
                              'class': mae_class}, index=[ii]))
# Average the per-simulation MAEs into one row per model and attach
# human-readable model labels before printing.
df_mae = (
    pd.concat(holder)
    .mean(axis=0)
    .reset_index()
    .rename(columns={'index': 'mdl', 0: 'MAE'})
)
di_lbls = {'ord': 'LGBMOrdinal',
           'multi': 'SKlearn Multinomial',
           'class': 'LGBMClassifier'}
df_mae = df_mae.assign(mdl=lambda d: d.mdl.map(di_lbls))
print(np.round(df_mae, 1))