Visualizing Machine Learning Models: Examples with Scikit-learn, XGB and Matplotlib


Plot ROC Curve and calculate AUC

For more detailed information on the ROC curve, see AUC and Calibrated models.

The ROC curve and the AUC (Area Under the Curve) are simple ways to summarize the results of a classifier.

The ROC curve shows how your model behaves at different false-positive rates, while the AUC is useful when you need to report a single number that indicates how good your model is.

import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# load data (load_my_data() is a placeholder for your own data-loading code)
X_train, X_test, y_train, y_test = load_my_data()

# model can be any trained classifier that supports predict_proba()
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_preds = clf.predict_proba(X_test)

# take the second column because the classifier outputs scores for
# the 0 class as well
preds = y_preds[:,1]

# fpr means false-positive-rate
# tpr means true-positive-rate
fpr, tpr, _ = metrics.roc_curve(y_test, preds)

auc_score = metrics.auc(fpr, tpr)

plt.title('ROC Curve')
plt.plot(fpr, tpr, label='AUC = {:.2f}'.format(auc_score))

# it's helpful to add a diagonal to indicate where chance 
# scores lie (i.e. just flipping a coin)
plt.plot([0,1],[0,1],'r--')

plt.xlim([-0.1,1.1])
plt.ylim([-0.1,1.1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

plt.legend(loc='lower right')
plt.show()

Figure: ROC curve with AUC score. Results may vary, especially in real-life problems; this is a dummy dataset.
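If you only need the number and not the plot, scikit-learn can also compute the AUC directly from the scores via metrics.roc_auc_score; for binary problems this is equivalent to the roc_curve/auc combination above:

from sklearn import metrics

# compute AUC directly from the positive-class scores; for binary
# classification this matches auc(fpr, tpr) in the example above
auc_score = metrics.roc_auc_score(y_test, preds)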

Plot ROC Curve for all runs of a Grid Search

It is useful to view the ROC curve and AUC for every run of a grid search.

See the full output in this Jupyter notebook

Here is one way to do it: create multiple plots using plt.subplots() and plot the results for each run, with the current grid configuration as the plot title.

import matplotlib.pyplot as plt
import math

from sklearn import metrics

from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification

# load data
X, y = make_classification(n_samples=10000, 
                           n_features=25, 
                           n_redundant=10, 
                           n_repeated=5,
                           class_sep=0.2, 
                           flip_y=0.1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

# our pipeline uses PCA for preprocessing, then
# a gradient boosting classifier
param_grid = [
    {
        'pca__n_components':[5,10,20],
        'clf__n_estimators':[5,20,50,100,200],
        'clf__max_depth':[1,2,3,4]
    }
]

pipeline = Pipeline([
    ('pca',PCA()),
    ('clf',GradientBoostingClassifier())
])

# now plotting:
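# one subplot will be drawn per grid configuration: num_rows is the
# number of rows needed to fit them all at num_cols plots per row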

num_cols = 3
num_rows = math.ceil(len(ParameterGrid(param_grid)) / num_cols)

# create a single figure
fig, axes = plt.subplots(num_rows, num_cols, sharey=True)
fig.set_size_inches(num_cols*5, num_rows*5)

for i,g in enumerate(ParameterGrid(param_grid)):

    pipeline.set_params(**g)
    pipeline.fit(X_train,y_train)

    y_preds = pipeline.predict_proba(X_test)

    # take the second column because the classifier outputs scores for
    # the 0 class as well
    preds = y_preds[:,1]

    # fpr means false-positive-rate
    # tpr means true-positive-rate
    fpr, tpr, _ = metrics.roc_curve(y_test, preds)

    auc_score = metrics.auc(fpr, tpr)

    ax = axes[i // num_cols, i % num_cols]

    # use only the parameter name (drop the pipeline step prefix)
    # or the title won't fit
    ax.set_title(str(["{}:{}".format(k.split('__')[1], v) for k, v in g.items()]), fontsize=9)
    ax.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc_score))
    ax.legend(loc='lower right')

    # it's helpful to add a diagonal to indicate where chance 
    # scores lie (i.e. just flipping a coin)
    ax.plot([0,1],[0,1],'r--')

    ax.set_xlim([-0.1,1.1])
    ax.set_ylim([-0.1,1.1])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')

fig.tight_layout()
plt.show()

The result will look like this:

Figure: one ROC plot is drawn for each configuration in the grid search. The title of each plot shows the grid configuration for that run.
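One detail worth handling: if you change the grid so that the number of configurations is not an exact multiple of num_cols, plt.subplots() will leave unused axes in the last row. A small sketch for hiding them, reusing the num_rows, num_cols and param_grid names from above:

# hide any leftover (empty) axes in the last row of the grid
num_configs = len(ParameterGrid(param_grid))
for j in range(num_configs, num_rows * num_cols):
    axes[j // num_cols, j % num_cols].set_visible(False)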

Plot variable importances for XGB

Using data from the Kaggle Titanic competition

Use zip(data.columns, clf.feature_importances_) to pair variable names (from a pandas dataframe) with their importances:

Full code can be found here: plot-importances-xgb.ipynb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

# this is the titanic train set
df = pd.read_csv('train.csv')

# let's just use 'Fare' and 'Age' features
data = df[['Age','Fare']]
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)

clf = XGBClassifier()
clf.fit(X_train,y_train)

# build importance data
importance_data = sorted(zip(data.columns, clf.feature_importances_),
                         key=lambda tpl: tpl[1],
                         reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]
ys = [y for (_,y) in importance_data]

# plot
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels)
plt.show()

Figure: variable importances for XGB, using just these two numeric variables.
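Note that xgboost also ships a built-in helper, xgboost.plot_importance, which draws a similar bar chart directly from the fitted model. A minimal sketch, reusing the clf trained above:

import matplotlib.pyplot as plt
from xgboost import plot_importance

# plot importances straight from the fitted model; when the model was
# trained on a plain numpy array (as above) features are labelled
# f0, f1, ... and, depending on the xgboost version, its default
# importance metric ('weight') may differ from feature_importances_
plot_importance(clf)
plt.show()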

Plot categorical, one-hot-encoded variable importances for XGB

Using data from the Kaggle Titanic competition

As above, we plot variable importances, but we also merge the one-hot-encoded columns back into their original variables.

XGB treats each one-hot-encoded column as a separate feature, but you likely want to see the total importance for each categorical variable as a whole.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

# this is the titanic train set
df = pd.read_csv('train.csv')

# select the columns we will use
df = df[['Pclass','Sex','SibSp','Embarked','Age','Fare','Survived']]

# one-hot-encode categorical variables
for col in ['Pclass','Sex','SibSp','Embarked']:
    dummies = pd.get_dummies(df[col], prefix=col, dummy_na=True)
    df = pd.concat([df, dummies], axis=1).drop([col], axis=1)

data=df.drop(['Survived'],axis=1)
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(data.values, target.values, test_size=0.4)

clf = XGBClassifier()
clf.fit(X_train,y_train)

At this point, we modify the feature importances dict so that we get the sum of the importances for each categorical variable:

importances_dict = dict(zip(data.columns,clf.feature_importances_))

categorical_columns_names = ['Pclass','Sex','SibSp','Embarked']

for column_name in categorical_columns_names:

    all_values_sum = 0

    # sum the importances of every one-hot column derived from this
    # variable, then replace them with a single aggregated entry
    for key, value in list(importances_dict.items()):
        if key.startswith(column_name):
            all_values_sum += value
            del importances_dict[key]

    importances_dict[column_name] = all_values_sum



# now we proceed the same as in the previous example, but using the
# modified importance we calculated above
importance_data = sorted(importances_dict.items(),
                         key=lambda tpl: tpl[1],
                         reverse=True)

xs = range(len(importance_data))
labels = [x for (x,_) in importance_data]
ys = [y for (_,y) in importance_data]

# plot
plt.bar(xs,ys,width=0.5)
plt.xticks(xs,labels)
plt.show()
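For reference, the same aggregation can be written more compactly with pandas. A sketch that assumes the prefix_value column names produced by pd.get_dummies above:

import pandas as pd

# map each one-hot column back to its parent variable (the part before
# the first '_'; plain numeric columns like 'Age' map to themselves)
# and sum the importances per parent
imp = pd.Series(clf.feature_importances_, index=data.columns)
parents = imp.index.to_series().str.split('_').str[0]
aggregated = imp.groupby(parents).sum().sort_values(ascending=False)

# aggregated.plot.bar() then draws the same kind of chart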

Figure: when we one-hot-encode categorical variables and use them in gradient boosting, importance is calculated individually for each possible category value.

Figure: after aggregating, the summed importance over all values in each category is shown, which is much easier to visualize.
