Week 6 Notebook: Evaluating Model Performance and Robustness

Let’s take a look at the model’s performance and its dependence on variables that were not used in training.

import tensorflow.keras as keras
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import uproot
import yaml

with open('definitions.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to the Python dictionary format
    definitions = yaml.load(file, Loader=yaml.FullLoader)
    
features = definitions['features']
spectators = definitions['spectators']
labels = definitions['labels']

nfeatures = definitions['nfeatures']
nspectators = definitions['nspectators']
nlabels = definitions['nlabels']
ntracks = definitions['ntracks']
from DataGenerator import DataGenerator

Load the Previous Model

Here, we will load the last model trained in Notebook 5.

# load the best model saved during training in Notebook 5
from tensorflow.keras.models import load_model

keras_model_deepset = load_model('keras_model_deepset_best.h5')
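To confirm that the model was restored correctly, we can print a summary of its architecture (the exact layers depend on how the model was built in Notebook 5):

# optional: inspect the restored architecture
keras_model_deepset.summary()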

Configure Data Generator for Testing

We will configure the data generator to load the testing data, including “spectator variables” that were not used in the training but may be correlated with the output of the algorithm: specifically, the jet mass and jet pT.

# load testing file
test_files = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/test/ntuple_merged_0.root'
             ]
test_generator = DataGenerator(test_files, features, labels, spectators, batch_size=8192, n_dim=ntracks, 
                               remove_mass_pt_window=True, 
                               remove_unlabeled=True,
                               return_spectators=True,
                               max_entry=200000) # basically, no maximum
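As a quick sanity check before running inference, we can inspect the shapes of one batch. This sketch assumes the custom DataGenerator yields the feature array first and a (labels, spectators) tuple second, which matches how the batches are unpacked in the inference loop below:

# sanity check (assumes the batch structure used in the inference loop below)
X_batch, (y_batch, spec_batch) = next(iter(test_generator))
print('features:  ', X_batch.shape)     # expected (batch, ntracks, nfeatures)
print('labels:    ', y_batch.shape)     # expected (batch, nlabels)
print('spectators:', spec_batch.shape)  # expected (batch, nspectators)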
from tqdm.notebook import tqdm
# run model inference on test data set
predict_array_cnn = []
label_array_test = []
spec_array_test = []

for t in tqdm(test_generator,total=len(test_generator)):
    label_array_test.append(t[1][0])
    spec_array_test.append(t[1][1])
    predict_array_cnn.append(keras_model_deepset.predict(t[0]))
    
predict_array_cnn = np.concatenate(predict_array_cnn, axis=0)
label_array_test = np.concatenate(label_array_test, axis=0)
spec_array_test = np.concatenate(spec_array_test, axis=0)

# create ROC curves
fpr_cnn, tpr_cnn, threshold_cnn = roc_curve(label_array_test[:,1], predict_array_cnn[:,1])
    
# plot ROC curves
plt.figure()
plt.plot(tpr_cnn, fpr_cnn, lw=2.5, label="Deep Sets, AUC = {:.1f}%".format(auc(fpr_cnn,tpr_cnn)*100))
plt.xlabel(r'True positive rate')
plt.ylabel(r'False positive rate')
plt.semilogy()
plt.ylim(0.001, 1)
plt.xlim(0, 1)
plt.grid(True)
plt.legend(loc='upper left')
plt.show()
../_images/06-evaluate_8_2.png
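Beyond the full ROC curve, it is often convenient to quote a single working point. As an illustration (this particular number is not used later in the lesson), we can compute the background rejection, 1/FPR, at a signal efficiency of 50%:

# background rejection (1/FPR) at a 50% signal efficiency working point
target_tpr = 0.5
idx_wp = np.argmin(np.abs(tpr_cnn - target_tpr))
print('Signal efficiency: {:.2f}'.format(tpr_cnn[idx_wp]))
print('Background rejection: {:.1f}'.format(1.0 / max(fpr_cnn[idx_wp], 1e-9)))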
def find_nearest(array, value):
    # return the index of the array element closest to `value`, and that element
    idx = (np.abs(array - value)).argmin()
    return idx, array[idx]
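For example, to look up the output threshold corresponding to a 10% false positive rate (the tightest working point used in the plots below):

# threshold for the 10% false positive rate working point
idx_10, fpr_10 = find_nearest(fpr_cnn, 0.1)
print('FPR = {:.3f} at a threshold of {:.3f}'.format(fpr_10, threshold_cnn[idx_10]))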

Correlation with jet mass

This algorithm has a slight dependence on jet mass. Qualitatively, we can observe this “mass sculpting” by plotting the distribution of the jet mass with tighter and tighter requirements on the algorithm output. We can see that the original distribution of the jet mass for the background has no “resonance” or bump, and is basically smoothly falling.

However, once a requirement is made on the algorithm output, jets with low mass or high mass are excluded at higher rates.
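We can also try to quantify the sculpting. One option (an illustrative choice, not something computed elsewhere in this lesson) is the Jensen-Shannon distance between the inclusive QCD mass distribution and the distribution of QCD jets that pass a given cut; larger values indicate stronger sculpting:

from scipy.spatial.distance import jensenshannon

# compare the inclusive QCD mass shape to the shape after a 10% FPR cut
bins = np.linspace(40, 200, 21)
qcd = label_array_test[:, 0] == 1
idx_10, _ = find_nearest(fpr_cnn, 0.1)
passed = qcd & (predict_array_cnn[:, 1] > threshold_cnn[idx_10])

hist_all, _ = np.histogram(spec_array_test[qcd, 0], bins=bins, density=True)
hist_cut, _ = np.histogram(spec_array_test[passed, 0], bins=bins, density=True)
print('Jensen-Shannon distance: {:.3f}'.format(jensenshannon(hist_all, hist_cut)))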

plt.figure()
for wp in [1.0, 0.8, 0.5, 0.1]:
    idx, val = find_nearest(fpr_cnn, wp)
    plt.hist(spec_array_test[:,0], bins = np.linspace(40, 200, 21), 
             weights = label_array_test[:,0]*(predict_array_cnn[:,1] > threshold_cnn[idx]),
             alpha=0.4,density=True, label='QCD, {}% FPR cut'.format(int(wp*100)),linestyle='-')
plt.legend()
plt.xlabel(r'$m_{SD}$')
plt.ylabel(r'Normalized probability')
plt.xlim(40, 200)

plt.figure()
for wp in [1.0, 0.8, 0.5, 0.1]:
    idx, val = find_nearest(fpr_cnn, wp)
    plt.hist(spec_array_test[:,0], bins=np.linspace(40, 200, 21), 
             weights = label_array_test[:,1]*(predict_array_cnn[:,1] > threshold_cnn[idx]),
             alpha=0.4,density=True, label='H(bb), {}% FPR cut'.format(int(wp*100)),linestyle='-')
plt.legend()
plt.xlabel(r'$m_{SD}$')
plt.ylabel(r'Normalized probability')
plt.xlim(40, 200)

plt.figure()
plt.hist2d(spec_array_test[:,0][(label_array_test[:,0]==1) & (predict_array_cnn[:,1] > 0.1)], 
           predict_array_cnn[:,1][(label_array_test[:,0]==1) & (predict_array_cnn[:,1] > 0.1)], 
           bins=(30, 30), 
           cmap=plt.cm.jet)
plt.colorbar()
plt.title('QCD')
plt.ylabel(r'Deep Set Output')
plt.xlabel(r'$m_{SD}$')

plt.figure()
plt.hist2d(spec_array_test[:,0][(label_array_test[:,1]==1) & (predict_array_cnn[:,1] > 0.8)], 
           predict_array_cnn[:,1][(label_array_test[:,1]==1) & (predict_array_cnn[:,1] > 0.8)], 
           bins=(30, 30), 
           cmap=plt.cm.jet)
plt.colorbar()
plt.title('H(bb)')
plt.ylabel(r'Deep Set Output')
plt.xlabel(r'$m_{SD}$')

plt.show()
../_images/06-evaluate_11_0.png ../_images/06-evaluate_11_1.png ../_images/06-evaluate_11_2.png ../_images/06-evaluate_11_3.png

Dependence on jet pT

We can repeat the exercise, this time looking at the jet pT. We see that, in general, higher-pT jets are promoted to be more “signal-like” by the algorithm. This is likely because high-pT jets are over-represented in the signal sample compared to the background sample.
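To make this more quantitative (again, an illustrative check rather than part of the original lesson), we can compute the QCD mistag rate in bins of jet pT at a fixed threshold, here the 10% FPR working point; a perfectly pT-independent tagger would give a flat set of values:

# QCD mistag rate vs jet pT at the 10% FPR threshold
idx_10, _ = find_nearest(fpr_cnn, 0.1)
cut = threshold_cnn[idx_10]
qcd = label_array_test[:, 0] == 1
pt_bins = np.linspace(300, 2000, 18)
for lo, hi in zip(pt_bins[:-1], pt_bins[1:]):
    in_bin = qcd & (spec_array_test[:, 1] > lo) & (spec_array_test[:, 1] <= hi)
    if in_bin.sum() == 0:
        continue
    mistag = (predict_array_cnn[in_bin, 1] > cut).mean()
    print('pT in ({:.0f}, {:.0f}]: mistag rate = {:.3f}'.format(lo, hi, mistag))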

plt.figure()
for wp in [1.0, 0.8, 0.5, 0.1]:
    idx, val = find_nearest(fpr_cnn, wp)
    plt.hist(spec_array_test[:,1], bins=np.linspace(300, 2000, 51), 
             weights = label_array_test[:,0]*(predict_array_cnn[:,1] > threshold_cnn[idx]),
             alpha=0.4,density=True, label='QCD, {}% FPR cut'.format(int(wp*100)),linestyle='-')

plt.legend()
plt.xlabel(r'$p_{T}$')
plt.ylabel(r'Normalized probability')
plt.xlim(300, 2000)

plt.figure()
for wp in [1.0, 0.8, 0.5, 0.1]:
    idx, val = find_nearest(fpr_cnn, wp)
    plt.hist(spec_array_test[:,1], bins=np.linspace(300, 2000, 51), 
             weights = label_array_test[:,1]*(predict_array_cnn[:,1] > threshold_cnn[idx]),
             alpha=0.4,density=True, label='H(bb), {}% FPR cut'.format(int(wp*100)),linestyle='-')

plt.legend()
plt.xlabel(r'$p_{T}$')
plt.ylabel(r'Normalized probability')
plt.xlim(300, 2000)
plt.show()
../_images/06-evaluate_13_0.png ../_images/06-evaluate_13_1.png