Week 4 Notebook: Simple Classifiers

This week, we’re going to build some simple classifiers to distinguish Higgs boson (H → bb) jets from QCD background jets: a decision tree, a linear support vector machine, and a fully connected neural network.

import tensorflow.keras as keras
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import uproot
import utils
import yaml

with open('definitions_simple.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to the Python dictionary format
    definitions = yaml.load(file, Loader=yaml.FullLoader)
    
features = definitions['features']
spectators = definitions['spectators']
labels = definitions['labels']

nfeatures = definitions['nfeatures']
nspectators = definitions['nspectators']
nlabels = definitions['nlabels']
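As a quick sanity check, we can print the counts that the rest of the notebook relies on (the feature names themselves come from definitions_simple.yml):

# quick sanity check on the loaded definitions
print(nfeatures, nlabels)  # 27 input features and 2 labels, matching the model summary below
print(features[:3])        # the first few feature branch names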

Let’s use the helper function utils.get_features_labels to load the features, labels, and spectator variables from the training file.

# load training file
feature_array, label_array, spec_array = utils.get_features_labels('root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_10.root', 
                                                                   features, spectators, labels,
                                                                   remove_mass_pt_window=False,
                                                                   entry_stop=20000)
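For reference, here is a rough sketch of what such a loader does. This is illustrative only: it assumes the ntuple stores one scalar branch per feature under deepntuplizer/tree, and the real utils.get_features_labels in the course repository additionally cleans the arrays and can apply the mass/pT window selection.

# illustrative sketch only -- NOT the actual utils.get_features_labels
def get_features_labels_sketch(path, features, spectators, labels, entry_stop=None):
    # 'deepntuplizer/tree' is an assumption about the file layout
    tree = uproot.open(path)['deepntuplizer/tree']
    arrays = tree.arrays(features + spectators + labels,
                         entry_stop=entry_stop, library='np')
    feature_array = np.stack([arrays[f] for f in features], axis=1)
    spec_array = np.stack([arrays[s] for s in spectators], axis=1)
    label_array = np.stack([arrays[l] for l in labels], axis=1)
    return feature_array, label_array, spec_array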

Decision Tree Classifier

from sklearn import tree
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(feature_array, label_array[:,1])
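As a quick check, we can look at the tree’s accuracy on the training set (optimistic by construction, since the tree was fit on these same events):

# training accuracy -- an optimistic estimate, useful only as a sanity check
print('Decision tree training accuracy: {:.3f}'.format(clf.score(feature_array, label_array[:,1])))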

Support Vector Machine Classifier

from sklearn import linear_model

# SGDClassifier with its default hinge loss fits a linear SVM
# via stochastic gradient descent
svm = linear_model.SGDClassifier()
svm.fit(feature_array, label_array[:,1])
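Gradient-based linear models are sensitive to the scale of their inputs (the neural network below handles this with a BatchNormalization layer). A variant worth trying, sketched here as an addition to the notebook, is to standardize the features first:

# optional variant: standardize the features before the SGD-trained linear SVM
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svm_scaled = make_pipeline(StandardScaler(), linear_model.SGDClassifier())
svm_scaled.fit(feature_array, label_array[:,1])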

Fully Connected Neural Network Classifier

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization

# define dense keras model
inputs = Input(shape=(nfeatures,), name='input')
x = BatchNormalization(name='bn_1')(inputs)  # standardizes the inputs
x = Dense(64, name='dense_1', activation='relu')(x)
x = Dense(32, name='dense_2', activation='relu')(x)
x = Dense(32, name='dense_3', activation='relu')(x)
# two-class softmax output; categorical_crossentropy expects the one-hot labels in label_array
outputs = Dense(nlabels, name='output', activation='softmax')(x)
keras_model = Model(inputs=inputs, outputs=outputs)
keras_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
keras_model.summary()  # summary() prints the table itself; no print() needed
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           [(None, 27)]              0         
_________________________________________________________________
bn_1 (BatchNormalization)    (None, 27)                108       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                1792      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
output (Dense)               (None, 2)                 66        
=================================================================
Total params: 5,102
Trainable params: 5,048
Non-trainable params: 54
_________________________________________________________________
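The parameter counts follow directly from the layer shapes: dense_1 has 27 × 64 + 64 = 1792 parameters, dense_2 has 64 × 32 + 32 = 2080, dense_3 has 32 × 32 + 32 = 1056, and the output layer has 32 × 2 + 2 = 66. The BatchNormalization layer carries 4 parameters per input feature (scale, offset, moving mean, moving variance), i.e. 108 in total; the 54 moving statistics are not updated by backpropagation, which accounts for the non-trainable parameters.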
# define callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# stop training when the validation loss has not improved for 20 epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=20)
# halve the learning rate after 5 epochs without improvement
reduce_lr = ReduceLROnPlateau(patience=5, factor=0.5)
# save the weights from the epoch with the lowest validation loss
model_checkpoint = ModelCheckpoint('keras_model_best.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, model_checkpoint, reduce_lr]

# fit keras model; validation_split=0.2 holds out the last 20% of the samples
history = keras_model.fit(feature_array, label_array, batch_size=1024,
                          epochs=100, validation_split=0.2, shuffle=False,
                          callbacks=callbacks, verbose=0)
# reload best weights
keras_model.load_weights('keras_model_best.h5')
plt.figure()
plt.plot(history.history['loss'],label='Loss')
plt.plot(history.history['val_loss'],label='Val. loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()
[Figure: training and validation loss vs. epoch]
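Since we requested the 'accuracy' metric at compile time, the same history object also lets us plot accuracy curves (a small addition to the original notebook):

plt.figure()
plt.plot(history.history['accuracy'], label='Accuracy')
plt.plot(history.history['val_accuracy'], label='Val. accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.show()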
# load testing file
feature_array_test, label_array_test, spec_array_test = utils.get_features_labels('root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/test/ntuple_merged_0.root', 
                                                                                  features, spectators, labels,
                                                                                  remove_mass_pt_window=True,
                                                                                  entry_stop=30000)
# run model inference on test data set
predict_array_nn = keras_model.predict(feature_array_test)[:,1]
predict_array_tree = clf.predict_proba(feature_array_test)[:,1]
predict_array_svm = svm.decision_function(feature_array_test)
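# note: the tree and the NN output class probabilities, but the hinge-loss
# SGDClassifier has no predict_proba, so we use its decision_function margin;
# roc_curve only needs a score that ranks signal above background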

# create ROC curves
fpr_tree, tpr_tree, threshold_tree = roc_curve(label_array_test[:,1], predict_array_tree)
fpr_svm, tpr_svm, threshold_svm = roc_curve(label_array_test[:,1], predict_array_svm)
fpr_nn, tpr_nn, threshold_nn = roc_curve(label_array_test[:,1], predict_array_nn)
    
# plot ROC curves
plt.figure()
plt.plot(tpr_tree, fpr_tree, lw=2.5, label="Tree, AUC = {:.1f}%".format(auc(fpr_tree,tpr_tree)*100))
plt.plot(tpr_svm, fpr_svm, lw=2.5, label="SVM, AUC = {:.1f}%".format(auc(fpr_svm,tpr_svm)*100))
plt.plot(tpr_nn, fpr_nn, lw=2.5, label="NN, AUC = {:.1f}%".format(auc(fpr_nn,tpr_nn)*100))
plt.xlabel(r'True positive rate')
plt.ylabel(r'False positive rate')
plt.semilogy()
plt.ylim(0.001,1)
plt.xlim(0,1)
plt.grid(True)
plt.legend(loc='upper left')
plt.show()
[Figure: ROC curves for the decision tree, SVM, and neural network classifiers]
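Beyond the AUC, a common way to compare taggers is the background mistag rate at a fixed signal efficiency, for example (an illustrative metric, not part of the original notebook):

# background mistag rate (FPR) at 50% signal efficiency; tpr from roc_curve
# is sorted in increasing order, so linear interpolation is safe
for name, fpr, tpr in [('Tree', fpr_tree, tpr_tree),
                       ('SVM', fpr_svm, tpr_svm),
                       ('NN', fpr_nn, tpr_nn)]:
    print('{}: FPR at TPR = 0.5 is {:.4f}'.format(name, np.interp(0.5, tpr, fpr)))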

Exercise: try to add a boosted decision tree and compare its ROC curve to the classifiers above.
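One possible starting point is scikit-learn’s GradientBoostingClassifier; the hyperparameters below are illustrative, not tuned:

from sklearn.ensemble import GradientBoostingClassifier

# gradient-boosted decision tree trained on the same binary labels
bdt = GradientBoostingClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
bdt.fit(feature_array, label_array[:,1])

# evaluate on the test set and compare via the ROC curve
predict_array_bdt = bdt.predict_proba(feature_array_test)[:,1]
fpr_bdt, tpr_bdt, threshold_bdt = roc_curve(label_array_test[:,1], predict_array_bdt)
print('BDT AUC = {:.1f}%'.format(auc(fpr_bdt, tpr_bdt)*100))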