Week 5 Notebook: Building a Deep Learning Model

Now we’ll build deep learning models based on low-level track features: first a simple fully connected network, then a Deep Sets classifier.

import tensorflow.keras as keras
import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import uproot
import tensorflow
import yaml

with open('definitions.yml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to the Python dictionary format
    definitions = yaml.load(file, Loader=yaml.FullLoader)
    
features = definitions['features']
spectators = definitions['spectators']
labels = definitions['labels']

nfeatures = definitions['nfeatures']
nspectators = definitions['nspectators']
nlabels = definitions['nlabels']
ntracks = definitions['ntracks']

Data Generators

A quick aside on data generators. Training on large datasets is a key component of many deep learning approaches (especially in high energy physics), and these datasets often no longer fit in memory, so it is important to write a data generator that can fetch data from disk automatically, batch by batch.

Here we modify one from: https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
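
The pattern is a tf.keras.utils.Sequence subclass implementing __len__ and __getitem__. Below is a minimal, illustrative sketch (it returns random placeholder arrays; the real DataGenerator used next reads from the ROOT files):

import numpy as np
import tensorflow as tf

class MinimalGenerator(tf.keras.utils.Sequence):
    """Minimal sketch of a Keras data generator (illustrative only)."""
    def __init__(self, n_samples=10000, batch_size=1024):
        self.n_samples = n_samples    # in practice, count the entries in the files
        self.batch_size = batch_size

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(self.n_samples / self.batch_size))

    def __getitem__(self, index):
        # fetch one batch; a real generator reads the corresponding
        # entries from disk here instead of generating random arrays
        start = index * self.batch_size
        stop = min(start + self.batch_size, self.n_samples)
        X = np.random.rand(stop - start, 60, 48)
        y = np.random.randint(0, 2, size=(stop - start, 2))
        return X, y

    def on_epoch_end(self):
        pass  # e.g., shuffle the entry order between epochs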

from DataGenerator import DataGenerator
help(DataGenerator)
Help on class DataGenerator in module DataGenerator:

class DataGenerator(tensorflow.python.keras.utils.data_utils.Sequence)
 |  DataGenerator(list_files, features, labels, spectators, batch_size=1024, n_dim=60, remove_mass_pt_window=False, remove_unlabeled=True, return_spectators=False, max_entry=20000, scale_mass_pt=[1, 1])
 |  
 |  Generates data for Keras
 |  
 |  Method resolution order:
 |      DataGenerator
 |      tensorflow.python.keras.utils.data_utils.Sequence
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, index)
 |      Generate one batch of data
 |  
 |  __init__(self, list_files, features, labels, spectators, batch_size=1024, n_dim=60, remove_mass_pt_window=False, remove_unlabeled=True, return_spectators=False, max_entry=20000, scale_mass_pt=[1, 1])
 |      Initialization
 |  
 |  __len__(self)
 |      Denotes the number of batches per epoch
 |  
 |  on_epoch_end(self)
 |      Updates indexes after each epoch
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from tensorflow.python.keras.utils.data_utils.Sequence:
 |  
 |  __iter__(self)
 |      Create a generator that iterate over the Sequence.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from tensorflow.python.keras.utils.data_utils.Sequence:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
# create training and validation generators
train_files = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_10.root']
val_files = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/train/ntuple_merged_11.root']


train_generator = DataGenerator(train_files, features, labels, spectators, batch_size=1024, n_dim=ntracks, 
                                remove_mass_pt_window=False, 
                                remove_unlabeled=True, max_entry=8000)

val_generator = DataGenerator(val_files, features, labels, spectators, batch_size=1024, n_dim=ntracks, 
                                remove_mass_pt_window=False, 
                                remove_unlabeled=True, max_entry=2000)

Test Data Generator

Note that the track array has a different shape: (batch size, tracks, features). There are also fewer samples than the requested batch_size=1024 because we remove unlabeled samples.

X, y = train_generator[1]
print(X.shape)
print(y.shape)
(951, 60, 48)
(951, 2)

Note that this generator can be optimized further (e.g., by caching the data files locally). I/O is often the bottleneck when training large networks on remote data.
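
One simple optimization, sketched below, is to copy each remote file to local disk once before training (this assumes the XRootD copy client, xrdcp, is available on the system):

import os
import subprocess

def cache_locally(remote_files, cache_dir='./data'):
    """Copy remote ROOT files to local disk once; return the local paths."""
    os.makedirs(cache_dir, exist_ok=True)
    local_files = []
    for remote in remote_files:
        local = os.path.join(cache_dir, os.path.basename(remote))
        if not os.path.exists(local):
            # xrdcp is the XRootD copy client
            subprocess.run(['xrdcp', remote, local], check=True)
        local_files.append(local)
    return local_files

# e.g., train_generator = DataGenerator(cache_locally(train_files), ...)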

Fully Connected Neural Network Classifier

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Flatten
import tensorflow.keras.backend as K

# define dense keras model
inputs = Input(shape=(ntracks, nfeatures,), name='input')  
x = BatchNormalization(name='bn_1')(inputs)
x = Flatten(name='flatten_1')(x)
x = Dense(64, name='dense_1', activation='relu')(x)
x = Dense(32, name='dense_2', activation='relu')(x)
x = Dense(32, name='dense_3', activation='relu')(x)
outputs = Dense(nlabels, name='output', activation='softmax')(x)
keras_model_dense = Model(inputs=inputs, outputs=outputs)
keras_model_dense.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(keras_model_dense.summary())
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           [(None, 60, 48)]          0         
_________________________________________________________________
bn_1 (BatchNormalization)    (None, 60, 48)            192       
_________________________________________________________________
flatten_1 (Flatten)          (None, 2880)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                184384    
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 32)                1056      
_________________________________________________________________
output (Dense)               (None, 2)                 66        
=================================================================
Total params: 187,778
Trainable params: 187,682
Non-trainable params: 96
_________________________________________________________________
None
2021-10-23 20:18:06.884631: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
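
As a quick check of the summary above: after flattening, dense_1 sees 60 × 48 = 2880 inputs, and bn_1 keeps four parameters per input feature (scale, offset, moving mean, moving variance), of which the two moving statistics are non-trainable:

# sanity check on the parameter counts in the summary
print(ntracks * nfeatures * 64 + 64)  # dense_1: 2880 weights per unit + biases = 184384
print(4 * nfeatures, 2 * nfeatures)   # bn_1: 192 parameters, 96 of them non-trainable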
# define callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
reduce_lr = ReduceLROnPlateau(patience=5, factor=0.5)
model_checkpoint = ModelCheckpoint('keras_model_dense_best.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, model_checkpoint, reduce_lr]

# fit keras model
history_dense = keras_model_dense.fit(train_generator,
                                      validation_data=val_generator,
                                      steps_per_epoch=len(train_generator),
                                      validation_steps=len(val_generator),
                                      max_queue_size=5,
                                      epochs=20,
                                      shuffle=False,
                                      callbacks=callbacks,
                                      verbose=0)
# reload best weights
keras_model_dense.load_weights('keras_model_dense_best.h5')
2021-10-23 20:18:13.529609: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-10-23 20:18:13.530329: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2593905000 Hz
plt.figure()
plt.plot(history_dense.history['loss'], label='Loss')
plt.plot(history_dense.history['val_loss'], label='Val. loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()
[Figure: training and validation loss vs. epoch for the dense model]

Deep Sets Classifier

This model uses the Dense layer of Keras, but applied per track it is really the Deep Sets architecture applied to jets, the so-called particle-flow network approach [11, 14]. We apply the same fully connected neural network to each track, and the GlobalAveragePooling1D layer then aggregates over the tracks (it takes the mean rather than the sum).
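
Schematically, the model computes $f(\{x_1, \dots, x_N\}) = \rho\left(\frac{1}{N}\sum_{i=1}^{N}\phi(x_i)\right)$, where $\phi$ is the shared per-track network (dense_1 to dense_3), the mean over tracks is the pooling, and $\rho$ is the network after pooling (dense_4 and the output layer). Because the mean is symmetric, the output is invariant to the ordering of the tracks.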

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, GlobalAveragePooling1D
import tensorflow.keras.backend as K

# define Deep Sets model with Dense Keras layer
inputs = Input(shape=(ntracks, nfeatures,), name='input')  
x = BatchNormalization(name='bn_1')(inputs)
x = Dense(64, name='dense_1', activation='relu')(x)
x = Dense(32, name='dense_2', activation='relu')(x)
x = Dense(32, name='dense_3', activation='relu')(x)
# sum over tracks
x = GlobalAveragePooling1D(name='pool_1')(x)
x = Dense(100, name='dense_4', activation='relu')(x)
outputs = Dense(nlabels, name='output', activation='softmax')(x)
keras_model_deepset = Model(inputs=inputs, outputs=outputs)
keras_model_deepset.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(keras_model_deepset.summary())
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           [(None, 60, 48)]          0         
_________________________________________________________________
bn_1 (BatchNormalization)    (None, 60, 48)            192       
_________________________________________________________________
dense_1 (Dense)              (None, 60, 64)            3136      
_________________________________________________________________
dense_2 (Dense)              (None, 60, 32)            2080      
_________________________________________________________________
dense_3 (Dense)              (None, 60, 32)            1056      
_________________________________________________________________
pool_1 (GlobalAveragePooling (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               3300      
_________________________________________________________________
output (Dense)               (None, 2)                 202       
=================================================================
Total params: 9,966
Trainable params: 9,870
Non-trainable params: 96
_________________________________________________________________
None
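
As a quick sanity check of the permutation invariance (reusing the batch X fetched from train_generator earlier): reordering the tracks should leave the Deep Sets output unchanged, while the flatten-based dense model is sensitive to the track ordering.

# permutation-invariance check: shuffle the track axis of one batch
perm = np.random.permutation(ntracks)
X_perm = X[:, perm, :]  # same jets, tracks reordered
print(np.allclose(keras_model_deepset.predict(X[:5]),
                  keras_model_deepset.predict(X_perm[:5]), atol=1e-4))  # True
print(np.allclose(keras_model_dense.predict(X[:5]),
                  keras_model_dense.predict(X_perm[:5]), atol=1e-4))    # generally False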
# define callbacks
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

early_stopping = EarlyStopping(monitor='val_loss', patience=5)
reduce_lr = ReduceLROnPlateau(patience=5, factor=0.5)
model_checkpoint = ModelCheckpoint('keras_model_deepset_best.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, model_checkpoint, reduce_lr]

# fit keras model
history_deepset = keras_model_deepset.fit(train_generator, 
                                          validation_data=val_generator, 
                                          steps_per_epoch=len(train_generator), 
                                          validation_steps=len(val_generator),
                                          max_queue_size=5,
                                          epochs=20, 
                                          shuffle=False,
                                          callbacks=callbacks, 
                                          verbose=0)
# reload best weights
keras_model_deepset.load_weights('keras_model_deepset_best.h5')
plt.figure()
plt.plot(history_deepset.history['loss'], label='Loss')
plt.plot(history_deepset.history['val_loss'], label='Val. loss')
plt.xlabel('Epoch')
plt.legend()
plt.show()
[Figure: training and validation loss vs. epoch for the Deep Sets model]
# load testing file
test_files = ['root://eospublic.cern.ch//eos/opendata/cms/datascience/HiggsToBBNtupleProducerTool/HiggsToBBNTuple_HiggsToBB_QCD_RunII_13TeV_MC/test/ntuple_merged_0.root']
test_generator = DataGenerator(test_files, features, labels, spectators, batch_size=1024, n_dim=ntracks, 
                               remove_mass_pt_window=True, 
                               remove_unlabeled=True)
# run model inference on test data set
predict_array_dense = []
predict_array_deepset = []
label_array_test = []

for t in test_generator:
    label_array_test.append(t[1])
    predict_array_dense.append(keras_model_dense.predict(t[0]))
    predict_array_deepset.append(keras_model_deepset.predict(t[0]))
    
    
predict_array_dense = np.concatenate(predict_array_dense, axis=0)
predict_array_deepset = np.concatenate(predict_array_deepset, axis=0)
label_array_test = np.concatenate(label_array_test, axis=0)


# create ROC curves
fpr_dense, tpr_dense, threshold_dense = roc_curve(label_array_test[:,1], predict_array_dense[:,1])
fpr_deepset, tpr_deepset, threshold_deepset = roc_curve(label_array_test[:,1], predict_array_deepset[:,1])
    
# plot ROC curves
plt.figure()
plt.plot(tpr_dense, fpr_dense, lw=2.5, label="Dense, AUC = {:.1f}%".format(auc(fpr_dense, tpr_dense)*100))
plt.plot(tpr_deepset, fpr_deepset, lw=2.5, label="Deep Sets, AUC = {:.1f}%".format(auc(fpr_deepset, tpr_deepset)*100))
plt.xlabel(r'True positive rate')
plt.ylabel(r'False positive rate')
plt.semilogy()
plt.ylim(0.001, 1)
plt.xlim(0, 1)
plt.grid(True)
plt.legend(loc='upper left')
plt.show()
[Figure: ROC curves comparing the dense and Deep Sets models]

We see that the more structurally aware Deep Sets model does better than the simple fully connected neural network approach, despite having far fewer parameters (about 10k versus 188k).
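
Beyond the AUC, a common figure of merit is the background mistag rate (false positive rate) at a fixed signal efficiency (true positive rate). A minimal sketch, reading the working point off the ROC arrays computed above:

# FPR at a fixed 50% TPR; roc_curve returns tpr in increasing order
def fpr_at_tpr(fpr, tpr, target_tpr=0.5):
    idx = min(np.searchsorted(tpr, target_tpr), len(fpr) - 1)
    return fpr[idx]

print('Dense:     FPR = {:.4f} at TPR = 0.5'.format(fpr_at_tpr(fpr_dense, tpr_dense)))
print('Deep Sets: FPR = {:.4f} at TPR = 0.5'.format(fpr_at_tpr(fpr_deepset, tpr_deepset)))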