Analysis Software
Documentation for sPHENIX simulation software
train_ambiguity_solver.py
Go to the documentation of this file, or view the newest version of train_ambiguity_solver.py in the sPHENIX GitHub repository.
import glob

import pandas as pd
import numpy as np

import torch.nn as nn
import torch.nn.functional as F
import torch.utils
from torch.utils.tensorboard import SummaryWriter

from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise

avg_mean = [0, 0, 0, 0, 0, 0, 0, 0]
avg_sdv = [0, 0, 0, 0, 0, 0, 0, 0]
events = 0


def readDataSet(CKS_files: list[str]) -> pd.DataFrame:
    """Read the dataset from the different files, remove the pure duplicate tracks and combine the datasets"""
    """
    @param[in] CKS_files: list of the CSV track files to read (usually one file per event)
    @return: combined DataFrame containing all the tracks, ordered by event and then by truth particle ID within each event
    """
    globalindex = 0
    data = pd.DataFrame()
    for f in CKS_files:
        datafile = pd.read_csv(f)
        # At this point we don't distinguish between fake and duplicate tracks
        datafile.loc[
            datafile["good/duplicate/fake"] == "fake", "good/duplicate/fake"
        ] = "duplicate"
        datafile = prepareDataSet(datafile)
        # Combine the datasets
        data = pd.concat([data, datafile])
    return data


def prepareTrainingData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Prepare the data"""
    """
    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and the corresponding truth
    """
    # Remove the truth and the variables not used as input
    target_column = "good/duplicate/fake"
    # Separate the truth from the input variables
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    input = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
        ]
    )
    # Compute the normalisation factors
    scale = StandardScaler()
    scale.fit(input.select_dtypes("number"))
    # Variables used to compute the normalisation
    global avg_mean
    avg_mean = avg_mean + scale.mean_
    global avg_sdv
    avg_sdv = avg_sdv + scale.var_
    global events
    events = events + 1
    # Prepare the input features
    x_cat = OrdinalEncoder().fit_transform(input.select_dtypes("object"))
    x = np.concatenate((x_cat, input), axis=1)
    return x, y


def batchSplit(data: pd.DataFrame, batch_size: int) -> list[pd.DataFrame]:
    """Split the data into batches, each containing @batch_size truth particles (the number of corresponding tracks may vary)"""
    """
    @param[in] data: input data to be cut into batches
    @param[in] batch_size: number of truth particles per batch
    @return: list of batches; each element corresponds to one batch
    """
    batch = []
    pid = data[0][0]
    n_particle = 0
    id_prev = 0
    id = 0
    for index, row, truth in zip(data[0], data[1], data[2]):
        if index != pid:
            pid = index
            n_particle += 1
            if n_particle == batch_size:
                b = data[0][id_prev:id], data[1][id_prev:id], data[2][id_prev:id]
                batch.append(b)
                n_particle = 0
                id_prev = id
        id += 1
    return batch


def computeLoss(
    score_good: torch.Tensor,
    score_duplicate: list[torch.Tensor],
    batch_loss: torch.Tensor,
    margin: float = 0.05,
) -> torch.Tensor:
    """Compute one loss for each duplicate track associated with the particle"""
    """
    @param[in] score_good: score returned by the model for the good track associated with this particle
    @param[in] score_duplicate: list of the scores of all duplicate tracks associated with this particle
    @param[in] margin: margin used in the computation of the MarginRankingLoss
    @return: the updated loss
    """
    # Compute the losses using the MarginRankingLoss with respect to the good track score
    batch_loss = batch_loss
    if score_duplicate:
        for s in score_duplicate:
            batch_loss += F.relu(s - score_good + margin) / len(score_duplicate)
    return batch_loss


def scoringBatch(batch: list[pd.DataFrame], Optimiser=0) -> tuple[int, int, float]:
    """Run the MLP on a batch and compute the corresponding efficiency and loss. If an optimiser is specified, train the MLP."""
    """
    @param[in] batch: list of batches; each element corresponds to one batch
    @param[in] Optimiser: optimiser for the MLP; if one is specified the network is trained on the batch
    @return: the number of particles, the number of particles for which the good track was found, and the loss
    """
    # number of particles
    nb_part = 0
    # number of particles associated with a good track
    nb_good_match = 0
    # loss for the batch
    loss = 0
    # best track score for a particle
    max_score = 0
    # is the best score associated with the good track
    max_match = 0
    # loop over all the batches
    for b_data in batch:
        # ID of the current particle
        pid = b_data[0][0]
        # loss for the current batch
        batch_loss = 0
        # score of the good track
        score_good = 0
        # scores of the duplicate tracks
        score_duplicate = []
        if Optimiser:
            Optimiser.zero_grad()
        input = torch.tensor(b_data[1], dtype=torch.float32)
        prediction = duplicateClassifier(input)
        # loop over all the tracks in the batch
        for index, pred, truth in zip(b_data[0], prediction, b_data[2]):
            # If we are changing particle, update the loss
            if index != pid:
                # Starting a new particle, compute the loss for the previous one
                if max_match == 1:
                    nb_good_match += 1
                batch_loss = computeLoss(
                    score_good, score_duplicate, batch_loss, margin=0.05
                )
                nb_part += 1
                # Reinitialise the variables for the next particle
                pid = index
                score_duplicate = []
                score_good = 0
                max_score = 0
                max_match = 0
            # Store the track scores
            if truth:
                score_good = pred
            else:
                score_duplicate.append(pred)
            # Prepare the efficiency computation
            if pred == max_score:
                max_match = 0
            if pred > max_score:
                max_score = pred
                max_match = truth
        # Compute the loss for the last particle when reaching the end of the batch
        if max_match == 1:
            nb_good_match += 1
        batch_loss = computeLoss(score_good, score_duplicate, batch_loss, margin=0.05)
        nb_part += 1
        # Normalise the loss to the batch size
        batch_loss = batch_loss / len(b_data[0])
        loss += batch_loss
        # Perform the gradient descent if an optimiser was specified
        if Optimiser:
            batch_loss.backward()
            Optimiser.step()
    loss = loss / len(batch)
    return nb_part, nb_good_match, loss


def train(
    duplicateClassifier: DuplicateClassifier,
    data: tuple[np.ndarray, np.ndarray, np.ndarray],
    epochs: int = 20,
    batch: int = 32,
    validation: float = 0.3,
) -> DuplicateClassifier:
    """Training of the MLP"""
    """
    @param[in] duplicateClassifier: model to be trained
    @param[in] data: tuple containing three lists; each element corresponds to a given track and holds the truth particle ID, the track parameters and the truth
    @param[in] epochs: number of epochs the model will be trained for
    @param[in] batch: size of the batches used in the training
    @param[in] validation: fraction of the batches used for validation
    @return: trained model
    """
    # Prepare tensorboard for the training plots
    # use 'tensorboard --logdir=runs' to access the plots afterwards
    writer = SummaryWriter()
    opt = torch.optim.Adam(duplicateClassifier.parameters())
    # Split the data into batches
    batch = batchSplit(data, batch)
    val_batch = int(len(batch) * (1 - validation))
    # Loop over all the epochs
    for epoch in range(epochs):
        print("Epoch : ", epoch, " / ", epochs)
        loss = 0.0
        nb_part = 0.0
        nb_good_match = 0.0

        # Train the network over the training batches
        nb_part, nb_good_match, loss = scoringBatch(batch[:val_batch], Optimiser=opt)
        print("Loss/train : ", loss, " Eff/train : ", nb_good_match / nb_part)
        writer.add_scalar("Loss/train", loss, epoch)
        writer.add_scalar("Eff/train", nb_good_match / nb_part, epoch)

        # If using validation, compute the efficiency and loss over the validation batches
        if validation > 0.0:
            nb_part, nb_good_match, loss = scoringBatch(batch[val_batch:])
            writer.add_scalar("Loss/val", loss, epoch)
            writer.add_scalar("Eff/val", nb_good_match / nb_part, epoch)
            print("Loss/val : ", loss, " Eff/val : ", nb_good_match / nb_part)

    writer.close()
    return duplicateClassifier


# ==================================================================

# ttbar events used as the training input; here we assume 1000 events are available
CKF_files = sorted(glob.glob("odd_output" + "/event0000000[0-7][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)

# Prepare the data
x_train, y_train = prepareTrainingData(data)

avg_mean = [x / events for x in avg_mean]
avg_sdv = [x / events for x in avg_sdv]

# Create our model
input_dim = np.shape(x_train)[1]
layers_dim = [10, 15, 10]

duplicateClassifier = nn.Sequential(
    Normalise(avg_mean, avg_sdv), DuplicateClassifier(input_dim, layers_dim)
)

# Train the model and save it
input = data.index, x_train, y_train
train(duplicateClassifier, input, epochs=20, batch=128, validation=0.3)
duplicateClassifier.eval()
input_test = torch.tensor(x_train, dtype=torch.float32)
torch.save(duplicateClassifier, "duplicateClassifier.pt")
torch.onnx.export(
    duplicateClassifier,
    input_test,
    "duplicateClassifier.onnx",
    input_names=["x"],
    output_names=["y"],
    dynamic_axes={"x": {0: "batch_size"}, "y": {0: "batch_size"}},
)
# ==================================================================

# ttbar events for the test; here we assume 1000 events are available
CKF_files_test = sorted(
    glob.glob("odd_output" + "/event0000000[8-9][0-9]-tracks_ckf.csv")
)
test = readDataSet(CKF_files_test)

# Prepare the data
x_test, y_test = prepareTrainingData(test)

# Write the network scores to a list
output_predict = []

x_test = torch.tensor(x_test, dtype=torch.float32)
for x in x_test:
    output_predict.append(duplicateClassifier(x))

# For the first 100 tracks print the ID, score and truth
for sample_test, sample_predict, sample_true in zip(
    test.index[0:100], output_predict[0:100], y_test[0:100]
):
    print(sample_test, sample_predict, sample_true)

id = 0
pid = test.index[0]
nb_part = 1
nb_good_match = 0
max_match = 0
max_score = 0

# Compute the efficiency
for index, pred, truth in zip(test.index, output_predict, y_test):
    if index != pid:
        if max_match == 1:
            nb_good_match += 1
        pid = index
        nb_part += 1
        max_score = 0
        max_match = 0
    if pred == max_score:
        max_match = 0
    if pred > max_score:
        max_score = pred
        max_match = truth
nb_part += 1
if max_match == 1:
    nb_good_match += 1

print("nb particles : ", nb_part)
print("nb good match : ", nb_good_match)
print("Efficiency: ", 100 * nb_good_match / nb_part, " %")
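
The script saves the trained classifier both as duplicateClassifier.pt and as duplicateClassifier.onnx, with input name "x" and output name "y" set in the torch.onnx.export call above. The following minimal sketch, which is not part of train_ambiguity_solver.py and assumes the onnxruntime package is installed, shows how the exported ONNX model could be scored; the zero-filled input batch is purely illustrative.

# Minimal sketch, not part of train_ambiguity_solver.py: score tracks with the exported ONNX model.
# Assumes the onnxruntime package is available; "x" and "y" are the names used in torch.onnx.export above.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession("duplicateClassifier.onnx")

# One row per track; the number of columns must match the input_dim used at training time.
n_features = session.get_inputs()[0].shape[1]
dummy_tracks = np.zeros((4, n_features), dtype=np.float32)  # illustrative batch of 4 tracks

# One score per track; after training, the good track of a particle should receive the highest score.
(scores,) = session.run(["y"], {"x": dummy_tracks})
print(scores.shape, scores.flatten())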