Analysis Software
Documentation for sPHENIX simulation software
ambiguity_solver_full_chain.py
Go to the documentation of this file, or view the newest version of ambiguity_solver_full_chain.py in the sPHENIX GitHub repository.
import glob
import os
import math

import pandas as pd
import numpy as np

import torch

from sklearn.cluster import DBSCAN

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise

def readDataSet(CKS_files: list[str]) -> list[pd.DataFrame]:
    """
    Read the dataset from the different files, remove the pure duplicate tracks and collect the per-event datasets.

    @param[in] CKS_files: list of CSV track files (1 file per event usually)
    @return: list of DataFrames, one per event, with the tracks ordered by truth particle ID
    """
    data = []
    for f in CKS_files:
        datafile = pd.read_csv(f)
        datafile = prepareDataSet(datafile)
        # Combine dataset
        data.append(datafile)
    return data

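# Illustrative usage (a sketch; the glob pattern mirrors the driver section at the bottom of this file):
#   files = sorted(glob.glob("odd_output/event*-tracks_ckf.csv"))
#   events = readDataSet(files)  # one prepared DataFrame per event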
def prepareInferenceData(data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """
    Prepare the data for inference.

    @param[in] data: input DataFrame to be prepared
    @return: array of the network inputs and the corresponding truth
    """
    # Remove truth and useless variables
    target_column = "good/duplicate/fake"
    # Separate the truth from the input variables
    y = LabelEncoder().fit(data[target_column]).transform(data[target_column])
    inputs = data.drop(
        columns=[
            target_column,
            "track_id",
            "nMajorityHits",
            "nSharedHits",
            "truthMatchProbability",
            "Hits_ID",
            "chi2",
            "pT",
            "cluster",
        ]
    )
    # Prepare the input features: encode the categorical columns and
    # concatenate them with the input variables
    x_cat = OrdinalEncoder().fit_transform(inputs.select_dtypes("object"))
    x = np.concatenate((x_cat, inputs), axis=1)
    return x, y

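# Illustrative usage (a sketch mirroring the inference loop below):
#   x_test, y_test = prepareInferenceData(clusteredEvent)
#   # x_test: per-track network inputs, y_test: encoded good/duplicate/fake labels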
def clusterTracks(
    event: pd.DataFrame, DBSCAN_eps: float = 0.07, DBSCAN_min_samples: int = 2
) -> pd.DataFrame:
    """
    Cluster together all the tracks that appear to belong to the same truth particle.
    To cluster the tracks together, a DBSCAN is first used, followed by a sub-clustering based on hits shared by tracks.

    @param[in] event: input DataFrame that contains all the tracks in one event
    @param[in] DBSCAN_eps: minimum radius used by the DBSCAN to cluster tracks together
    @param[in] DBSCAN_min_samples: minimum number of tracks needed for DBSCAN to create a cluster
    @return: DataFrame identical to the input with an added column containing the cluster ID
    """
    # Perform the DBSCAN clustering and sort the DataFrame by cluster ID
    trackDir = event[["eta", "phi"]].to_numpy()
    clustering = DBSCAN(eps=DBSCAN_eps, min_samples=DBSCAN_min_samples).fit(trackDir)
    # Set "cluster" to 0 if you want to see the performance without DBSCAN
    event["cluster"] = clustering.labels_
    # event["cluster"] = 0
    sortedTracks = event.sort_values(
        ["cluster", "nMeasurements"], ascending=[True, False]
    )
    updatedCluster = []
    cluster_hits = sortedTracks.loc[:, ["Hits_ID", "cluster"]]
    # Further split each cluster into sub-clusters of tracks that share hit IDs
    for key, frame in cluster_hits.groupby("cluster"):
        clusterarray = frame.to_numpy()
        clusterarray = subClustering(clusterarray, key, key)
        updatedCluster.extend(clusterarray[:, 1])
    sortedTracks.loc[:, "cluster"] = updatedCluster
    # Turn the cluster IDs back into ints
    sortedTracks = sortedTracks.sort_values("cluster")
    clusterarray = sortedTracks.loc[:, ["Hits_ID", "cluster"]].to_numpy()
    clusterarray = renameCluster(clusterarray)
    sortedTracks.loc[:, "cluster"] = clusterarray[:, 1]
    return sortedTracks

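# Illustrative usage (a sketch; the arguments shown are the defaults, a smaller eps
# or a larger min_samples makes the eta/phi clustering stricter):
#   clustered = clusterTracks(event, DBSCAN_eps=0.07, DBSCAN_min_samples=2)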
def subClustering(clusterarray: np.ndarray, c: int, lastCluster: float) -> np.ndarray:
    """
    Sub-clustering algorithm: cluster together tracks that share hits (TODO: doesn't handle real shared hits).

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID
    @param[in] c: ID of the cluster we are working on
    @param[in] lastCluster: ID given to the last sub-cluster
    @return: numpy array with updated cluster IDs
    """
    # New cluster ID set to the next float after the last one (always stays below c + 1)
    newCluster = math.nextafter(lastCluster, c + 1)
    if newCluster >= c + 1:
        raise RuntimeError(
            "Too many sub-clusters in the cluster, this shouldn't be possible."
        )
    hits_IDs = []
    set_IDs = set(hits_IDs)
    # Update the cluster ID of all tracks sharing a hit with the first track that hasn't been updated yet
    for track in clusterarray:
        if track[1] == c:
            if hits_IDs == []:
                hits_IDs = track[0]
                set_IDs = set(hits_IDs)
            if set_IDs & set(track[0]):
                track[1] = newCluster
    # If all IDs have been updated, our work is done: return the updated array
    if hits_IDs == []:
        return clusterarray
    else:
        # Perform a new sub-clustering for the remaining tracks
        clusterarray = subClustering(clusterarray, c, newCluster)
        return clusterarray

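# Worked example of the float-increment sub-cluster IDs (illustrative):
#   math.nextafter(3.0, 4.0) is the smallest float strictly greater than 3.0, so the
#   sub-clusters of DBSCAN cluster 3 get IDs 3.0 < id1 < id2 < ... < 4.0 and never
#   collide with cluster 4; renameCluster() below maps them back to plain ints.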
def renameCluster(clusterarray: np.ndarray) -> np.ndarray:
    """
    Rename the cluster IDs to be consecutive ints starting from 0.

    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID
    @return: numpy array with updated cluster IDs
    """
    last_id = -1
    new_id = -1
    for track in clusterarray:
        if track[1] != last_id:
            last_id = track[1]
            new_id = new_id + 1
        track[1] = new_id
    return clusterarray

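# Worked example (illustrative): for tracks sorted by cluster ID with IDs
#   [2.0, 2.0, 2.0000000000000004, 5.0]
# renameCluster relabels them in place as [0, 0, 1, 2].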
# ==================================================================

import time

start = time.time()

import sys

# subClustering is recursive, so raise the recursion limit for events with many sub-clusters
sys.setrecursionlimit(10**6)

# ttbar events as test input
CKF_files = sorted(glob.glob("odd_output/event0000000[0-9][0-9]-tracks_ckf.csv"))
data = readDataSet(CKF_files)

# Data of each event after clustering
clusteredData = []
# Data of each event after ambiguity resolution
cleanedData = []

t1 = time.time()

# Cluster together tracks belonging to the same particle
for event in data:
    clustered = clusterTracks(event)
    clusteredData.append(clustered)

t2 = time.time()

# Load the trained duplicate classifier network
duplicateClassifier = torch.load("duplicateClassifier.pt")

t3 = time.time()

# Perform the MLP-based ambiguity resolution
for clusteredEvent in clusteredData:
    # Prepare the data
    x_test, y_test = prepareInferenceData(clusteredEvent)
    # Write the network score to a list
    output_predict = []
    for x in x_test:
        x = torch.tensor(x, dtype=torch.float32)
        output_predict.append(duplicateClassifier(x).item())

    clusteredEvent["score"] = output_predict
    cleanedEvent = clusteredEvent
    # For each cluster, only keep the track with the highest score
    idx = (
        cleanedEvent.groupby(["cluster"])["score"].transform("max")
        == cleanedEvent["score"]
    )
    cleanedEvent = cleanedEvent[idx]
    cleanedData.append(cleanedEvent)

t4 = time.time()

# Compute the algorithm performances
nb_part = 0
nb_track = 0
nb_fake = 0
nb_duplicate = 0

nb_good_match = 0
nb_reco_part = 0
nb_reco_fake = 0
nb_reco_duplicate = 0
nb_reco_track = 0

for clusteredEvent, cleanedEvent in zip(clusteredData, cleanedData):
    nb_part += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_track += clusteredEvent.shape[0]
    nb_fake += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_duplicate += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]

    nb_good_match += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "good"
    ].shape[0]
    nb_reco_fake += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_reco_duplicate += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]
    nb_reco_part += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_reco_track += cleanedEvent.shape[0]
end = time.time()

print("===Initial efficiencies===")
print("nb particles : ", nb_part)
print("nb track : ", nb_track)
print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %")
print("Fake rate: ", 100 * nb_fake / nb_track, " %")

print("===Computed efficiencies===")
print("nb particles : ", nb_part)
print("nb good match : ", nb_good_match)
print("nb particle reco : ", nb_reco_part)
print("nb track reco : ", nb_reco_track)
print("Efficiency (good track) : ", 100 * nb_good_match / nb_part, " %")
print("Efficiency (particle reco) : ", 100 * nb_reco_part / nb_part, " %")
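# Remaining duplicate rate: among the kept non-fake tracks, every track beyond the
# first one per reconstructed particle counts as a leftover duplicate.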
print(
    "duplicate rate: ",
    100 * ((nb_good_match + nb_reco_duplicate) - nb_reco_part) / nb_reco_track,
    " %",
)
print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %")

print("===Computed speed===")
print("Clustering : ", (t2 - t1) * 1000 / len(CKF_files), "ms")
print("Inference : ", (t4 - t3) * 1000 / len(CKF_files), "ms")
print("tot : ", (end - start) * 1000 / len(CKF_files), "ms")

for file, cleanedEvent in zip(CKF_files, cleanedData):
    newFile = file[:-4] + "-Cleaned.csv"
    cleanedEvent = cleanedEvent.sort_values("track_id")
    cleanedEvent.to_csv(path_or_buf=newFile)
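# Expected inputs and outputs (a sketch inferred from the paths used above): the script
# reads per-event odd_output/event*-tracks_ckf.csv files and a trained model saved as
# duplicateClassifier.pt (built from the classes in ambiguity_solver_network), and
# writes the selected tracks next to each input file as event*-tracks_ckf-Cleaned.csv.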