import glob
import math
import sys

import numpy as np
import pandas as pd
import torch

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

from ambiguity_solver_network import prepareDataSet, DuplicateClassifier, Normalise
17 """Read the dataset from the different file, remove the pure duplicate tracks and combine the datasets"""
19 @param[in] CKS_files: DataFrame contain the data from each track files (1 file per events usually)
20 @return: combined DataFrame containing all the track, ordered by events and then by truth particle ID in each event
25 datafile = pd.read_csv(f)
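
# A minimal sketch (toy frames, hypothetical column names) of the combination step above:
# each per-event CSV becomes one DataFrame and the frames are stacked into a single table.
def _demo_combine_files():
    ev0 = pd.DataFrame({"event_id": [0, 0], "eta": [0.1, 0.2]})
    ev1 = pd.DataFrame({"event_id": [1], "eta": [1.5]})
    print(pd.concat([ev0, ev1]))  # three tracks, events kept in reading order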
33 """Prepare the data"""
35 @param[in] data: input DataFrame to be prepared
36 @return: array of the network input and the corresponding truth
39 target_column =
"good/duplicate/fake"
41 y = LabelEncoder().
fit(data[target_column]).
transform(data[target_column])
48 "truthMatchProbability",
56 x_cat = OrdinalEncoder().fit_transform(input.select_dtypes(
"object"))
57 x = np.concatenate((x_cat, input), axis=1)
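
# A minimal sketch of what the encoders above do, using a toy DataFrame with a hypothetical
# categorical feature: LabelEncoder maps the truth labels to integers and OrdinalEncoder maps
# the string-typed input columns to floats before they are fed to the network.
def _demo_encoding():
    toy = pd.DataFrame(
        {
            "good/duplicate/fake": ["good", "duplicate", "fake"],
            "detector": ["pixel", "strip", "pixel"],  # hypothetical categorical column
            "nMeasurements": [12, 11, 5],
        }
    )
    y = LabelEncoder().fit_transform(toy["good/duplicate/fake"])
    features = toy.drop(columns=["good/duplicate/fake"])
    x_cat = OrdinalEncoder().fit_transform(features.select_dtypes("object"))
    print(y)      # [2 0 1], classes are ordered alphabetically: duplicate, fake, good
    print(x_cat)  # one encoded column: pixel -> 0.0, strip -> 1.0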

def clusterTracks(
    event: pd.DataFrame, DBSCAN_eps: float = 0.07, DBSCAN_min_samples: int = 2
) -> pd.DataFrame:
    """
    Cluster together all the tracks that appear to belong to the same truth particle.
    To cluster the tracks, a DBSCAN is used first, followed by a sub-clustering based on the hits shared by tracks.
    """
    """
    @param[in] event: input DataFrame that contains all the tracks in one event
    @param[in] DBSCAN_eps: minimum radius used by the DBSCAN to cluster tracks together
    @param[in] DBSCAN_min_samples: minimum number of tracks needed for DBSCAN to create a cluster
    @return: DataFrame identical to the input with an added column containing the cluster ID
    """
    # Perform the DBSCAN clustering on the track directions (eta, phi)
    trackDir = event[["eta", "phi"]].to_numpy()
    clustering = DBSCAN(eps=DBSCAN_eps, min_samples=DBSCAN_min_samples).fit(trackDir)
    event["cluster"] = clustering.labels_
    # Sort by cluster ID, then by number of measurements within each cluster
    sorted = event.sort_values(["cluster", "nMeasurements"], ascending=[True, False])
    updatedCluster = []
    cluster_hits = sorted.loc[:, ("Hits_ID", "cluster")]
    # Split each DBSCAN cluster into sub-clusters of tracks that share hit IDs
    for key, frame in cluster_hits.groupby("cluster"):
        clusterarray = frame.to_numpy()
        clusterarray = subClustering(clusterarray, key, key)
        updatedCluster.extend(clusterarray[:, 1])
    sorted.loc[:, ("cluster")] = updatedCluster
    # Turn the cluster IDs back into consecutive integers
    sorted = sorted.sort_values("cluster")
    clusterarray = sorted.loc[:, ("Hits_ID", "cluster")].to_numpy()
    clusterarray = renameCluster(clusterarray)
    sorted.loc[:, ("cluster")] = clusterarray[:, 1]
    return sorted

def subClustering(clusterarray: np.ndarray, c: int, lastCluster: float) -> np.ndarray:
    """Sub-clustering algorithm: cluster together tracks that share hits (TODO: doesn't handle real shared hits)"""
    """
    @param[in] clusterarray: numpy array containing the hit IDs and the cluster ID
    @param[in] c: ID of the cluster we are working on
    @param[in] lastCluster: ID given to the last subcluster
    @return: numpy array with updated cluster IDs
    """
    # The new sub-cluster ID is the next representable float after lastCluster, so it stays below c + 1
    newCluster = math.nextafter(lastCluster, c + 1)
    if newCluster >= c + 1:
        raise RuntimeError(
            "Too many subclusters in the cluster, this shouldn't be possible."
        )
    hits_IDs = []
    set_IDs = set(hits_IDs)
    # Move to the new sub-cluster every track that shares a hit with the first not-yet-updated track
    for track in clusterarray:
        if track[1] == c:
            if hits_IDs == []:
                hits_IDs = track[0]
                set_IDs = set(hits_IDs)
            if set_IDs & set(track[0]):
                track[1] = newCluster
    # If no track was left in cluster c we are done, otherwise sub-cluster the remaining tracks
    if hits_IDs == []:
        return clusterarray
    clusterarray = subClustering(clusterarray, c, newCluster)
    return clusterarray
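
# A minimal sketch of the sub-cluster numbering used above: math.nextafter returns the next
# representable float after lastCluster towards c + 1, so every sub-cluster of DBSCAN cluster c
# keeps an ID in [c, c + 1) and never collides with the next cluster.
def _demo_subcluster_ids():
    c = 3
    first = math.nextafter(float(c), c + 1)  # the first sub-cluster ID, just above 3.0
    second = math.nextafter(first, c + 1)    # the next one, still below 4.0
    print(c < first < second < c + 1)        # True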
131 """Rename the cluster IDs to be int starting from 0"""
133 @param[in] clusterarray: numpy array containing the hits IDs and the cluster ID
134 @return: numpy array with updated cluster IDs
138 for track
in clusterarray:
139 if track[1] != last_id:
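
# A minimal sketch (toy IDs) of the renaming above: once the tracks are sorted by cluster, the
# float sub-cluster IDs produced by math.nextafter are replaced by consecutive integers.
def _demo_rename_ids():
    toy_ids = [3.0, math.nextafter(3.0, 4.0), 4.0]  # two sub-clusters of cluster 3, then cluster 4
    last_id, new_id, renamed = None, -1, []
    for cid in toy_ids:
        if cid != last_id:
            last_id = cid
            new_id += 1
        renamed.append(new_id)
    print(renamed)  # [0, 1, 2]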

# The sub-clustering is recursive: raise the recursion limit for events with many tracks
sys.setrecursionlimit(10**6)

CKF_files = sorted(glob.glob("odd_output" + "/event0000000[0-9][0-9]-tracks_ckf.csv"))
# Cluster the tracks of each event and store the per-event result
clusteredData.append(clustered)

# Load the pre-trained duplicate classifier network
duplicateClassifier = torch.load("duplicateClassifier.pt")

# Perform the MLP-based ambiguity resolution on each clustered event
cleanedData = []
for clusteredEvent in clusteredData:
    # Prepare the input features and run the network
    x, y = prepareInferenceData(clusteredEvent)
    x = torch.tensor(x, dtype=torch.float32)
    output_predict = duplicateClassifier(x).detach().numpy()
    # Write the network score to the dataset
    clusteredEvent["score"] = output_predict
    cleanedEvent = clusteredEvent
    # For each cluster keep only the track with the highest score
    idx = (
        cleanedEvent.groupby(["cluster"])["score"].transform(max)
        == cleanedEvent["score"]
    )
    cleanedEvent = cleanedEvent[idx]
    cleanedData.append(cleanedEvent)
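
# A minimal sketch (toy scores) of the selection above: transform(max) broadcasts each cluster's
# best score back onto every row of that cluster, so comparing it with the row's own score keeps
# the highest-scored track of each cluster.
def _demo_best_track_per_cluster():
    toy = pd.DataFrame(
        {"cluster": [0, 0, 1], "score": [0.9, 0.4, 0.7], "track_id": [10, 11, 12]}
    )
    keep = toy.groupby(["cluster"])["score"].transform(max) == toy["score"]
    print(toy[keep])  # keeps track_id 10 and 12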

# Compute the performance of the algorithm
nb_part = 0
nb_track = 0
nb_fake = 0
nb_duplicate = 0

nb_good_match = 0
nb_reco_part = 0
nb_reco_fake = 0
nb_reco_duplicate = 0
nb_reco_track = 0

for clusteredEvent, cleanedEvent in zip(clusteredData, cleanedData):
    # Statistics before the ambiguity resolution (the tracks are indexed by truth particle ID,
    # so the number of unique indices counts the truth particles)
    nb_part += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_track += clusteredEvent.shape[0]
    nb_fake += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_duplicate += clusteredEvent.loc[
        clusteredEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]
    # Statistics after the ambiguity resolution
    nb_good_match += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "good"
    ].shape[0]
    nb_reco_fake += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "fake"
    ].shape[0]
    nb_reco_duplicate += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] == "duplicate"
    ].shape[0]
    nb_reco_part += cleanedEvent.loc[
        cleanedEvent["good/duplicate/fake"] != "fake"
    ].index.nunique()
    nb_reco_track += cleanedEvent.shape[0]

print("===Initial efficiencies===")
print("nb particles : ", nb_part)
print("nb track : ", nb_track)
print("duplicate rate: ", 100 * nb_duplicate / nb_track, " %")
print("Fake rate: ", 100 * nb_fake / nb_track, " %")

print("===computed efficiencies===")
print("nb particles : ", nb_part)
print("nb good match : ", nb_good_match)
print("nb particle reco : ", nb_reco_part)
print("nb track reco : ", nb_reco_track)
print("Efficiency (good track) : ", 100 * nb_good_match / nb_part, " %")
print("Efficiency (particle reco) : ", 100 * nb_reco_part / nb_part, " %")
print(
    "duplicate rate: ",
    100 * ((nb_good_match + nb_reco_duplicate) - nb_reco_part) / nb_reco_track,
    " %",
)
print("Fake rate: ", 100 * nb_reco_fake / nb_reco_track, " %")

print("===computed speed===")
print("Clustering : ", (t2 - t1) * 1000 / len(CKF_files), "ms")
print("Inference : ", (t4 - t3) * 1000 / len(CKF_files), "ms")
print("tot : ", (end - start) * 1000 / len(CKF_files), "ms")

# Save the cleaned tracks, one output CSV per input file
for file, cleanedEvent in zip(CKF_files, cleanedData):
    newFile = file[:-4] + "-Cleaned.csv"
    cleanedEvent = cleanedEvent.sort_values("track_id")
    cleanedEvent.to_csv(path_or_buf=newFile)