Analysis Software
Documentation for sPHENIX simulation software
TauVsDIS_MachineLearning_Differentiation.py
Source listing for this file; the newest version is available in the sPHENIX GitHub repository. The script spot-checks a set of scikit-learn classifiers for separating tau events from DIS events using per-event track observables.
# Load libraries
import pandas
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.tree import ExtraTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
from pandas.plotting import andrews_curves
from pandas.plotting import parallel_coordinates
from pandas.plotting import radviz
import seaborn as sns
import numpy as np

# Load dataset
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
#names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
path = "./data/JetSummary_p250_e20_1000events_r05.csv"
#names = ['n_Total', 'n_Above_0p001', 'n_Above_0p01', 'n_Above_0p1', 'n_Above_1', 'n_Above_10', 'eta_avg', 'eta_std', 'phi_avg', 'phi_std', 'Delta_eta_avg', 'Delta_eta_std', 'Delta_phi_avg', 'Delta_phi_std', 'Delta_eta_avg_w', 'Delta_eta_std_w', 'Delta_phi_avg_w', 'Delta_phi_std_w', 'towerenergy_sum','class']
names = ['n_track','charge_tot','eta','vertex','class']
dataset = pandas.read_csv(path, names=names)
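# NOTE (assumption): passing names= to read_csv implies the CSV has no
# header row; the last column holds the class label (e.g. 'tau' vs 'DIS',
# as used further below).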

# shape
#print((dataset.shape)[0])

# head
#print(dataset.head(20))

# descriptions and visualizations
#print(dataset.describe())
#andrews_curves(dataset, 'class')
#parallel_coordinates(dataset, 'class')
#radviz(dataset, 'class')

# class distribution
#print(dataset.groupby('class').size())

# box and whisker plots
#dataset.plot(kind='box', subplots=True, layout=(19,19), sharex=False, sharey=False)
#plt.show()

# histograms
#dataset.hist()
#plt.show()

# scatter plot matrix
#sns.pairplot(dataset, hue="class")
#plt.show()
#plt.savefig('destination_path.eps', format='eps', dpi=1000)

# Split-out validation dataset
array = dataset.values
#X = array[:,0:19]
#Y = array[:,19]
X = array[:,0:4]
Y = array[:,4]
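# Columns 0-3 (n_track, charge_tot, eta, vertex) are the features X;
# column 4 ('class') is the target Y.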
validation_size = 0.60
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)

# Recombine features and labels so each split can be written out as a table
training_set = np.column_stack((X_train, Y_train))
print(training_set)
print(training_set.dtype)
validation_set = np.column_stack((X_validation, Y_validation))

#np.savetxt("./data/JetSummary_1000_training.csv", training_set, delimiter=", ")
#np.savetxt("./data/JetSummary_1000_validation.csv", validation_set, fmt='%i, %i, %i, %i, %i, %i, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %s')
#np.savetxt("./data/JetSummary_1000_training.csv", training_set, fmt='%i, %i, %i, %i, %i, %i, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %f, %s')
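# NOTE: the commented-out fmt strings above list 20 fields, matching the
# 19-feature layout of the commented names list earlier, not the current
# 5-column (4 features + class) dataset.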

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot Check Algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
#models.append(('QDA', QuadraticDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
#models.append(('SVM', SVC()))
models.append(('SVMlin', SVC(kernel="linear", C=0.025)))
#models.append(('SVMpoly', SVC(kernel="poly")))
#models.append(('SVMg2', SVC(gamma=2, C=1)))
#models.append(('Neural', MLPClassifier()))
models.append(('RFC', RandomForestClassifier()))
models.append(('ADA', AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=None)))
models.append(('ADA3', AdaBoostClassifier(n_estimators=100, learning_rate=0.5, random_state=None)))
#models.append(('EXT', ExtraTreesClassifier()))
#models.append(('RC', RidgeClassifier()))
#models.append(('RNeigh', RadiusNeighborsClassifier(200)))
#models.append(('ccCV', CalibratedClassifierCV()))
#models.append(('DTC', DecisionTreeClassifier()))
#models.append(('ETC', ExtraTreeClassifier()))
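# The uncommented entries form the spot-check suite; the commented ones are
# alternatives that can be re-enabled. The two AdaBoost entries compare the
# default setting (50 estimators, learning rate 1.0) against a larger,
# slower-learning ensemble (100 estimators, learning rate 0.5).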

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
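# Each printed line gives the model's mean 10-fold cross-validation
# accuracy on the training split, with the standard deviation across
# folds in parentheses.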

# Compare Algorithms
#fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
#ax = fig.add_subplot(111)
#plt.boxplot(results)
#ax.set_xticklabels(names)

# Make predictions on validation dataset
ada = AdaBoostClassifier()
ada.fit(X_train, Y_train)
predictions = ada.predict(X_validation)

#LR = LogisticRegression(class_weight={'DIS':3,'tau':1})
#LR = LogisticRegression()
#LR.fit(X_train, Y_train)
#predictions = LR.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
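# A minimal sketch (not part of the original script) that renders the
# confusion matrix as a seaborn heatmap; it reuses ada, Y_validation and
# predictions from above, and orders both axes by ada.classes_.
cm = confusion_matrix(Y_validation, predictions, labels=ada.classes_)
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=ada.classes_, yticklabels=ada.classes_)
plt.xlabel('Predicted class')
plt.ylabel('True class')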

plt.show()