Lung Cancer Data Set
github link : https://github.com/amirshnll/Lung-Cancer
dataset link : http://archive.ics.uci.edu/ml/datasets/Lung+Cancer
# Author : Amir Shokri
# github link : https://github.com/amirshnll/Lung-Cancer
# dataset link : http://archive.ics.uci.edu/ml/datasets/Lung+Cancer
# email : amirsh.nll@gmail.com
1 2 3 4 5 6 7 8 9 10 11 | import numpy as np, matplotlib.pyplot as plt import pandas as pd from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score from sklearn import tree from sklearn.neural_network import MLPClassifier from sklearn.linear_model import LogisticRegression import seaborn as sns sns.set() |
def Read_Data(address, Name='*.csv', Sperator=';'):
    """Load a header-less delimited file and split it into features and labels.

    Column 0 of the file is treated as the class label; every other column
    is returned as a feature.

    Parameters
    ----------
    address : str
        Directory that contains the data file. A trailing path separator is
        optional because the path is built with ``os.path.join``.
    Name : str
        File name inside ``address``. It is handed to ``pandas.read_csv``
        unchanged; no glob expansion is performed here, so callers should
        pass a concrete file name rather than rely on the default.
    Sperator : str
        Field delimiter forwarded to ``pandas.read_csv`` as ``sep``.

    Returns
    -------
    tuple
        ``(X, Y)`` where ``X`` is the DataFrame without column 0 and ``Y``
        is column 0 as a Series.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import cell.
    import os

    # Plain string concatenation produced e.g. "datalc.csv" when `address`
    # lacked a trailing separator; join inserts one only when it is needed.
    path = os.path.join(address, Name)

    Data = pd.read_csv(path, sep=Sperator, header=None)
    X = Data.drop([0], axis=1)
    Y = Data.iloc[:, 0]
    return X, Y
def KNN_Plot(X, Y, n1, n2, knn_title):
    """Plot k-NN train and test accuracy for every k in ``[n1, n2)``.

    The data is split once (80% train / 20% test, stratified on ``Y``,
    ``random_state=42``). For each neighbour count a distance-weighted
    ``KNeighborsClassifier`` is fitted, and its train and test accuracy
    (percent, rounded to two decimals) are recorded and plotted.

    Parameters
    ----------
    X : features passed to ``train_test_split``.
    Y : labels passed to ``train_test_split`` (also used for stratification).
    n1 : first neighbour count to evaluate (inclusive).
    n2 : end of the neighbour range (exclusive).
    knn_title : str
        Title of the resulting figure.

    Returns
    -------
    KNeighborsClassifier
        The model fitted for the last neighbour count evaluated.

    Raises
    ------
    ValueError
        If ``n1 >= n2``, i.e. there is no neighbour count to evaluate.
    """
    # An empty range used to draw an empty figure and then fail with an
    # UnboundLocalError on `return knn_model`; reject it up front instead.
    if n1 >= n2:
        raise ValueError(
            'n1 must be smaller than n2 (got n1=%r, n2=%r)' % (n1, n2))

    neighbors = np.arange(n1, n2)
    train_acc = np.empty(len(neighbors))
    test_acc = np.empty(len(neighbors))

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=.2, random_state=42, stratify=Y)

    knn_model = None
    for i, k in enumerate(neighbors):
        knn_model = KNeighborsClassifier(
            n_neighbors=k, weights='distance', algorithm='auto', p=2)
        knn_model.fit(x_train, y_train)

        # Accuracies are stored as percentages rounded to two decimals.
        TAcc = knn_model.score(x_train, y_train) * 100
        train_acc[i] = float(format(TAcc, '.2f'))

        pred = knn_model.predict(x_test)
        Test_acc = accuracy_score(y_test, pred) * 100
        test_acc[i] = float(format(Test_acc, '.2f'))

    plt.plot(neighbors, train_acc, label='Train Accuracy')
    plt.plot(neighbors, test_acc, label='Test Accuracy')
    plt.legend(loc='best')
    plt.title(knn_title)
    plt.xlabel('Neighbors')
    plt.ylabel('Accuracy (%)')
    plt.xticks(neighbors)
    plt.show()

    return knn_model
def NB(x, y):
    """Train Gaussian naive Bayes on a stratified 80/20 split and report accuracy.

    Prints the test accuracy as a percentage rounded to two decimals and
    returns the unrounded percentage.
    """
    split = train_test_split(x, y, test_size=.2, random_state=42, stratify=y)
    features_fit, features_eval, labels_fit, labels_eval = split

    model = GaussianNB()
    model.fit(features_fit, labels_fit)
    predicted = model.predict(features_eval)

    accuracy_pct = accuracy_score(labels_eval, predicted) * 100

    print('GaussianNB Accuracy: ')
    print(float(f'{accuracy_pct:.2f}'))
    print('---------------------------------\n')

    return accuracy_pct
def Tree(X, Y):
    """Train a decision tree on a stratified 80/20 split and report accuracy.

    The classifier uses ``random_state=80`` and the split ``random_state=42``.
    Prints the test accuracy as a percentage rounded to two decimals and
    returns the unrounded percentage.
    """
    fit_X, eval_X, fit_Y, eval_Y = train_test_split(
        X, Y, test_size=.2, random_state=42, stratify=Y)

    classifier = tree.DecisionTreeClassifier(random_state=80)
    classifier.fit(fit_X, fit_Y)

    score_pct = accuracy_score(eval_Y, classifier.predict(eval_X)) * 100

    # One print call emitting the same three lines as separate prints would.
    print('DecisionTree Accuracy: ',
          float(f'{score_pct:.2f}'),
          '---------------------------------\n',
          sep='\n')

    return score_pct
def MLP(X, Y):
    """Train a multi-layer perceptron on a stratified 80/20 split and report accuracy.

    The network has a single hidden layer of 800 units, at most 1000
    iterations and ``random_state=50``. Prints the test accuracy as a
    percentage rounded to two decimals and returns the unrounded percentage.
    """
    parts = train_test_split(X, Y, test_size=.2, random_state=42, stratify=Y)
    train_inputs, test_inputs, train_targets, test_targets = parts

    network = MLPClassifier(
        hidden_layer_sizes=(800,),
        max_iter=1000,
        random_state=50,
    )
    network.fit(train_inputs, train_targets)

    guesses = network.predict(test_inputs)
    pct = accuracy_score(test_targets, guesses) * 100

    print('MLP Accuracy: ')
    print(float(f'{pct:.2f}'))
    print('---------------------------------\n')

    return pct
def LogisticRegressionClf(X, Y):
    """Train logistic regression on a stratified 80/20 split and report accuracy.

    Both the split and the classifier use ``random_state=50``; the classifier
    uses the ``lbfgs`` solver with at most 200 iterations. Prints the test
    accuracy as a percentage rounded to two decimals and returns the
    unrounded percentage.
    """
    X_fit, X_holdout, Y_fit, Y_holdout = train_test_split(
        X, Y, test_size=.2, random_state=50, stratify=Y)

    estimator = LogisticRegression(random_state=50, solver='lbfgs', max_iter=200)
    estimator.fit(X_fit, Y_fit)
    holdout_pred = estimator.predict(X_holdout)

    result_pct = accuracy_score(Y_holdout, holdout_pred) * 100

    # One print call emitting the same three lines as separate prints would.
    print('LogisticRegression Accuracy: ',
          float(f'{result_pct:.2f}'),
          '---------------------------------\n',
          sep='\n')

    return result_pct
# Load the semicolon-separated file 'lc.csv' from a hard-coded directory and
# show the resulting features (X) and labels (Y).
# NOTE(review): 'C:/' is machine-specific; point `address` at the directory
# that actually holds lc.csv.
address = 'C:/'
X, Y = Read_Data(address, Name='lc.csv', Sperator=';')
print(X,Y)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | 1 2 3 4 5 6 7 8 9 10 ... 47 48 49 50 51 52 53 \ 0 0 3 3 1 0 3 1 3 1 1 ... 2 2 2 2 2 2 2 1 0 3 3 2 0 3 3 3 1 1 ... 2 2 2 2 2 2 2 2 0 2 3 2 1 3 3 3 1 2 ... 2 2 2 2 2 2 2 3 0 3 2 1 1 3 3 3 2 2 ... 2 2 2 2 2 2 2 4 0 3 3 2 0 3 3 3 1 2 ... 2 2 2 2 2 2 2 5 0 3 2 1 0 3 3 3 1 2 ... 2 2 2 2 1 2 2 6 0 2 2 1 0 3 1 3 3 3 ... 2 2 1 2 2 2 2 7 0 3 1 1 0 3 1 3 1 1 ... 2 2 2 2 2 2 2 8 0 2 3 2 0 2 2 2 1 2 ... 2 2 2 1 3 2 1 9 0 2 2 0 0 3 2 3 1 1 ... 2 2 2 2 2 2 2 10 0 2 3 2 0 1 2 1 1 2 ... 2 2 2 2 2 1 1 11 0 2 1 1 0 1 2 2 1 2 ... 2 2 2 2 2 2 2 12 0 2 2 1 1 2 3 3 1 1 ... 2 2 2 2 2 1 1 13 0 3 2 2 1 2 2 2 1 1 ... 2 2 2 2 2 2 2 14 0 3 2 2 0 1 1 3 1 1 ... 2 2 2 2 2 2 2 15 0 2 1 1 0 2 1 3 1 1 ... 2 2 2 2 2 1 1 16 0 1 2 1 0 3 3 3 1 2 ... 2 2 2 2 2 1 1 17 0 3 3 2 0 2 1 3 1 1 ... 2 2 1 2 2 2 2 18 0 2 3 1 1 2 2 1 1 1 ... 3 3 3 3 1 3 3 19 0 2 3 1 1 1 2 1 1 1 ... 2 2 2 2 2 2 2 20 0 3 3 1 0 3 3 1 1 1 ... 2 2 2 2 3 2 2 21 0 2 2 2 0 2 1 2 1 1 ... 2 2 2 2 2 2 2 22 0 2 2 1 0 2 2 2 1 1 ... 3 3 2 2 3 2 2 23 0 3 2 2 0 2 2 2 1 1 ... 2 2 2 3 1 2 2 24 0 2 1 1 0 2 2 1 1 1 ... 2 2 3 2 2 2 2 25 0 2 3 2 1 2 2 3 1 1 ... 2 2 2 2 2 2 2 26 0 2 3 1 0 2 3 3 1 1 ... 2 2 2 2 2 2 2 54 55 56 0 1 2 2 1 2 1 2 2 2 2 2 3 1 2 2 4 2 1 2 5 2 1 2 6 1 2 2 7 1 2 2 8 1 2 2 9 2 2 2 10 2 2 1 11 1 2 2 12 1 2 2 13 2 2 2 14 1 2 2 15 1 2 2 16 2 2 1 17 2 1 2 18 2 2 1 19 2 2 1 20 2 2 1 21 1 2 1 22 2 2 1 23 2 2 2 24 2 2 1 25 1 2 2 26 2 2 2 [27 rows x 56 columns] 0 1 1 1 2 1 3 1 4 1 5 1 6 1 7 1 8 2 9 2 10 2 11 2 12 2 13 2 14 2 15 2 16 2 17 2 18 3 19 3 20 3 21 3 22 3 23 3 24 3 25 3 26 3 Name: 0, dtype: int64 |
# Sweep k-NN over neighbour counts 1..11 (the upper bound n2 is exclusive)
# and plot train/test accuracy for each.
n1 = 1
n2 = 12
knn_title = 'lung cancer knn Classifier'
KNN_Plot(X, Y, n1, n2, knn_title)
KNeighborsClassifier(n_neighbors=11, weights='distance')
# Evaluate Gaussian naive Bayes; NB prints the test accuracy and returns it
# as a percentage.
Accuracy = NB(X, Y)
GaussianNB Accuracy:
66.67
# Evaluate the decision tree; Tree prints the test accuracy and returns it
# as a percentage (overwrites the previous `Accuracy` value).
Accuracy = Tree(X, Y)
DecisionTree Accuracy:
83.33
# Evaluate the multi-layer perceptron; MLP prints the test accuracy and
# returns it as a percentage (overwrites the previous `Accuracy` value).
Accuracy = MLP(X, Y)
MLP Accuracy:
16.67
# Evaluate logistic regression; the function prints the test accuracy and
# returns it as a percentage.
# NOTE(review): the variable name is misspelled ("Accuraccy"); left as-is in
# case later cells refer to it.
LGR_Accuraccy = LogisticRegressionClf(X, Y)
LogisticRegression Accuracy:
50.0