"""Train and evaluate one of several scikit-learn classifiers from the CLI.

The script takes two positional arguments: a preprocessing action, which is
forwarded to ``preprocessing.parse_data``, and a model key (see ``_MODELS``).
"""

from sys import argv

from numpy import mean
from pandas import DataFrame
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from preprocessing import parse_data, split_k_sets

# Model key -> (description shown to the user, zero-argument estimator factory).
# Single source of truth for both model construction and the help listings.
_MODELS = {
    "gnb": ("Gaussian Naive Bayes", GaussianNB),
    "svc": (
        "Linear Support Vector Classification",
        lambda: LinearSVC(random_state=42),
    ),
    "knn": ("K-neighbors", lambda: KNeighborsClassifier(n_neighbors=10)),
    "tree": ("Decision tree", lambda: DecisionTreeClassifier(random_state=42)),
    "neuralnet": ("MLP Classifier", lambda: MLPClassifier(hidden_layer_sizes=10)),
}


def _print_model_choices():
    """Print one ``key: description`` line per supported model."""
    for key, (description, _factory) in _MODELS.items():
        print(key + ": " + description)


def choose_model(model):
    """Return a new, unfitted estimator for the given model key.

    Args:
        model: One of the keys of ``_MODELS`` ("gnb", "svc", "knn", "tree",
            "neuralnet").

    Returns:
        A freshly constructed scikit-learn estimator.

    Raises:
        SystemExit: With status 1, after printing the valid choices, when
            ``model`` is not a known key.
    """
    entry = _MODELS.get(model) if isinstance(model, str) else None
    if entry is None:
        print("Unknown model selected. The choices are: ")
        _print_model_choices()
        # Non-zero status so shells and callers can detect the failure.
        raise SystemExit(1)
    _description, factory = entry
    return factory()


def predict_data(data, target, model):
    """Fit the selected model fold by fold and print averaged metrics.

    For every ``(train_index, test_index)`` pair produced by
    ``split_k_sets(data)`` the estimator is fitted on the training rows and
    scored on the test rows. The per-fold accuracy, confusion matrix and AUC
    are averaged, a separate 10-fold ``cross_val_score`` is computed, and
    everything is handed to ``evaluate_performance`` for printing.

    Args:
        data: Feature table; must support ``.iloc`` positional indexing.
        target: Labels aligned with ``data``; must support ``.iloc``.
        model: Model key understood by ``choose_model``.
    """
    # Keep the key in ``model`` and the estimator in its own name: rebinding
    # ``model`` to the estimator made the "knn" comparison below always false.
    estimator = choose_model(model)
    if model == "knn":
        # scale() returns a plain ndarray; rewrap it so the ``.iloc`` indexing
        # used below keeps working and the row index still matches ``target``.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)

    accuracy_scores = []
    confusion_matrices = []
    auc_scores = []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        expected = target.iloc[test_index]
        prediction = estimator.predict(data.iloc[test_index])
        accuracy_scores.append(accuracy_score(expected, prediction))
        confusion_matrices.append(confusion_matrix(expected, prediction))
        # NOTE(review): AUC is computed from hard class predictions, not from
        # decision scores/probabilities - confirm this is intended.
        auc_scores.append(roc_auc_score(expected, prediction))

    cv_score = cross_val_score(estimator, data, target, cv=10)
    evaluate_performance(
        # Element-wise mean of the per-fold matrices.
        confusion_matrix=mean(confusion_matrices, axis=0),
        accuracy=mean(accuracy_scores),
        cv_score=mean(cv_score),
        auc=mean(auc_scores),
    )


def evaluate_performance(confusion_matrix, accuracy, cv_score, auc):
    """Print the evaluation metrics to standard output.

    Args:
        confusion_matrix: Matrix to print. The parameter name shadows the
            imported ``confusion_matrix`` function inside this function only.
        accuracy: Accuracy value to print.
        cv_score: Cross-validation score to print.
        auc: AUC value to print.
    """
    print("Accuracy Score: " + str(accuracy))
    print("Confusion matrix: ")
    print(str(confusion_matrix))
    print("Cross validation score: " + str(cv_score))
    print("AUC: " + str(auc))


def usage():
    """Print command-line help and terminate the program.

    Raises:
        SystemExit: Always, with status 1 (usage is only shown on bad input).
    """
    print("Usage: " + argv[0] + " PREPROCESSING_ACTION MODEL")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    print("models:")
    _print_model_choices()
    raise SystemExit(1)


def main():
    """Parse the two CLI arguments, load the data set and run the evaluation."""
    if len(argv) != 3:
        usage()

    data, target = parse_data(source="data/mamografia.csv", action=str(argv[1]))
    predict_data(data=data, target=target, model=str(argv[2]))


if __name__ == "__main__":
    main()