"""Fit several scikit-learn classifiers on the mammography data and plot the results.

For every model the script fits and predicts on each train/test split produced
by ``split_k_sets``, averages the per-split metrics, and writes a ROC plot, the
confusion matrices and per-attribute count plots to ``docs/assets``.
"""

import os
from sys import argv

# NOTE: star import kept from the original file; it provides figure, plot,
# subplots, show, legend, title, xlabel, ylabel, xticks and yticks.
from matplotlib.pyplot import *  # noqa: F401,F403
from numpy import arange, mean
from pandas import DataFrame, concat, cut
from seaborn import countplot, heatmap, set_style, set_theme
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from preprocessing import parse_data, split_k_sets

# Directory every figure is written to.
_ASSETS_DIR = "docs/assets"

# Short model name -> factory building a fresh, unfitted estimator.
_MODEL_FACTORIES = {
    "gnb": lambda: GaussianNB(),
    "svc": lambda: LinearSVC(random_state=42),
    "knn": lambda: KNeighborsClassifier(n_neighbors=10),
    "tree": lambda: DecisionTreeClassifier(random_state=42),
    "neuralnet": lambda: MLPClassifier(hidden_layer_sizes=10),
}


def choose_model(model):
    """Return a new estimator for the short name *model*.

    Raises:
        ValueError: if *model* is not one of the known short names.
    """
    try:
        factory = _MODEL_FACTORIES[model]
    except KeyError:
        known = ", ".join(_MODEL_FACTORIES)
        raise ValueError(f"unknown model {model!r}; expected one of: {known}") from None
    return factory()


def predict_data(data, target, model, results):
    """Fit *model* on every split of *data* and append the averaged metrics.

    Args:
        data: feature DataFrame (indexed positionally through ``.iloc``).
        target: labels aligned with *data*.
        model: short model name understood by ``choose_model``.
        results: DataFrame the new result row is appended to.

    Returns:
        A new DataFrame: *results* plus one row for this model.
    """
    estimator = choose_model(model)
    if model == "knn":
        # scale() returns a bare ndarray; rebuild the frame so the .iloc
        # indexing below keeps working.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)

    confusion_matrices, auc, fpr, tpr = [], [], [], []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        expected = target.iloc[test_index]
        prediction = estimator.predict(data.iloc[test_index])

        confusion_matrices.append(confusion_matrix(expected, prediction))
        auc.append(roc_auc_score(expected, prediction))
        fpr_item, tpr_item, _ = roc_curve(expected, prediction)
        fpr.append(fpr_item)
        tpr.append(tpr_item)

    # NOTE(review): averaging fpr/tpr element-wise assumes roc_curve returned
    # the same number of points for every split.
    return populate_results(
        df=results,
        model=model,
        fpr=mean(fpr, axis=0),
        tpr=mean(tpr, axis=0),
        auc=mean(auc),
        confusion_matrix=mean(confusion_matrices, axis=0),
    )


def _save_figure(fig, fig_title):
    """Save *fig* under the assets directory, naming the file after *fig_title*."""
    os.makedirs(_ASSETS_DIR, exist_ok=True)
    fig.savefig(f"{_ASSETS_DIR}/{fig_title.replace(' ', '_').lower()}.png")


def plot_roc_auc_curve(results):
    """Plot one ROC curve per row of *results* (indexed by model) and save it."""
    fig = figure(figsize=(8, 6))
    for model in results.index:
        row = results.loc[model]
        rounded_auc = round(row["auc"], 3)
        plot(row["fpr"], row["tpr"], label=f"{model} , AUC={rounded_auc}")
    xticks(arange(0.0, 1.0, step=0.1))
    yticks(arange(0.0, 1.0, step=0.1))
    legend(loc="lower right")
    fig_title = "ROC AUC curve"
    title(fig_title)
    xlabel("False positive rate")
    ylabel("True positive rate")
    _save_figure(fig, fig_title)


def plot_confusion_matrix(results):
    """Draw one annotated heatmap per model in *results*, save it, then show it."""
    set_style("white")
    matrices = results["confusion_matrix"]
    if matrices.empty:
        return  # nothing to draw; subplots() rejects zero columns

    # squeeze=False keeps `axes` two-dimensional even for a single model.
    fig, axes = subplots(nrows=1, ncols=len(matrices), figsize=(8, 6), squeeze=False)
    for ax, (model_name, matrix) in zip(axes.flat, matrices.items()):
        heatmap(
            ax=ax,
            data=matrix,
            cmap="Blues",
            square=True,
            annot=True,
            cbar=False,
        )
        ax.set_title(model_name)
    fig_title = "Confusion Matrix"
    fig.suptitle(fig_title)
    # Save before show(): the file is written while the figure is still open.
    _save_figure(fig, fig_title)
    show()


def plot_attributes_correlation(data, target):
    """Draw a count plot per column of *data*, split by "Severity", and save it."""
    transformed_data = transform_dataframe(data, target)
    attributes = list(data.columns)
    if not attributes:
        return  # nothing to draw; subplots() rejects zero rows

    fig, axes = subplots(nrows=len(attributes), ncols=1, figsize=(8, 6), squeeze=False)
    for ax, attribute in zip(axes.flat, attributes):
        countplot(ax=ax, x=attribute, data=transformed_data, hue="Severity")
        ax.set_title(attribute)
    fig_title = "Attribute's correlation"
    fig.suptitle(fig_title)
    _save_figure(fig, fig_title)
    show()


def plot_all_figures(results, data, target):
    """Apply the seaborn theme and produce every figure of the report."""
    set_theme()
    plot_roc_auc_curve(results=results)
    plot_confusion_matrix(results=results)
    plot_attributes_correlation(data=data, target=target)


def create_result_dataframes():
    """Return two independent empty result frames indexed by "model"."""
    columns = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
    indexed_results = DataFrame(columns=columns).set_index("model")
    return indexed_results, indexed_results.copy()


def _append_rows(base, rows):
    """Return *base* followed by *rows* with a fresh integer index."""
    if base.empty:
        # Skip the empty frame so concat does not have to infer its dtypes.
        return rows.reset_index(drop=True)
    return concat([base, rows], ignore_index=True)


def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
    """Return *df* with one extra row holding the metrics of *model*.

    *model* may be a short name or an estimator; it is stringified and passed
    through ``rename_model`` to obtain the label stored in the "model" column.
    """
    row = {
        "model": rename_model(model=f"{model}"),
        "fpr": fpr,
        "tpr": tpr,
        "auc": auc,
        "confusion_matrix": confusion_matrix,
    }
    # A list of dicts builds a single row whose array values stay whole cells.
    return _append_rows(df, DataFrame([row]))


def rename_model(model):
    """Map an estimator repr string (or a short name) to its short name.

    Raises:
        KeyError: if *model* is neither a short name nor a known repr.
    """
    short_name = ["gnb", "svc", "knn", "tree", "neuralnet"]
    if model in short_name:
        return model
    models = [
        "GaussianNB()",
        "LinearSVC(random_state=42)",
        "KNeighborsClassifier(n_neighbors=10)",
        "DecisionTreeClassifier(random_state=42)",
        "MLPClassifier(hidden_layer_sizes=10)",
    ]
    mapping = dict(zip(models, short_name))
    return mapping[model]


def transform_dataframe(data, target):
    """Join *target* onto *data* and bin the "Age" column into 15-year ranges."""
    binned_df = data.join(target)  # join() already returns a new frame
    binned_df["Age"] = cut(x=binned_df["Age"], bins=[15, 30, 45, 60, 75])
    return binned_df


def usage():
    """Print the command-line help and exit with a non-zero status."""
    print(f"Usage: {argv[0]} <fill|drop>")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    raise SystemExit(1)


def main():
    """Parse the data, evaluate every model and plot the collected results."""
    models = ["gnb", "svc", "knn", "tree", "neuralnet"]
    if len(argv) != 2:
        usage()
    data, target = parse_data(source="data/mamografia.csv", action=str(argv[1]))
    individual_result, complete_results = create_result_dataframes()
    for model in models:
        model_results = predict_data(
            data=data, target=target, model=model, results=individual_result
        )
        complete_results = _append_rows(complete_results, model_results)
    indexed_results = complete_results.set_index("model")
    plot_all_figures(results=indexed_results, data=data, target=target)


if __name__ == "__main__":
    main()