180 lines
5.8 KiB
Python
180 lines
5.8 KiB
Python
from sys import argv
|
|
|
|
from matplotlib.pyplot import *
|
|
from numpy import arange, mean
|
|
from pandas import DataFrame, cut
|
|
from seaborn import countplot, heatmap, set_style, set_theme
|
|
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve
|
|
from sklearn.naive_bayes import GaussianNB
|
|
from sklearn.neighbors import KNeighborsClassifier
|
|
from sklearn.neural_network import MLPClassifier
|
|
from sklearn.preprocessing import scale
|
|
from sklearn.svm import LinearSVC
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
|
|
from preprocessing import parse_data, split_k_sets
|
|
|
|
|
|
def choose_model(model):
    """Build the scikit-learn classifier registered under a short name.

    Args:
        model: One of "gnb", "svc", "knn", "tree" or "neuralnet".

    Returns:
        A new, unfitted estimator instance.

    Raises:
        ValueError: If ``model`` is not one of the supported short names.
    """
    # Factories are lambdas so that only the requested estimator is built.
    # The constructor arguments must stay as they are: rename_model() maps
    # the estimators' string form back to these short names.
    factories = {
        "gnb": lambda: GaussianNB(),
        "svc": lambda: LinearSVC(random_state=42),
        "knn": lambda: KNeighborsClassifier(n_neighbors=10),
        "tree": lambda: DecisionTreeClassifier(random_state=42),
        "neuralnet": lambda: MLPClassifier(hidden_layer_sizes=10),
    }
    if model not in factories:
        # Fail here instead of returning None and crashing later on .fit().
        supported = ", ".join(factories)
        raise ValueError(f"Unknown model {model!r}; expected one of: {supported}")
    return factories[model]()
|
|
|
|
|
|
def predict_data(data, target, model, results):
    """Cross-validate one classifier and record its fold-averaged metrics.

    Args:
        data: Feature DataFrame; rows are selected positionally with ``.iloc``.
        target: Labels aligned with ``data`` (also indexed with ``.iloc``).
        model: Short model name understood by ``choose_model``.
        results: DataFrame handed to ``populate_results`` to receive the row.

    Returns:
        Whatever ``populate_results`` returns: ``results`` plus one row holding
        the mean FPR, TPR, AUC and confusion matrix over all folds.
    """
    estimator = choose_model(model)
    # The check must use the short name: comparing the estimator object to
    # "knn" is never true, which silently disabled scaling.
    if model == "knn":
        # scale() returns a bare ndarray; rebuild the DataFrame so the
        # ``.iloc`` fold indexing below keeps working.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)

    confusion_matrices, auc, fpr, tpr = [], [], [], []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        expected = target.iloc[test_index]
        prediction = estimator.predict(data.iloc[test_index])

        confusion_matrices.append(confusion_matrix(expected, prediction))
        auc.append(roc_auc_score(expected, prediction))
        fpr_item, tpr_item, _ = roc_curve(expected, prediction)
        fpr.append(fpr_item)
        tpr.append(tpr_item)

    # NOTE(review): averaging along axis 0 assumes every fold produced
    # fpr/tpr arrays of the same length -- confirm for degenerate folds.
    return populate_results(
        df=results,
        # populate_results derives the short name from the estimator's
        # string form, so the estimator itself is passed here.
        model=estimator,
        fpr=mean(fpr, axis=0),
        tpr=mean(tpr, axis=0),
        auc=mean(auc),
        confusion_matrix=mean(confusion_matrices, axis=0),
    )
|
|
|
|
|
|
def plot_roc_auc_curve(results):
    """Plot one ROC curve per model and save the figure as a PNG.

    Args:
        results: DataFrame indexed by model name whose rows provide the
            ``fpr``, ``tpr`` and ``auc`` entries that are plotted.

    The image is written to ``docs/assets/roc_auc_curve.png``.
    """
    fig = figure(figsize=(8, 6))
    for model in results.index:
        row = results.loc[model]
        rounded_auc = round(row["auc"], 3)
        plot(row["fpr"], row["tpr"], label=f"{model} , AUC={rounded_auc}")

    # arange() excludes its stop value, so arange(0.0, 1.0, 0.1) dropped the
    # 1.0 tick; building from integers yields exactly 0.0, 0.1, ..., 1.0.
    ticks = arange(11) / 10
    xticks(ticks)
    yticks(ticks)
    legend(loc="lower right")

    fig_title = "ROC AUC curve"
    title(fig_title)
    xlabel("False positive rate")
    ylabel("True positive rate")
    fig.savefig(f"docs/assets/{fig_title.replace(' ', '_').lower()}.png")
|
|
|
|
|
|
def plot_confusion_matrix(results):
    """Draw every model's confusion matrix as a heatmap, show it, then save it.

    Args:
        results: DataFrame indexed by model name with a ``confusion_matrix``
            column; the first five rows are drawn side by side.

    The image is written to ``docs/assets/confusion_matrix.png``.
    """
    set_style("white")
    matrices = results.filter(items=["model", "confusion_matrix"])
    fig, axes = subplots(nrows=1, ncols=5, figsize=(8, 6))

    # One subplot per row of ``matrices``, titled with that row's index label.
    for position, axis in enumerate(axes):
        heatmap(
            data=matrices.iloc[position]["confusion_matrix"],
            ax=axis,
            annot=True,
            square=True,
            cbar=False,
            cmap="Blues",
        )
        axis.set_title(matrices.index[position])

    fig_title = "Confusion Matrix"
    output_path = f"docs/assets/{fig_title.replace(' ', '_').lower()}.png"
    suptitle(fig_title)
    show()
    fig.savefig(output_path)
|
|
|
|
|
|
def plot_attributes_correlation(data, target):
    """Show and save count plots of the attributes, split by ``Severity``.

    Args:
        data: Feature DataFrame, forwarded to ``transform_dataframe``.
        target: Labels, forwarded to ``transform_dataframe``.

    The first five columns of the transformed frame each get their own
    subplot. The image is written to ``docs/assets/attribute's_correlation.png``.
    """
    binned = transform_dataframe(data, target)
    fig, axes = subplots(nrows=5, ncols=1, figsize=(8, 6))

    for position, axis in enumerate(axes):
        column = binned.columns[position]
        countplot(data=binned, x=column, hue="Severity", ax=axis)
        axis.set_title(column)

    fig_title = "Attribute's correlation"
    output_path = f"docs/assets/{fig_title.replace(' ', '_').lower()}.png"
    suptitle(fig_title)
    show()
    fig.savefig(output_path)
|
|
|
|
|
|
def plot_all_figures(results, data, target):
    """Apply the seaborn default theme and render every figure in order.

    Args:
        results: Per-model metrics, used by the ROC and confusion-matrix plots.
        data: Feature DataFrame for the attribute plot.
        target: Labels for the attribute plot.
    """
    set_theme()
    # Order matters: the confusion-matrix plot switches the seaborn style.
    plot_roc_auc_curve(results)
    plot_confusion_matrix(results)
    plot_attributes_correlation(data, target)
|
|
|
|
|
|
def create_result_dataframes():
    """Create the empty results table, indexed by ``model``.

    Returns:
        A 2-tuple whose two items are the *same* empty DataFrame object, with
        columns ``fpr``, ``tpr``, ``auc`` and ``confusion_matrix``.
    """
    column_names = ["model", "fpr", "tpr", "auc", "confusion_matrix"]
    empty_frame = DataFrame(columns=column_names).set_index("model")
    return empty_frame, empty_frame
|
|
|
|
|
|
def populate_results(df, model, fpr, tpr, auc, confusion_matrix):
    """Return ``df`` extended by one row describing a model's metrics.

    Args:
        df: Existing results DataFrame; it is not modified.
        model: Estimator (or its string form); converted with ``f"{model}"``
            and mapped to a short name by ``rename_model``.
        fpr: False-positive-rate values to store in the row.
        tpr: True-positive-rate values to store in the row.
        auc: Area-under-curve score to store in the row.
        confusion_matrix: Confusion matrix to store in the row.

    Returns:
        A new DataFrame with a fresh integer index: the rows of ``df``
        followed by the new row.
    """
    # Local import keeps this fix self-contained: DataFrame.append() was
    # removed in pandas 2.0 and concat() is its documented replacement.
    from pandas import concat

    row = {
        "model": rename_model(model=f"{model}"),
        "fpr": fpr,
        "tpr": tpr,
        "auc": auc,
        "confusion_matrix": confusion_matrix,
    }
    # A one-element list of dicts makes a single-row frame; array values
    # are stored as whole cells instead of being expanded into rows.
    return concat([df, DataFrame([row])], ignore_index=True)
|
|
|
|
|
|
def rename_model(model):
    """Translate an estimator's string form into its short model name.

    Args:
        model: String such as ``"GaussianNB()"``; it must match one of the
            keys below exactly.

    Returns:
        The matching short name ("gnb", "svc", "knn", "tree", "neuralnet").

    Raises:
        KeyError: If ``model`` is not one of the known strings.
    """
    short_name_by_repr = {
        "GaussianNB()": "gnb",
        "LinearSVC(random_state=42)": "svc",
        "KNeighborsClassifier(n_neighbors=10)": "knn",
        "DecisionTreeClassifier(random_state=42)": "tree",
        "MLPClassifier(hidden_layer_sizes=10)": "neuralnet",
    }
    return short_name_by_repr[model]
|
|
|
|
|
|
def transform_dataframe(data, target):
    """Join features with the target and bucket ``Age`` into intervals.

    Args:
        data: Feature DataFrame containing an ``Age`` column.
        target: Labels joined onto ``data`` by index.

    Returns:
        A new DataFrame in which ``Age`` holds the intervals (15, 30],
        (30, 45], (45, 60] and (60, 75]; ages outside those bins become NaN.
        The inputs are left untouched.
    """
    combined = data.join(target)
    age_groups = cut(x=combined["Age"], bins=[15, 30, 45, 60, 75])
    # assign() returns a copy with the column replaced in its old position.
    return combined.assign(Age=age_groups)
|
|
|
|
|
|
def usage():
    """Print the command-line help to stdout and terminate the program.

    Raises:
        SystemExit: Always, once the help text has been printed.
    """
    # The original concatenation glued the program name straight onto
    # "<preprocessing action>"; the separating space was missing.
    print(f"Usage: {argv[0]} <preprocessing action>")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    # exit() is injected by the ``site`` module for interactive use and may
    # be absent; raising SystemExit directly ends the program the same way.
    raise SystemExit
|
|
|
|
|
|
def main():
    """Run the pipeline: parse the data, evaluate every model, plot results.

    Expects exactly one command-line argument, the preprocessing action that
    is passed to ``parse_data``; otherwise the usage text is printed and the
    program exits.
    """
    # Local import keeps this fix self-contained: DataFrame.append() was
    # removed in pandas 2.0 and concat() is its documented replacement.
    from pandas import concat

    models = ["gnb", "svc", "knn", "tree", "neuralnet"]
    if len(argv) != 2:
        usage()

    data, target = parse_data(source="data/mamografia.csv", action=argv[1])
    individual_result, _ = create_result_dataframes()

    # Every model is evaluated against the same empty frame, so each call
    # yields a one-row result; the rows are then stacked in model order.
    per_model_results = [
        predict_data(data=data, target=target, model=model, results=individual_result)
        for model in models
    ]
    complete_results = concat(per_model_results)

    indexed_results = complete_results.set_index("model")
    plot_all_figures(results=indexed_results, data=data, target=target)
|
|
|
|
|
|
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|