# IN-P1/src/processing.py
#
# 90 lines
# 2.9 KiB
# Python

from numpy import mean
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sys import argv
from preprocessing import parse_data, split_k_sets
def choose_model(model):
    """Return a new, unfitted classifier for the given short model name.

    Recognised names are ``"gnb"``, ``"svc"``, ``"knn"``, ``"tree"`` and
    ``"neuralnet"``. Any other value prints the list of valid choices and
    terminates the program through ``exit()``.
    """
    # Each entry pairs a name with a zero-argument factory; wrapping the
    # constructors in lambdas means only the requested estimator is built.
    factories = (
        ("gnb", lambda: GaussianNB()),
        ("svc", lambda: LinearSVC(random_state=42)),
        ("knn", lambda: KNeighborsClassifier(n_neighbors=10)),
        ("tree", lambda: DecisionTreeClassifier(random_state=42)),
        ("neuralnet", lambda: MLPClassifier(hidden_layer_sizes=10)),
    )
    for name, build in factories:
        if model == name:
            return build()

    # No name matched: show the valid choices and stop.
    help_lines = (
        "Unknown model selected. The choices are: ",
        "gnb: Gaussian Naive Bayes",
        "svc: Linear Support Vector Classification",
        "knn: K-neighbors",
        "tree: Decision tree",
        "neuralnet: MLP Classifier",
    )
    for line in help_lines:
        print(line)
    exit()
def predict_data(data, target, model):
    """Train and evaluate the selected classifier, then print its metrics.

    The estimator is fitted and scored on every train/test split yielded by
    ``split_k_sets``; the per-split accuracy, confusion matrix and ROC AUC
    are averaged. A separate 10-fold ``cross_val_score`` is computed on the
    whole data set. All four results go to ``evaluate_performance``.

    Args:
        data: Feature table. It is indexed positionally with ``.iloc``, so a
            pandas DataFrame is expected.
        target: Labels aligned with ``data``; also indexed with ``.iloc``.
        model: Short model name understood by ``choose_model``
            (for example ``"knn"``).
    """
    # Keep the estimator in its own variable: the name in ``model`` is still
    # needed below. Rebinding ``model`` to the estimator made the ``"knn"``
    # comparison always false, so the scaling step never ran.
    estimator = choose_model(model)

    if model == "knn":
        # Local import: pandas is only needed to re-wrap the scaled values.
        import pandas as pd

        # ``scale`` returns a plain NumPy array; rebuild a DataFrame with the
        # original labels so the ``.iloc`` indexing below keeps working.
        data = pd.DataFrame(scale(data), index=data.index, columns=data.columns)

    accuracy_scores = []
    confusion_matrices = []
    auc_scores = []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        expected = target.iloc[test_index]
        prediction = estimator.predict(data.iloc[test_index])

        accuracy_scores.append(accuracy_score(expected, prediction))
        confusion_matrices.append(confusion_matrix(expected, prediction))
        auc_scores.append(roc_auc_score(expected, prediction))

    cv_score = cross_val_score(estimator, data, target, cv=10)

    evaluate_performance(
        # axis=0 averages the matrices cell by cell across the splits.
        confusion_matrix=mean(confusion_matrices, axis=0),
        accuracy=mean(accuracy_scores),
        cv_score=mean(cv_score),
        auc=mean(auc_scores),
    )
def evaluate_performance(confusion_matrix, accuracy, cv_score, auc):
    """Print the evaluation metrics to standard output.

    Args:
        confusion_matrix: Confusion matrix to display; printed on its own
            line below the heading. Inside this function the parameter
            shadows the imported ``confusion_matrix`` function.
        accuracy: Accuracy score.
        cv_score: Cross-validation score.
        auc: Area-under-the-ROC-curve score.
    """
    report = (
        ("Accuracy Score: ", accuracy),
        # The heading ends with a newline so the matrix starts on a new line.
        ("Confusion matrix: \n", confusion_matrix),
        ("Cross validation score: ", cv_score),
        ("AUC: ", auc),
    )
    for label, value in report:
        # print() applies str() to each argument; sep="" joins them directly.
        print(label, value, sep="")
def usage():
    """Print command-line usage help and terminate the program.

    Lists the accepted preprocessing actions and model names, then exits
    through ``exit()``.
    """
    # A space now separates the program name from the argument placeholders;
    # previously they were concatenated with nothing in between.
    print("Usage: " + argv[0] + " <preprocessing action> <model>")

    help_lines = (
        "preprocessing actions:",
        "fill: fills the na values with the mean",
        "drop: drops the na values",
        "models:",
        "gnb: Gaussian Naive Bayes",
        "svc: Linear Support Vector Classification",
        "knn: K-neighbors",
        "tree: Decision tree",
        "neuralnet: MLP Classifier",
    )
    for line in help_lines:
        print(line)
    exit()
def main():
    """Command-line entry point: load the data set and evaluate one model.

    Expects exactly two arguments after the script name: the preprocessing
    action and the model name. Any other argument count shows the usage
    text, which exits the program.
    """
    expected_argc = 3  # script name + preprocessing action + model name
    if len(argv) != expected_argc:
        usage()

    csv_path = "data/mamografia.csv"
    features, labels = parse_data(source=csv_path, action=str(argv[1]))

    model_name = str(argv[2])
    predict_data(data=features, target=labels, model=model_name)


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()