Import code from previous assignment
This commit is contained in:
49
src/preprocessing.py
Normal file
49
src/preprocessing.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from pandas import read_csv
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.model_selection import KFold
|
||||
|
||||
|
||||
def replace_values(df):
    """Fill missing values in the numeric columns with the column mean.

    The columns "BI-RADS", "Margin", "Density" and "Age" are processed.
    The frame is updated in place and also returned.

    Args:
        df: DataFrame that contains the four columns listed above.

    Returns:
        The same DataFrame object, with NaNs in those columns replaced.
    """
    columns = ["BI-RADS", "Margin", "Density", "Age"]
    for column in columns:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df[column]: that form operates on an intermediate selection, which
        # emits a chained-assignment warning on pandas 2.x and leaves df
        # untouched once Copy-on-Write is enabled.
        df[column] = df[column].fillna(df[column].mean())
    return df
|
||||
|
||||
|
||||
def process_na(df, action):
    """Handle missing values in ``df`` according to ``action``.

    Args:
        df: DataFrame that may contain NaN values.
        action: "drop" to remove every row containing a NaN, or "fill" to
            delegate to ``replace_values``.

    Returns:
        The result of ``df.dropna()`` for "drop", or the result of
        ``replace_values(df)`` for "fill".

    Raises:
        SystemExit: with status 1, after printing the valid choices, when
            ``action`` is neither "drop" nor "fill".
    """
    if action == "drop":
        return df.dropna()
    if action == "fill":
        return replace_values(df)

    print("Unknown action selected. The choices are: ")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    # The bare exit() helper is injected by the site module for interactive
    # use and reports success (status 0); raise SystemExit directly with a
    # non-zero status so callers and shells can detect the failure.
    raise SystemExit(1)
|
||||
|
||||
|
||||
def encode_columns(df):
    """Return a copy of ``df`` with "Shape" and "Severity" label-encoded.

    Args:
        df: DataFrame containing "Shape" and "Severity" columns.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    encoder = LabelEncoder()
    result = df.copy()
    for name in ("Shape", "Severity"):
        # fit_transform refits the encoder on each call, so one instance can
        # be reused for both columns.
        result[name] = encoder.fit_transform(df[name])
    return result
|
||||
|
||||
|
||||
def split_train_target(df):
    """Separate the "Severity" column from the remaining columns.

    Args:
        df: DataFrame containing a "Severity" column.

    Returns:
        A ``(features, target)`` tuple: ``df`` without "Severity", and the
        "Severity" column itself.
    """
    target = df["Severity"]
    features = df.drop("Severity", axis="columns")
    return features, target
|
||||
|
||||
|
||||
def split_k_sets(df):
    """Produce shuffled K-fold train/test index splits for ``df``.

    The splitter uses KFold's default number of folds and a fixed
    ``random_state`` of 42.

    Args:
        df: The data to split.

    Returns:
        Whatever ``KFold.split`` returns; callers iterate it as
        ``(train_index, test_index)`` pairs.
    """
    splitter = KFold(random_state=42, shuffle=True)
    folds = splitter.split(df)
    return folds
|
||||
|
||||
|
||||
def parse_data(source, action):
    """Load the CSV at ``source`` and prepare it for modelling.

    Steps: read the file treating "?" as missing, handle missing values via
    ``process_na``, label-encode via ``encode_columns``, then split off the
    target column via ``split_train_target``.

    Args:
        source: Path or buffer passed to ``read_csv``.
        action: Missing-value strategy forwarded to ``process_na``.

    Returns:
        The ``(features, target)`` tuple produced by ``split_train_target``.
    """
    raw = read_csv(filepath_or_buffer=source, na_values="?")
    cleaned = process_na(df=raw, action=action)
    encoded = encode_columns(df=cleaned)
    features, target = split_train_target(df=encoded)
    return features, target
|
||||
89
src/processing.py
Normal file
89
src/processing.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from numpy import mean
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.naive_bayes import GaussianNB
|
||||
from sklearn.neural_network import MLPClassifier
|
||||
from sklearn.neighbors import KNeighborsClassifier
|
||||
from sklearn.preprocessing import scale
|
||||
from sklearn.svm import LinearSVC
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
from sys import argv
|
||||
|
||||
from preprocessing import parse_data, split_k_sets
|
||||
|
||||
|
||||
def choose_model(model):
    """Build the estimator identified by the short name ``model``.

    Args:
        model: One of "gnb", "svc", "knn", "tree" or "neuralnet".

    Returns:
        A new, unfitted estimator instance for the requested model.

    Raises:
        SystemExit: with status 1, after printing the valid choices, when
            ``model`` is not a recognised name.
    """
    if model == "gnb":
        return GaussianNB()
    if model == "svc":
        return LinearSVC(random_state=42)
    if model == "knn":
        return KNeighborsClassifier(n_neighbors=10)
    if model == "tree":
        return DecisionTreeClassifier(random_state=42)
    if model == "neuralnet":
        return MLPClassifier(hidden_layer_sizes=10)

    print("Unknown model selected. The choices are: ")
    print("gnb: Gaussian Naive Bayes")
    print("svc: Linear Support Vector Classification")
    print("knn: K-neighbors")
    print("tree: Decision tree")
    print("neuralnet: MLP Classifier")
    # The bare exit() helper is injected by the site module for interactive
    # use and reports success (status 0); raise SystemExit directly with a
    # non-zero status so the failure is visible to the caller.
    raise SystemExit(1)
|
||||
|
||||
|
||||
def predict_data(data, target, model):
    """Train and evaluate the chosen model, then print its metrics.

    The estimator is fitted on each fold from ``split_k_sets``; accuracy,
    confusion matrix and ROC AUC are collected per fold and averaged. A
    separate 10-fold ``cross_val_score`` is computed as well. All results
    are handed to ``evaluate_performance``.

    Args:
        data: Feature DataFrame (indexed positionally with ``.iloc``).
        target: Target Series aligned with ``data``.
        model: Short model name understood by ``choose_model``.
    """
    # Local import: DataFrame is only needed to rebuild the frame after
    # scaling.
    from pandas import DataFrame

    # Keep the estimator in its own variable. Rebinding `model` made the
    # `model == "knn"` check below compare an estimator with a string, so
    # the scaling step could never run.
    estimator = choose_model(model)
    if model == "knn":
        # scale() returns a plain ndarray; wrap it back into a DataFrame so
        # the .iloc indexing in the fold loop keeps working.
        data = DataFrame(scale(data), index=data.index, columns=data.columns)

    accuracy_scores = []
    confusion_matrices = []
    auc = []
    for train_index, test_index in split_k_sets(data):
        estimator.fit(data.iloc[train_index], target.iloc[train_index])
        prediction = estimator.predict(data.iloc[test_index])
        expected = target.iloc[test_index]
        accuracy_scores.append(accuracy_score(expected, prediction))
        confusion_matrices.append(confusion_matrix(expected, prediction))
        auc.append(roc_auc_score(expected, prediction))

    cv_score = cross_val_score(estimator, data, target, cv=10)
    evaluate_performance(
        # Element-wise average of the per-fold confusion matrices.
        confusion_matrix=mean(confusion_matrices, axis=0),
        accuracy=mean(accuracy_scores),
        cv_score=mean(cv_score),
        auc=mean(auc),
    )
|
||||
|
||||
|
||||
def evaluate_performance(confusion_matrix, accuracy, cv_score, auc):
    """Print the evaluation metrics to standard output.

    Args:
        confusion_matrix: Confusion matrix to display on its own line.
        accuracy: Accuracy score.
        cv_score: Cross-validation score.
        auc: Area-under-curve score.
    """
    report = [
        f"Accuracy Score: {accuracy!s}",
        "Confusion matrix: ",
        f"{confusion_matrix!s}",
        f"Cross validation score: {cv_score!s}",
        f"AUC: {auc!s}",
    ]
    print("\n".join(report))
|
||||
|
||||
|
||||
def usage():
    """Print command-line usage help and terminate the program.

    Raises:
        SystemExit: always, with status 1, since this is only shown when
            the program was invoked incorrectly.
    """
    # A space separates the program name from the first placeholder; without
    # it the line rendered as "prog.py<preprocessing action> <model>".
    print("Usage: " + argv[0] + " <preprocessing action> <model>")
    print("preprocessing actions:")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    print("models:")
    print("gnb: Gaussian Naive Bayes")
    print("svc: Linear Support Vector Classification")
    print("knn: K-neighbors")
    print("tree: Decision tree")
    print("neuralnet: MLP Classifier")
    # Raise SystemExit directly instead of relying on the site-provided
    # exit() helper, and use a non-zero status to signal the usage error.
    raise SystemExit(1)
|
||||
|
||||
|
||||
def main():
    """Script entry point: validate arguments, load the data, run a model.

    Expects exactly two command-line arguments: the preprocessing action
    and the model name. With any other argument count, ``usage`` is called.
    """
    if len(argv) != 3:
        usage()
    action = str(argv[1])
    model_name = str(argv[2])
    features, labels = parse_data(source="data/mamografia.csv", action=action)
    predict_data(data=features, target=labels, model=model_name)
|
||||
|
||||
|
||||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user