"""Load a traffic-accident CSV and prepare it for analysis.

The pipeline handles missing values, keeps a fixed subset of columns and
normalizes the numerical count columns.
"""

from pandas import DataFrame, read_csv
from sklearn.preprocessing import normalize

# Columns kept by ``filter_dataframe``.
_RELEVANT_COLUMNS = (
    "HORA",
    "DIASEMANA",
    "COMUNIDAD_AUTONOMA",
    "ISLA",
    "TOT_HERIDOS_LEVES",
    "TOT_HERIDOS_GRAVES",
    "TOT_VEHICULOS_IMPLICADOS",
    "TOT_MUERTOS",
    "TIPO_VIA",
    "LUMINOSIDAD",
    "FACTORES_ATMOSFERICOS",
)

# Columns rescaled by ``normalize_numerical_values``.
_NUMERICAL_COLUMNS = (
    "TOT_HERIDOS_LEVES",
    "TOT_HERIDOS_GRAVES",
    "TOT_VEHICULOS_IMPLICADOS",
    "TOT_MUERTOS",
)


def replace_values(df: DataFrame) -> DataFrame:
    """Fill missing values of every numeric column with that column's mean.

    The frame is modified in place and also returned. Non-numeric columns
    are left untouched because a mean is not defined for them.

    Args:
        df: Frame whose missing values should be filled.

    Returns:
        The same ``df`` object, with numeric NA values replaced.
    """
    for column in df.select_dtypes(include="number").columns:
        # Assign the result back instead of ``fillna(inplace=True)`` on the
        # selected column: the chained in-place form operates on a temporary
        # and does not reach ``df`` when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(df[column].mean())
    return df


def process_na(df: DataFrame, action: str) -> DataFrame:
    """Handle missing values according to ``action``.

    Args:
        df: Frame to process.
        action: ``"drop"`` removes every row containing an NA value;
            ``"fill"`` replaces NA values with the column mean
            (see ``replace_values``).

    Returns:
        A new frame without NA rows for ``"drop"``, or ``df`` itself
        (modified in place) for ``"fill"``.

    Raises:
        SystemExit: If ``action`` is not one of the supported choices. The
            valid choices are printed first and the exit status is 1.
    """
    if action == "drop":
        return df.dropna()
    if action == "fill":
        return replace_values(df)

    print("Unknown action selected. The choices are: ")
    print("fill: fills the na values with the mean")
    print("drop: drops the na values")
    # Raise SystemExit directly: the ``exit`` builtin is injected by the
    # ``site`` module and is not guaranteed to exist, and an invalid action
    # must not terminate with a success status.
    raise SystemExit(1)


def filter_dataframe(df: DataFrame) -> DataFrame:
    """Return a frame containing only the relevant columns.

    Columns from the relevant set that are absent from ``df`` are skipped
    silently (``DataFrame.filter`` semantics).

    Args:
        df: Frame to take the columns from.

    Returns:
        A new frame restricted to the relevant columns.
    """
    return df.filter(items=list(_RELEVANT_COLUMNS))


def normalize_numerical_values(df: DataFrame) -> DataFrame:
    """Normalize the numerical count columns of ``df``.

    The columns are passed to ``sklearn.preprocessing.normalize`` with its
    default settings and the result replaces the original values. Columns
    that are absent from ``df`` are ignored. The frame is modified in place
    and also returned.

    Args:
        df: Frame whose numerical columns should be normalized; those
            columns must not contain NA values.

    Returns:
        The same ``df`` object with the normalized values written back.
    """
    present = [column for column in _NUMERICAL_COLUMNS if column in df.columns]
    if not present or df.empty:
        # Nothing to normalize; ``normalize`` rejects empty input.
        return df

    normalized_data = normalize(X=df[present])
    # Reuse ``df.index`` so each normalized row lands on the row it was
    # computed from. A default RangeIndex would be misaligned whenever the
    # index has gaps, e.g. after ``dropna()``.
    df[present] = DataFrame(data=normalized_data, index=df.index, columns=present)
    return df


def parse_data(source, action: str) -> DataFrame:
    """Read a CSV file and run the full preprocessing pipeline.

    Steps: read the CSV treating ``"?"`` as NA, handle NA values according
    to ``action``, keep only the relevant columns, and normalize the
    numerical columns.

    Args:
        source: Path or buffer accepted by ``pandas.read_csv``.
        action: NA strategy, ``"drop"`` or ``"fill"`` (see ``process_na``).

    Returns:
        The preprocessed frame.
    """
    df = read_csv(filepath_or_buffer=source, na_values="?")
    # NOTE(review): NA handling runs before column filtering, so "drop" also
    # removes rows whose only NA is in a column that is discarded afterwards.
    # Confirm this ordering is intended.
    processed_df = process_na(df=df, action=action)
    filtered_df = filter_dataframe(df=processed_df)
    return normalize_numerical_values(df=filtered_df)