"""Helpers that load a train/test CSV pair and clean it for modelling."""

from typing import List

from pandas import DataFrame, read_csv
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import KFold


def construct_dataframes(train, test) -> List[DataFrame]:
    """Read the train and test CSV files into DataFrames.

    Args:
        train: Path or buffer of the training CSV, passed to ``read_csv``.
        test: Path or buffer of the test CSV, passed to ``read_csv``.

    Returns:
        A two-element list ``[train_df, test_df]``.
    """
    # Build a list directly: keying a dict by path would collapse the two
    # entries into one whenever both arguments name the same file.
    return [read_csv(filepath_or_buffer=source) for source in (train, test)]


def drop_null_values(df_list) -> List[DataFrame]:
    """Drop rows with missing values and the ``Tipo_marchas`` column.

    Every frame in ``df_list`` is modified in place.

    Args:
        df_list: Iterable of DataFrames, each with a ``Tipo_marchas`` column.

    Returns:
        The same ``df_list`` object that was passed in.

    Raises:
        KeyError: If a frame has no ``Tipo_marchas`` column.
    """
    for df in df_list:
        # Rows are filtered while "Tipo_marchas" is still present, so a
        # missing value in that column also removes the row.
        df.dropna(inplace=True)
        df.drop(columns="Tipo_marchas", inplace=True)
    return df_list


def trim_column_names(df_list) -> List[DataFrame]:
    """Convert the text columns ``Consumo``, ``Motor_CC``, ``Potencia`` to float.

    Despite its name, this function cleans column *values*: every character
    that is not a digit or a dot is removed, and the remainder is cast to
    float. Every frame in ``df_list`` is modified in place.

    Args:
        df_list: Iterable of DataFrames holding the three columns as strings.

    Returns:
        The same ``df_list`` object that was passed in.

    Raises:
        ValueError: If a value contains no parsable number after stripping.
    """
    columns = ["Consumo", "Motor_CC", "Potencia"]
    for df in df_list:
        for col in columns:
            # regex=True must be explicit: since pandas 2.0 the default is a
            # literal match, which would leave the text untouched and make
            # the float conversion fail.
            stripped = df[col].str.replace(pat=r"[^.0-9]", repl="", regex=True)
            df[col] = stripped.astype(float)
    return df_list


def encode_fields(df_list):
    """Placeholder for categorical encoding; currently does nothing.

    Args:
        df_list: Unused until the encoding step is implemented.

    Returns:
        None.
    """
    # Each name is its own list element; without the separating commas,
    # adjacent string literals are concatenated into a single string.
    # NOTE(review): "potencia" appears twice and "ao" looks truncated;
    # confirm the intended field names before implementing.
    fields = [
        "ao",
        "asientos",
        "ciudad",
        "combustible",
        "consumo",
        "descuento",
        "kilometros",
        "mano",
        "motor_cc",
        "nombre",
        "potencia",
        "potencia",
    ]
    for _field in fields:
        # TODO: per-field encoding is not implemented yet.
        pass


def split_k_sets(df):
    """Return shuffled K-fold index splits for ``df``.

    Uses ``KFold`` with its default number of folds, shuffling enabled and a
    fixed ``random_state`` of 42 so the splits are reproducible.

    Args:
        df: Data to split, passed to ``KFold.split``.

    Returns:
        The generator produced by ``KFold.split(df)``.
    """
    k_fold = KFold(shuffle=True, random_state=42)
    return k_fold.split(df)


def parse_data(train, test) -> List[DataFrame]:
    """Load the train/test CSV files and return them cleaned.

    Runs ``construct_dataframes``, ``drop_null_values`` and
    ``trim_column_names`` in that order.

    Args:
        train: Path or buffer of the training CSV.
        test: Path or buffer of the test CSV.

    Returns:
        A two-element list ``[train_df, test_df]`` of cleaned DataFrames.
    """
    df_list = construct_dataframes(train=train, test=test)
    processed_df_list = drop_null_values(df_list)
    numeric_df_list = trim_column_names(processed_df_list)
    return numeric_df_list