Working version:
def mode_of_pyspark_columns(sql_df, cat_col_list, verbose=False):
    col_with_mode = []
    for col in cat_col_list:
        # Filter out nulls (fix to the original code)
        df = sql_df.filter(sql_df[col].isNotNull())
        # Find each unique value and its count
        unique_classes = df.select(col).distinct().rdd.map(lambda x: x[0]).collect()
        unique_values_with_count = []
        for uc in unique_classes:
            unique_values_with_count.append([uc, df.filter(df[col] == uc).count()])
        # Sort unique values by their counts, descending
        sorted_unique_values_with_count = sorted(unique_values_with_count, key=lambda x: x[1], reverse=True)
        if verbose:
            print(col, sorted_unique_values_with_count, " and mode is ", sorted_unique_values_with_count[0][0])
        col_with_mode.append([col, sorted_unique_values_with_count[0][0]])
    return col_with_mode
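
A note on performance: the loop above launches one count() job per unique value, which gets slow when a column has many distinct values. A faster equivalent (a sketch, not from the original post; same inputs and output shape assumed) uses a single groupBy().count() per column:

from pyspark.sql.functions import desc

def mode_of_pyspark_columns_fast(sql_df, cat_col_list, verbose=False):
    # Sketch of an alternative: one aggregation job per column
    # instead of one count per distinct value.
    col_with_mode = []
    for col in cat_col_list:
        counts = (sql_df.filter(sql_df[col].isNotNull())
                        .groupBy(col)
                        .count()
                        .orderBy(desc("count")))
        mode_row = counts.first()  # row with the highest count
        if verbose:
            print(col, "mode is", mode_row[0])
        col_with_mode.append([col, mode_row[0]])
    return col_with_mode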
# Fill missing values with the column's mode
from pyspark.sql.functions import when, lit

def fill_missing_with_mode(df, cat_col_list):
    col_with_mode = mode_of_pyspark_columns(df, cat_col_list)
    for col, mode in col_with_mode:
        # Replace nulls in this column with its mode
        df = df.withColumn(col, when(df[col].isNull(), lit(mode)).otherwise(df[col]))
    return df
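
For example, here is a minimal usage sketch; the SparkSession setup and the toy data are illustrative assumptions, not from the original:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Toy DataFrame with nulls in two categorical columns (illustrative values)
data = [("red", "small"), ("red", None), (None, "large"), ("blue", "small")]
df = spark.createDataFrame(data, ["color", "size"])

filled = fill_missing_with_mode(df, ["color", "size"])
filled.show()
# The null color becomes "red" and the null size becomes "small",
# since those are the most frequent non-null values in each column.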