大型数据集如何先随机读取1%
# 使用skiprows参数,x > 0确保首行读入,np.random.rand() > 0.01
import numpy as np
import pandas as pd
# Assume bigdata.csv is far too large (~100 GB) to load in full, so sample it
# while reading instead of loading everything and sampling afterwards.
# pandas calls the skiprows callable with each row index and skips the row when
# it returns True: row 0 (the header) is always kept because `x > 0` is False,
# and every other row is skipped with probability ~0.99, leaving roughly 1%.
df = pd.read_csv(
    'bigdata.csv',
    skiprows=lambda x: x > 0 and np.random.rand() > 0.01,
)
# Inspect the sampled size; `shape` is an attribute (rows, columns), not a method.
df.shape
如何通过标准差找出异常值
# Small demo frame; NaN marks missing observations.
df = pd.DataFrame({
    'a': [1, 3, np.nan],
    'b': [4, np.nan, np.nan],
})
# Outliers are values outside the band mean ± 1.96 standard deviations.
# Series.mean() and Series.std() skip NaN by default.
mean = df['a'].mean()
std = df['a'].std()
toprange = mean + std * 1.96
botrange = mean - std * 1.96  # lower bound: subtract, not add
# Keep every row whose 'a' is not outside the band. Comparisons against NaN
# evaluate to False, so rows with a missing 'a' are never flagged and are kept.
is_outlier = (df['a'] > toprange) | (df['a'] < botrange)
cdf = df[~is_outlier]
cdf