import numpy as np
# --- array attributes ---
array = np.array([[1, 2, 3], [2, 3, 4]])
print(array)
print('number of dim:', array.ndim)
print('shape:', array.shape)
print('size:', array.size)

# --- array creation ---
a = np.array([[1, 2, 3], [2, 3, 4]], dtype=np.int32)  # explicit element type
print(a.dtype)

a = np.zeros((3, 4))  # 3x4 array filled with 0.0
print(a)

a = np.ones((3, 4), dtype=np.int16)  # 3x4 array filled with 1
print(a)

a = np.empty((3, 4))  # uninitialized buffer: the printed values are arbitrary
print(a)

a = np.arange(10, 20, 2)  # 10, 12, ..., 18 (stop value is exclusive)
print(a)

a = np.arange(12).reshape((3, 4))  # 0..11 laid out as 3 rows x 4 columns
print(a)

a = np.linspace(1, 10, 5)  # 5 evenly spaced points, both endpoints included
print(a)
# --- element-wise arithmetic on 1-D arrays ---
a = np.array([10, 20, 30, 40])
b = np.arange(4)
print(a, b)

c = a + b
print(c)

d = a - b
print(d)

e = b ** 2  # element-wise square
print(e)

f = 10 * np.sin(a)  # np.sin is applied to every element
print(f)

print(b)
print(b < 3)  # comparison yields a boolean array
# --- element-wise product vs. matrix product ---
a = np.array([[1, 1], [0, 1]])
b = np.arange(4).reshape((2, 2))
print(a)
print(b)

c = a * b  # element-wise multiplication
print(c)

# Matrix multiplication, written as a function call and as a method call.
c_dot = np.dot(a, b)
c_dot_2 = a.dot(b)
print(c_dot)
print(c_dot_2)
# --- reductions over a random 2x4 array ---
a = np.random.random((2, 4))
print(a)

# Reduce over every element.
print(np.sum(a))
print(np.min(a))
print(np.max(a))

# Reduce along one axis only.
print(np.sum(a, axis=1))
print(np.min(a, axis=0))
print(np.max(a, axis=1))
# --- statistics and transformations on a 3x4 array holding 14 down to 3 ---
A = np.arange(14, 2, -1).reshape((3, 4))
print(A)

# Flat indices of the smallest and largest element.
print(np.argmin(A))
print(np.argmax(A))

# Three equivalent ways to get the mean of all elements.
print(np.mean(A))
print(A.mean())
print(np.average(A))
# print(A.average())

print(np.median(A))
print(np.cumsum(A))
print(np.diff(A))
print(np.nonzero(A))
print(np.sort(A))

# Transpose, as a function and as an attribute.
print(np.transpose(A))
print(A.T)
print((A.T).dot(A))

print(np.clip(A, 5, 9))  # values below 5 become 5, values above 9 become 9
print(np.mean(A, axis=0))

# index
# --- indexing, slicing and iteration on a 3x4 array holding 3..14 ---
A = np.arange(3, 15).reshape((3, 4))
print(A)

print(A[2])  # third row
print(A[1][1])  # chained indexing: row 1, then element 1
print(A[1, 1])  # same element via a single tuple index
print(A[2, :])  # third row, all columns
print(A[:, 1])  # second column, all rows
print(A[1, 1:3])  # row 1, columns 1 and 2

# Iterating an array walks its first axis, i.e. its rows.
for row in A:
    print(row)

# Iterate over columns by walking the rows of the transpose.
for col in A.T:
    print(col)

print(A.flatten())

# .flat iterates over every element one by one.
for item in A.flat:
    print(item)

# numpy array merge
# --- stacking and concatenating arrays ---
A = np.array([1, 1, 1])
B = np.array([2, 2, 2])
print(np.vstack((A, B)))  # vertical stack

# Add a trailing axis so each 1-D array becomes a 3x1 column.
A = A[:, np.newaxis]
B = B[:, np.newaxis]
print(np.hstack((A, B)))  # horizontal stack

# concatenate joins any number of arrays along the chosen axis.
C = np.concatenate((A, B, B, A), axis=0)
print(C)

D = np.concatenate((A, B, B, A), axis=1)
print(D)

# numpy array split
# --- splitting a 3x4 array ---
A = np.arange(12).reshape((3, 4))
print(A)

# Equal-sized split: the axis length must be divisible by the section count.
print(np.split(A, 2, axis=1))
print(np.split(A, 3, axis=0))

# array_split also accepts a count that does not divide the axis evenly.
print(np.array_split(A, 3, axis=1))

# Shorthands for splitting along rows and along columns.
print(np.vsplit(A, 3))
print(np.hsplit(A, 2))

# numpy array copy
# --- assignment shares the array, copy() duplicates it ---
a = np.arange(4)
print(a)

# Plain assignment only binds another name to the same array object.
b = a
c = a
d = b

a[0] = 11
print(a)
print(b)
print(c)
print(d)

e = a.copy()  # deep copy
print(e)

# The copy keeps its old value when the original changes afterwards.
a[3] = 44
print(a)
print(e)
# pandas
import pandas as pd
import csv
import numpy as np
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from datetime import datetime
def dateparse(dates):
    """Parse a compact ``YYYYMMDD`` string into a ``datetime``.

    Uses the standard-library ``datetime`` class; the ``pd.datetime`` alias
    is not available in current pandas releases.

    Args:
        dates: Date text such as ``'20150225'``.

    Returns:
        The corresponding ``datetime`` at midnight.

    Raises:
        ValueError: If ``dates`` does not match the ``%Y%m%d`` format.
    """
    return datetime.strptime(dates, '%Y%m%d')


# Alternative load that parses dates while reading (kept for reference):
# data = pd.read_csv('kc_train.csv', header=None, parse_dates=True, date_parser=dateparse, index_col=0)
# Read the CSV without a header row, so columns get integer labels.
data = pd.read_csv('kc_train.csv', header=None)
print(data.head(5))

df = pd.DataFrame(data)
print(df.describe())
print(df[0:3])
# print(df[0])

# select by label: loc
print(df.loc[:, [0, 1]])

# select by position: iloc
print(df.iloc[0:5, 0:3])

# boolean indexing
print(df[df[2] > 3])

# modify values
# One .loc call writes into df itself; the chained form df[2][mask] = 0
# may assign to a temporary copy and leave df unchanged.
df.loc[df[2] > 3, 2] = 0
print(df)

# add col
df[15] = 0
print(df)

# handle missing data
df.iloc[0, 1] = np.nan
print(df)
print(df.dropna(axis=0, how='any'))  # how={'any', 'all'}
print(df.fillna(value=0))

# check whether any value is missing
print(df.isnull().values.any())

# concatenating
# concat dataframe
df1 = df  # second name for the same DataFrame, not a copy
# Stack the frame on top of itself and renumber the rows 0..n-1.
res = pd.concat([df, df1], axis=0, ignore_index=True)
print(res)

# join, ['inner', 'outer']
# Two frames that share only the columns b, c and d.
df2 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'])
print(df2)
print(df3)

# 'inner' keeps only the columns present in both frames.
res = pd.concat([df2, df3], join='inner', ignore_index=True)
print(res)

# 'outer' keeps every column and fills the gaps with NaN.
res = pd.concat([df2, df3], join='outer', ignore_index=True, sort=False)
print(res)

# join_axes
# Frames with overlapping but different row labels.
df2 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])

# Place the frames side by side, then keep only the rows labelled in df2.
# This replaces the removed `join_axes=[df2.index]` argument of pd.concat.
res = pd.concat([df2, df3], axis=1).reindex(df2.index)
print(res)

# append
# Appending rows. pd.concat is used because DataFrame.append is no longer
# available in current pandas releases.
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df6 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])

# Append one frame.
res = pd.concat([df4, df5], ignore_index=True)
print(res)

# Append several frames at once.
res = pd.concat([df4, df5, df6], ignore_index=True)
print(res)

# Append a Series as a single new row: turn it into a one-row frame first
# so that its index labels line up with the column names.
df8 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df8, s1.to_frame().T], ignore_index=True)
print(res)

# merging two df by key/keys. (may be used in database)
# simple example
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Rows are paired wherever the 'key' values match.
res = pd.merge(left, right, on='key')
print(res)

# consider two keys
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# A row pair matches only when both key columns agree.
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # default for how='inner'
# how = ['left', 'right', 'outer', 'inner']
# The inner result above is replaced by the left join; only this one is printed.
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)

# indicator
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
print(df2)

# indicator=True adds a '_merge' column naming the source of each row.
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)

# give the indicator a custom name
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')

# merged by index
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
print(right)

# left_index and right_index
# Match rows on the index labels instead of on a key column.
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')

# handle overlapping
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})

# Both frames have an 'age' column; the suffixes keep the two apart.
res = pd.merge(boys, girls, on='k', suffixes=('_boy', '_girl'), how='inner')
print(res)

# DataFrame.join is similar to merge; once merge is understood, join is easy to follow.
# plot data
# Series
# Plot the running sum of 1000 standard-normal samples as a single line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# DataFrame
# Four columns of running sums, labelled A to D, plotted together.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
).cumsum()

# plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
data.plot()
plt.show()

# Draw two scatter series on the same axes by passing the first plot's
# axes object to the second call.
ax = data.plot.scatter(
    x='A',
    y='B',
    color='DarkBlue',
    label='Class 1',
)
data.plot.scatter(
    x='A',
    y='C',
    color='DarkGreen',
    label='Class 2',
    ax=ax,
)
plt.show()