Pandas 和 NumPy 使用文档整理
1. NumPy 核心功能
1.1 数组创建
import numpy as np
arr = np.array([1, 2, 3])
zeros = np.zeros((3, 4))
ones = np.ones((2, 3))
empty = np.empty((2, 2))
full = np.full((2, 2), 7)
arange = np.arange(10)
linspace = np.linspace(0, 1, 5)
random_arr = np.random.rand(3, 3)
identity = np.eye(3)
1.2 数组属性
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr.shape
arr.ndim
arr.size
arr.dtype
arr.itemsize
1.3 数组操作
arr = np.arange(10)
arr[2]
arr[2:5]
arr[2:5] = 9
arr2d = np.array([[1, 2, 3], [4, 5, 6]])
arr2d[0, 1]
arr2d[:, 1]
arr[arr > 5]
arr.reshape((2, 5))
arr.flatten()
arr.T
np.concatenate([arr1, arr2])
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))
np.split(arr, 3)
1.4 数学运算
arr1 + arr2
arr1 - arr2
arr1 * arr2
arr1 / arr2
arr1 @ arr2
np.dot(arr1, arr2)
np.sqrt(arr)
np.exp(arr)
np.sin(arr)
np.log(arr)
np.abs(arr)
np.round(arr, 2)
arr.sum()
arr.mean()
arr.std()
arr.var()
arr.min()
arr.max()
arr.argmin()
arr.argmax()
np.percentile(arr, 50)
2. Pandas 核心功能
2.1 数据结构
import pandas as pd
s = pd.Series([1, 3, 5, np.nan, 6, 8])
pd.Series([4,5,6,7,8])
0 4
1 5
2 6
3 7
4 8
dtype: int64
pd.Series([4,5,6,7,8],index=['a','b','c','d','e'])
a 4
b 5
c 6
d 7
e 8
dtype: int64
pd.Series({"a":1,"b":2})
pd.Series(111,index=['a','b','c'])
df = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20230101'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
pd.DataFrame({'one':[1,2,3,4],'two':[4,3,2,1]})
pd.DataFrame({
    'one':pd.Series([1,2,3],index=['a','b','c']),
    'two':pd.Series([1,2,3],index=['b','a','c'])
})
pd.DataFrame(np.array([[10,20],[30,40]]),index=['a','b'],columns=['c1','c2'])
pd.DataFrame([np.arange(1,8),np.arange(11,18)])
s1 = pd.Series(np.arange(1,9,2))
s2 = pd.Series(np.arange(2,10,2))
s3 = pd.Series(np.arange(5,7),index=[1,2])
pd.DataFrame({'c1':s1,'c2':s2,'c3':s3})
2.2 数据查看与选择
df.head()
df.tail(3)
df.index
df.columns
df.describe()
df.info()
df['A']
df[0:3]
df.loc[0]
df.loc[:, ['A', 'B']]
df.iloc[0]
df.iloc[:, 1:3]
df[df.A > 0]
df[df['E'].isin(['train'])]
2.3 数据操作
df.dropna()
df.fillna(0)
df.isna()
pd.concat([df1, df2])
pd.merge(df1, df2, on='key')
df1.join(df2)
df.groupby('E').sum()
df.groupby(['E', 'F']).mean()
df.sort_values(by='B')
df.sort_index(ascending=False)
df.apply(np.sum)
df.apply(lambda x: x.max() - x.min())
2.4 时间序列处理
dates = pd.date_range('20230101', periods=6)
ts = pd.Series(np.random.randn(6), index=dates)
ts.resample('M').mean()
ts.rolling(window=3).mean()
2.5 输入输出
pd.read_csv('file.csv')
pd.read_excel('file.xlsx')
pd.read_sql('SELECT * FROM table', con)
df.to_csv('output.csv')
df.to_excel('output.xlsx')
df.to_sql('table_name', con)
3. Pandas 和 NumPy 结合使用
arr = df.values
df = pd.DataFrame(arr)
df.apply(np.sqrt)
np.mean(df['A'])
result = np.random.randn(1000)
pd.Series(result).describe()
4. 性能优化技巧
# 不推荐:逐行循环,速度慢
for i in range(len(df)):
    df.iloc[i] = df.iloc[i] * 2
# 推荐:使用向量化运算代替循环
df = df * 2
df.eval('A + B')
df.query('A > 0 & B < 0')
df['category_col'] = df['category_col'].astype('category')
for chunk in pd.read_csv('large_file.csv', chunksize=10000):
    process(chunk)
5. 常用实用函数
5.1 Pandas 实用函数
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])
pd.crosstab(df['A'], df['B'])
pd.get_dummies(df['category_col'])
df['time_diff'] = df['end_time'] - df['start_time']
df['col'].value_counts()
df.drop_duplicates()
df.replace({1: 'one', 2: 'two'})
pd.cut(df['age'], bins=[0, 18, 35, 60, 100])
5.2 NumPy 实用函数
np.stack([arr1, arr2])
np.vstack([arr1, arr2])
np.hstack([arr1, arr2])
np.split(arr, 3)
np.vsplit(arr, 2)
np.hsplit(arr, 2)
arr1 = np.ones((3, 4))
arr2 = np.arange(4)
arr1 + arr2
arr[[1, 3, 5]]
arr[arr > 5]
np.polyfit(x, y, 2)
np.polyval([1, 2, 3], 5)