当前位置：首页 > news > 正文

【一起来学AI大模型】数据处理核心：NumPy/Pandas/Matplotlib 精要指南

news 2025/7/7 6:49:35

一、NumPy 科学计算核心

import numpy as np

# 1. Array creation and manipulation
arr = np.array([[1, 2, 3], [4, 5, 6]])  # 2-D array, shape (2, 3)
zeros = np.zeros((3, 4))  # 3x4 matrix of zeros
arange = np.arange(0, 10, 2)  # [0, 2, 4, 6, 8]

# 2. Vectorized arithmetic (results are discarded; shown for illustration)
arr * 2  # scalar multiplication
arr + np.array([1, 0, -1])  # broadcasting a length-3 row over both rows
arr.T @ arr  # matrix product: (3, 2) @ (2, 3) -> (3, 3)

# 3. Advanced indexing
bool_idx = arr > 3  # boolean mask of the entries greater than 3
arr[bool_idx] = 0  # conditional assignment; modifies arr in place
sliced = arr[:, 1:3]  # every row, columns 1 and 2

# 4. Common functions (evaluated on arr after the assignment above)
np.mean(arr, axis=0)  # per-column mean
np.std(arr)  # standard deviation over all elements
np.unique(arr)  # sorted distinct values

二、Pandas 数据分析核心

import pandas as pd

# 1. Data structures
series = pd.Series([1, 3, 5], index=['a', 'b', 'c'])
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': ['X', 'Y', 'Z'],
})

# 2. Loading and saving
# NOTE: this rebinds df to the contents of data.csv; the statements below
# assume that file provides at least the columns 'A' and 'B' -- TODO confirm.
df = pd.read_csv('data.csv', encoding='utf-8')
df.to_excel('output.xlsx', index=False)

# 3. Cleaning
# dropna/fillna return new frames, so the results must be assigned back;
# otherwise the cleaning steps have no effect on df.
df = df.dropna(subset=['A'])  # drop rows where 'A' is missing
# numeric_only=True keeps the mean from failing on string columns such as 'B'.
df = df.fillna(df.mean(numeric_only=True))  # fill remaining gaps with column means
duplicate_count = df.duplicated().sum()  # number of fully duplicated rows
df.drop_duplicates(inplace=True)

# 4. Transformation
df['C'] = df['A'].apply(lambda x: x*2)  # derived column
df['D'] = np.where(df['A']>2, 'High', 'Low')  # conditional column
pivot = df.pivot_table(values='A', index='B', aggfunc='mean')  # pivot table

# 5. Group-by aggregation
grouped = df.groupby('B')
summary = grouped.agg({
    'A': ['min', 'max', 'mean'],
    'C': 'sum',
})

三、Matplotlib 可视化核心

import matplotlib.pyplot as plt
import seaborn as sns

# 1. Basic charts on a 2x2 grid
grid_fig = plt.figure(figsize=(10, 6))

# Line chart
plt.subplot(2, 2, 1)
plt.plot(df['A'], marker='o', linestyle='--')

# Bar chart
plt.subplot(2, 2, 2)
plt.bar(df['B'], df['A'], color='skyblue')

# Scatter plot
plt.subplot(2, 2, 3)
plt.scatter(df['A'], df['C'], s=100, alpha=0.6)

# 2. Advanced visualization
# Box plot: placed in the fourth grid cell so it is not drawn on top of the
# scatter plot in cell 3.
plt.subplot(2, 2, 4)
sns.boxplot(x='B', y='A', data=df)

# Heat map: drawn on its own figure/axes instead of over the grid.
# numeric_only=True keeps corr() from failing on string columns such as 'B'.
corr = df.corr(numeric_only=True)
heat_fig, heat_ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=heat_ax)

# 3. Chart customization
# Re-select the grid figure so the calls below (and savefig) target it; they
# apply to its current axes, i.e. the box plot created last in that figure.
plt.figure(grid_fig.number)
plt.title('Sales Analysis', fontsize=16)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Revenue', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(['Actual', 'Forecast'])
plt.tight_layout()

# Save and display
plt.savefig('analysis.png', dpi=300, bbox_inches='tight')
plt.show()

四、综合实战案例：销售数据分析

# 1. Data preparation: 100 days of random sales for three products
dates = pd.date_range('2023-01-01', periods=100)
sales = np.random.randint(50, 200, size=100)  # unseeded, so values differ per run
products = np.random.choice(['A', 'B', 'C'], size=100)
df = pd.DataFrame({'Date': dates, 'Sales': sales, 'Product': products})

# 2. Feature derivation and aggregation
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.day_name()
monthly = df.groupby('Month')['Sales'].sum()

# 3. Visual analysis on a 2x2 grid
plt.figure(figsize=(12, 8))

# Daily sales trend
plt.subplot(2, 2, 1)
df.groupby('Date')['Sales'].sum().plot(title='Daily Sales Trend')

# Share of records per product
plt.subplot(2, 2, 2)
df['Product'].value_counts().plot.pie(autopct='%1.1f%%', startangle=90)

# Sales distribution per product
plt.subplot(2, 2, 3)
sns.boxplot(x='Product', y='Sales', data=df)

# Sales by day of week, in calendar order
plt.subplot(2, 2, 4)
sns.barplot(
    x='DayOfWeek',
    y='Sales',
    data=df,
    order=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],
)

# tight_layout must be its own statement; it was fused onto the barplot call.
plt.tight_layout()
plt.savefig('sales_report.png')

五、性能优化技巧

# 1. Prefer vectorized operations to Python-level loops
# Slow: builds the column element by element.
df['new'] = [x*2 for x in df['A']]
# Fast: a single vectorized expression.
df['new'] = df['A'] * 2

# 2. Avoid chained indexing
# Dangerous (kept as a counter-example): the assignment may land on a
# temporary object and leave df unchanged.
df[df['A'] > 2]['B'] = 'High'
# Recommended: select rows and column in one .loc call.
df.loc[df['A'] > 2, 'B'] = 'High'

# 3. Process large files in chunks
# `process` is a placeholder for a per-chunk function that is not defined in
# this snippet; pd.concat below assumes it returns a pandas object.
chunk_iter = pd.read_csv('big_data.csv', chunksize=10000)
results = []
for chunk in chunk_iter:
    result = process(chunk)
    results.append(result)
final = pd.concat(results)

# 4. Efficient dtype conversion
df['A'] = df['A'].astype('int32')  # smaller integer type to reduce memory use
df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

六、常用数据模式

# 1. Time-series handling
df.set_index('Date', inplace=True)
# numeric_only=True keeps the mean from failing on non-numeric columns.
# NOTE: recent pandas releases prefer the alias 'ME' for month-end; 'M' is
# kept here for compatibility with older versions.
monthly_mean = df.resample('M').mean(numeric_only=True)

# 2. Merging data from several sources
orders = pd.read_csv('orders.csv')
customers = pd.read_csv('customers.csv')
# Left join: every order is kept, with customer columns attached by customer_id.
merged = pd.merge(orders, customers, on='customer_id', how='left')

# 3. Outlier handling with the 1.5 * IQR rule
Q1 = df['Sales'].quantile(0.25)
Q3 = df['Sales'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
# Keep rows strictly inside the fences.
filtered = df[(df['Sales'] > lower_fence) & (df['Sales'] < upper_fence)]

# 4. Binning
# Assumes df has an 'Age' column -- it is not created anywhere in this snippet.
bins = [0, 18, 35, 60, 100]
labels = ['Child', 'Youth', 'Adult', 'Senior']
# include_lowest=True puts an age of exactly 0 into the first bin instead of NaN.
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)

七、学习路径建议

基础掌握：
- NumPy数组操作
- Pandas数据加载/清洗
- Matplotlib基础图表
进阶技能：
- 时间序列分析
- 多表合并操作
- Seaborn高级可视化
- 性能优化技巧
高级应用：
- 大规模数据处理（Dask）
- 交互式可视化（Plotly）
- 地理空间分析（Geopandas）
- 机器学习集成（Scikit-learn）