Pandas-之数据合并与连接
数据合并与连接
目录
- concat 合并
- merge 连接
- join 连接
- append 追加
- 实际应用示例
concat 合并
concat 用于沿某个轴连接多个 DataFrame 或 Series。
基本用法
import pandas as pd

# Build two sample frames that share the same columns.
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
    'C': ['C0', 'C1', 'C2'],
})
df2 = pd.DataFrame({
    'A': ['A3', 'A4', 'A5'],
    'B': ['B3', 'B4', 'B5'],
    'C': ['C3', 'C4', 'C5'],
})

# Vertical concatenation (the default, axis=0).
# Each frame keeps its own index labels, so 0..2 appears twice.
result = pd.concat([df1, df2])
print(result)
# Output:
#     A   B   C
# 0  A0  B0  C0
# 1  A1  B1  C1
# 2  A2  B2  C2
# 0  A3  B3  C3
# 1  A4  B4  C4
# 2  A5  B5  C5

# Discard the original labels and build a fresh 0..n-1 index.
result = pd.concat([df1, df2], ignore_index=True)
print(result)

# Horizontal concatenation (axis=1): rows are aligned on the index and the
# resulting frame carries the column names A, B, C twice.
result = pd.concat([df1, df2], axis=1)
print(result)
处理索引
import pandas as pd

# Two frames with disjoint, explicitly chosen index labels.
df1 = pd.DataFrame({
    'A': ['A0', 'A1'],
    'B': ['B0', 'B1'],
}, index=[0, 1])
df2 = pd.DataFrame({
    'A': ['A2', 'A3'],
    'B': ['B2', 'B3'],
}, index=[2, 3])

# Keep the original index labels.
result = pd.concat([df1, df2])
print(result)

# Build a two-level index: `keys` labels which input each row came from.
result = pd.concat([df1, df2], keys=['x', 'y'])
print(result)
# Output:
#       A   B
# x 0  A0  B0
#   1  A1  B1
# y 2  A2  B2
#   3  A3  B3

# Select one group through the outer index level.
print(result.loc['x'])
处理不同的列
import pandas as pd

# The two frames only share columns B and C.
df1 = pd.DataFrame({
    'A': ['A0', 'A1'],
    'B': ['B0', 'B1'],
    'C': ['C0', 'C1'],
})
df2 = pd.DataFrame({
    'B': ['B2', 'B3'],
    'C': ['C2', 'C3'],
    'D': ['D2', 'D3'],
})

# Default join='outer': the union of columns is kept and cells that have no
# source value are filled with NaN.
result = pd.concat([df1, df2], ignore_index=True)
print(result)
# Output:
#      A   B   C    D
# 0   A0  B0  C0  NaN
# 1   A1  B1  C1  NaN
# 2  NaN  B2  C2   D2
# 3  NaN  B3  C3   D3

# join='inner': keep only the columns present in every input.
result = pd.concat([df1, df2], join='inner', ignore_index=True)
print(result)
多个 DataFrame 合并
import pandas as pd

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
df3 = pd.DataFrame({'A': [9, 10], 'B': [11, 12]})

# Concatenate any number of frames in a single call.
result = pd.concat([df1, df2, df3], ignore_index=True)
print(result)

# Collect the frames in a list first, then pass the list to concat
# (convenient when the frames are produced by a loop or a comprehension).
dfs = [df1, df2, df3]
result = pd.concat(dfs, ignore_index=True)
print(result)
merge 连接
merge 类似于 SQL 的 JOIN 操作,基于键合并两个 DataFrame。
基本连接类型
import pandas as pd

# Left table
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})

# Right table
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K4'],
    'C': ['C0', 'C1', 'C2', 'C4'],
    'D': ['D0', 'D1', 'D2', 'D4'],
})

# Inner join (the default): keep only keys present in both tables.
result = pd.merge(left, right, on='key')
print("内连接:")
print(result)
# Output:
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2

# Left join: keep every row of the left table.
result = pd.merge(left, right, on='key', how='left')
print("\n左连接:")
print(result)

# Right join: keep every row of the right table.
result = pd.merge(left, right, on='key', how='right')
print("\n右连接:")
print(result)

# Outer join: keep every row of both tables.
result = pd.merge(left, right, on='key', how='outer')
print("\n外连接:")
print(result)
不同列名
import pandas as pd

# The join key is named differently in each table.
left = pd.DataFrame({
    'key_left': ['K0', 'K1', 'K2'],
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
})
right = pd.DataFrame({
    'key_right': ['K0', 'K1', 'K3'],
    'C': ['C0', 'C1', 'C3'],
    'D': ['D0', 'D1', 'D3'],
})

# Name the key column of each side explicitly; both key columns are kept
# in the result.
result = pd.merge(left, right, left_on='key_left', right_on='key_right')
print(result)

# left_on / right_on combine with `how` like any other merge.
result = pd.merge(left, right, left_on='key_left', right_on='key_right', how='outer')
print("\n外连接:")
print(result)
多键连接
import pandas as pd

left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# Join on several keys: rows match only when every listed key is equal.
result = pd.merge(left, right, on=['key1', 'key2'])
print("多键内连接:")
print(result)

result = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print("\n多键外连接:")
print(result)
处理重复键
import pandas as pd

# 'K1' appears twice in BOTH tables.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K1', 'K2'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K1', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

# Because the key is duplicated on both sides this is a many-to-many join:
# every left 'K1' row pairs with every right 'K1' row (2 x 2 = 4 rows).
# Pass validate='one_to_one' / 'one_to_many' / 'many_to_one' to make pandas
# raise when the keys are not as unique as expected.
result = pd.merge(left, right, on='key')
print("一对多连接:")
print(result)
# Output:
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K1  A1  B1  C2  D2
# 3  K1  A2  B2  C1  D1
# 4  K1  A2  B2  C2  D2
suffixes 参数
import pandas as pd

# Both tables have a non-key column called 'value'.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'value': [1, 2, 3],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2'],
    'value': [4, 5, 6],
})

# Overlapping non-key column names are disambiguated with `suffixes`
# (first suffix for the left table, second for the right).
result = pd.merge(left, right, on='key', suffixes=('_left', '_right'))
print(result)
# Output:
#   key  value_left  value_right
# 0  K0           1            4
# 1  K1           2            5
# 2  K2           3            6
基于索引连接
import pandas as pd

# The join keys live in the index rather than in a column.
left = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
}, index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2'],
}, index=['K0', 'K2', 'K3'])

# Join index against index.
result = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print("基于索引的外连接:")
print(result)

# Mixed: the left side joins on its index, the right side on a column.
# reset_index() moves the unnamed index into a column called 'index'.
right_reset = right.reset_index()
result = pd.merge(left, right_reset, left_index=True, right_on='index', how='outer')
print("\n混合连接:")
print(result)
join 连接
join 是基于索引的连接方法,默认执行左连接(how='left'),可以看作 merge 在按索引连接场景下的简化写法。
import pandas as pd

left = pd.DataFrame({
    'A': ['A0', 'A1', 'A2'],
    'B': ['B0', 'B1', 'B2'],
}, index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
    'C': ['C0', 'C1', 'C2'],
    'D': ['D0', 'D1', 'D2'],
}, index=['K0', 'K2', 'K3'])

# Left join (the default for DataFrame.join).
result = left.join(right)
print("左连接:")
print(result)

# Outer join
result = left.join(right, how='outer')
print("\n外连接:")
print(result)

# Inner join
result = left.join(right, how='inner')
print("\n内连接:")
print(result)

# Right join
result = left.join(right, how='right')
print("\n右连接:")
print(result)

# Join several frames at once by passing a list; all are aligned on the index.
right2 = pd.DataFrame({
    'E': ['E0', 'E1', 'E2'],
}, index=['K0', 'K1', 'K3'])
result = left.join([right, right2])
print("\n连接多个 DataFrame:")
print(result)
append 追加
append 方法用于在 DataFrame 末尾追加数据。该方法自 pandas 1.4 起被弃用,并已在 pandas 2.0 中移除,请改用 concat。
import pandas as pd

df1 = pd.DataFrame({
    'A': ['A0', 'A1'],
    'B': ['B0', 'B1'],
})
df2 = pd.DataFrame({
    'A': ['A2', 'A3'],
    'B': ['B2', 'B3'],
})

# Legacy spelling, shown for reference only: DataFrame.append was deprecated
# in pandas 1.4 and removed in pandas 2.0, where this line raises AttributeError.
# result = df1.append(df2, ignore_index=True)

# Use concat instead.
result = pd.concat([df1, df2], ignore_index=True)
print(result)

# Append a single row: turn the Series into a one-row frame first
# (to_frame() yields one column, .T transposes it into one row).
new_row = pd.Series({'A': 'A4', 'B': 'B4'})
result = pd.concat([df1, new_row.to_frame().T], ignore_index=True)
print("\n追加单行:")
print(result)
实际应用示例
示例 1:员工和部门信息合并
import pandas as pd

# Employee table
employees = pd.DataFrame({
    '员工ID': ['E001', 'E002', 'E003', 'E004'],
    '姓名': ['张三', '李四', '王五', '赵六'],
    '部门ID': ['D001', 'D001', 'D002', 'D003'],
    '工资': [8000, 10000, 12000, 11000],
})

# Department table
departments = pd.DataFrame({
    '部门ID': ['D001', 'D002', 'D003', 'D004'],
    '部门名称': ['销售部', '技术部', '财务部', '人事部'],
    '部门经理': ['经理A', '经理B', '经理C', '经理D'],
})

# Attach department details to every employee (left join keeps all employees).
result = pd.merge(employees, departments, on='部门ID', how='left')
print("员工-部门信息:")
print(result)

# Head count and average salary per department.
# reset_index() turns the group key back into an ordinary column so that the
# merge key is a column on both sides.
per_dept = (
    employees.groupby('部门ID')
    .agg({'员工ID': 'count', '工资': 'mean'})
    .rename(columns={'员工ID': '人数', '工资': '平均工资'})
    .reset_index()
)
dept_stats = pd.merge(per_dept, departments, on='部门ID', how='left')
print("\n部门统计:")
print(dept_stats)
示例 2:销售数据合并
import pandas as pd
import numpy as np

np.random.seed(42)

# Sales order table
orders = pd.DataFrame({
    '订单ID': ['O001', 'O002', 'O003', 'O004', 'O005'],
    '客户ID': ['C001', 'C002', 'C001', 'C003', 'C002'],
    '产品ID': ['P001', 'P002', 'P001', 'P003', 'P002'],
    '数量': [10, 5, 8, 3, 7],
    '订单日期': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
})

# Customer table
customers = pd.DataFrame({
    '客户ID': ['C001', 'C002', 'C003', 'C004'],
    '客户名称': ['公司A', '公司B', '公司C', '公司D'],
    '城市': ['北京', '上海', '广州', '深圳'],
})

# Product table
products = pd.DataFrame({
    '产品ID': ['P001', 'P002', 'P003', 'P004'],
    '产品名称': ['产品A', '产品B', '产品C', '产品D'],
    '单价': [100, 200, 150, 180],
})

# Orders + customers
orders_customers = pd.merge(orders, customers, on='客户ID', how='left')
print("订单-客户信息:")
print(orders_customers)

# Orders + customers + products: extend the frame built above instead of
# repeating the first merge.
full_info = pd.merge(orders_customers, products, on='产品ID', how='left')

# Order amount = quantity * unit price
full_info['订单金额'] = full_info['数量'] * full_info['单价']
print("\n完整订单信息:")
print(full_info)

# Per-customer order count and total amount
customer_stats = full_info.groupby('客户名称').agg({
    '订单ID': 'count',
    '订单金额': 'sum',
}).rename(columns={'订单ID': '订单数', '订单金额': '总金额'})
print("\n客户统计:")
print(customer_stats)
示例 3:时间序列数据合并
import pandas as pd
import numpy as np

# Fix the seed so the randomly generated sample data is reproducible.
np.random.seed(42)

# Ten consecutive daily timestamps
dates = pd.date_range('2024-01-01', periods=10, freq='D')

# Price data (wide format: one column per stock)
prices = pd.DataFrame({
    '日期': dates,
    '股票A': np.random.uniform(10, 20, 10),
    '股票B': np.random.uniform(15, 25, 10),
})

# Volume data (same wide layout)
volumes = pd.DataFrame({
    '日期': dates,
    '股票A': np.random.randint(1000, 5000, 10),
    '股票B': np.random.randint(2000, 6000, 10),
})

# Reshape both tables to long format (one row per date and stock), then
# join them on the (date, stock) pair.
prices_long = prices.melt(id_vars='日期', var_name='股票', value_name='价格')
volumes_long = volumes.melt(id_vars='日期', var_name='股票', value_name='交易量')
combined = pd.merge(prices_long, volumes_long, on=['日期', '股票'])

print("合并后的数据:")
print(combined)

# Turnover = price * volume
combined['成交额'] = combined['价格'] * combined['交易量']
print("\n添加成交额:")
print(combined)
总结
数据合并与连接是数据分析的重要操作:
- concat:简单合并,沿轴连接
- merge:基于键连接,类似 SQL JOIN
- join:基于索引连接
- 连接类型:inner, left, right, outer
- 多键连接:可以基于多个列/索引连接
