当前位置：首页 > news >正文

Pandas 之 Series 数据结构详解

news 2025/11/2 7:05:36

Series 数据结构详解

什么是 Series

Series 是 Pandas 中带标签的一维数组，可以包含任何数据类型。Series 类似于 Python 中的列表或字典，但提供了更多的功能。

Series 有两个主要组成部分：

数据（values）：实际存储的数据
索引（index）：数据对应的标签

创建 Series

1. 从列表创建

import pandas as pd

# Default index (starts at 0)
s1 = pd.Series([1, 3, 5, 7, 9])
print(s1)
# Output:
# 0    1
# 1    3
# 2    5
# 3    7
# 4    9
# dtype: int64

# Custom index labels
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s2)
# Output:
# a    10
# b    20
# c    30
# d    40
# dtype: int64

2. 从字典创建

import pandas as pd

# Dictionary keys automatically become the index
data = {'北京': 2154, '上海': 2487, '广州': 1881, '深圳': 1756}
s3 = pd.Series(data)
print(s3)
# Output:
# 北京    2154
# 上海    2487
# 广州    1881
# 深圳    1756
# dtype: int64

# Explicit index order; labels missing from the dict get NaN, which also
# forces the values to be stored as float64.
s4 = pd.Series(data, index=['北京', '上海', '广州', '深圳', '杭州'])
print(s4)
# Output:
# 北京    2154.0
# 上海    2487.0
# 广州    1881.0
# 深圳    1756.0
# 杭州       NaN
# dtype: float64

3. 从标量值创建

import pandas as pd

# A scalar is repeated for every label in the index
s5 = pd.Series(100, index=['a', 'b', 'c', 'd'])
print(s5)
# Output:
# a    100
# b    100
# c    100
# d    100
# dtype: int64

4. 从 NumPy 数组创建

import pandas as pd
import numpy as np

# Build a Series from a NumPy array
arr = np.array([1, 2, 3, 4, 5])
s6 = pd.Series(arr)
print(s6)
# Output:
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# dtype: int64   (int32 on Windows with NumPy < 2.0)

# Same data with explicit index labels
s7 = pd.Series(arr, index=['x', 'y', 'z', 'w', 'v'])
print(s7)

Series 的基本属性

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# Index labels
print(s.index)
# Output: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
# (pandas 2.x; versions using the new default string dtype show dtype='str')

# Underlying values
print(s.values)
# Output: [1 2 3 4 5]

# Data type
print(s.dtype)
# Output: int64

# Number of elements
print(s.size)
# Output: 5

# Shape
print(s.shape)
# Output: (5,)

# Whether the Series is empty
print(s.empty)
# Output: False

# Number of dimensions
print(s.ndim)
# Output: 1

# Name (assigned here, then read back)
s.name = '我的数据'
print(s.name)
# Output: 我的数据

Series 的索引

1. 位置索引（类似列表）

import pandas as pd

s = pd.Series([10, 20, 30, 40, 50])

# Access by position (0-based). Use .iloc: with an integer index, plain
# s[...] is a *label* lookup, so s[-1] would raise KeyError.
print(s.iloc[0])      # Output: 10
print(s.iloc[2])      # Output: 30
print(s.iloc[-1])     # Output: 50 (last element)

# Slicing
print(s.iloc[1:4])    # elements at positions 1 through 3
# Output:
# 1    20
# 2    30
# 3    40
# dtype: int64

2. 标签索引

import pandas as pd

s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

# Access by a single label
print(s['a'])      # Output: 10
print(s['c'])      # Output: 30

# Access by a list of labels
print(s[['a', 'c', 'e']])
# Output:
# a    10
# c    30
# e    50
# dtype: int64

# Label slicing (the end label is included)
print(s['a':'c'])
# Output:
# a    10
# b    20
# c    30
# dtype: int64

3. 使用 loc 和 iloc

import pandas as pd

s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

# loc: label-based indexing
print(s.loc['a'])           # Output: 10
print(s.loc['a':'c'])       # end label is included

# iloc: position-based indexing
print(s.iloc[0])            # Output: 10
print(s.iloc[1:3])          # end position is excluded (like list slicing)

# Boolean indexing
print(s.loc[s > 30])
# Output:
# d    40
# e    50
# dtype: int64

Series 的基本操作

1. 添加元素

import pandas as pd

s = pd.Series([1, 2, 3])

# Method 1: assign to a label (a label that does not exist yet is added)
s['new'] = 4
print(s)

# Method 2: pd.concat. Series.append was deprecated and then removed in
# pandas 2.0, so concat is the supported way to join Series.
# Start again from a fresh Series; concatenating onto the result of
# method 1 would leave two entries labelled 'new'.
s = pd.Series([1, 2, 3])
s_new = pd.Series([4], index=['new'])
s = pd.concat([s, s_new])
print(s)

2. 删除元素

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# drop returns a new Series and leaves the original untouched
s1 = s.drop('a')
print(s1)

# Modify the original Series in place
s.drop('a', inplace=True)
print(s)

# Drop several labels at once
s.drop(['b', 'c'], inplace=True)
print(s)

3. 修改元素

import pandas as pd

s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])

# Modify by label
s['a'] = 10
print(s)

# Modify by position
s.iloc[1] = 20
print(s)

# Modify several labels at once
s[['c', 'd']] = 30
print(s)

Series 的数据类型

import pandas as pd

# Integer
s1 = pd.Series([1, 2, 3])
print(s1.dtype)  # int64

# Float
s2 = pd.Series([1.1, 2.2, 3.3])
print(s2.dtype)  # float64

# String
s3 = pd.Series(['a', 'b', 'c'])
print(s3.dtype)  # object (pandas 2.x; newer default string dtype prints str)

# Boolean
s4 = pd.Series([True, False, True])
print(s4.dtype)  # bool

# Datetime
s5 = pd.Series(pd.date_range('2024-01-01', periods=3))
print(s5.dtype)  # datetime64[ns]

# Converting the data type
s = pd.Series([1, 2, 3, 4])
print(s.astype(float))
# Output:
# 0    1.0
# 1    2.0
# 2    3.0
# 3    4.0
# dtype: float64

Series 的常用方法

1. 统计方法

import pandas as pd
import numpy as np

s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Basic statistics
print(f"总和: {s.sum()}")              # 55
print(f"平均值: {s.mean()}")           # 5.5
print(f"中位数: {s.median()}")         # 5.5
print(f"最大值: {s.max()}")            # 10
print(f"最小值: {s.min()}")            # 1
print(f"标准差: {s.std()}")            # 3.027...
print(f"方差: {s.var()}")              # 9.166...

# Quantiles
print(f"25%分位数: {s.quantile(0.25)}")  # 3.25
print(f"75%分位数: {s.quantile(0.75)}")  # 7.75

# Descriptive statistics (all summary figures in one call)
print(s.describe())
# Output:
# count    10.000000
# mean      5.500000
# std       3.027650
# min       1.000000
# 25%       3.250000
# 50%       5.500000
# 75%       7.750000
# max      10.000000
# dtype: float64

2. 查找方法

import pandas as pd

s = pd.Series([10, 20, 30, 40, 50], index=['a', 'b', 'c', 'd', 'e'])

# Index labels of the first and last non-missing values
print(s.first_valid_index())  # a
print(s.last_valid_index())   # e

# Index labels of the maximum and minimum values
print(s.idxmax())  # e
print(s.idxmin())  # a

# Membership tests: `in s` alone checks index labels, so test the values
# through s.values and the labels through s.index.
print(30 in s.values)  # True
print('c' in s.index)  # True

# Index labels whose value matches
print(s[s == 30].index.tolist())  # ['c']

3. 排序方法

import pandas as pd

s = pd.Series([30, 10, 50, 20, 40], index=['e', 'a', 'c', 'b', 'd'])

# Sort by value (ascending)
s1 = s.sort_values()
print(s1)
# Output:
# a    10
# b    20
# e    30
# d    40
# c    50
# dtype: int64

# Sort by value (descending)
s2 = s.sort_values(ascending=False)
print(s2)

# Sort by index label
s3 = s.sort_index()
print(s3)
# Output:
# a    10
# b    20
# c    50
# d    40
# e    30
# dtype: int64

Series 的数学运算

1. 基本算术运算

import pandas as pd

s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([10, 20, 30, 40, 50])

# Addition
print(s1 + s2)
# Output:
# 0    11
# 1    22
# 2    33
# 3    44
# 4    55
# dtype: int64

# Subtraction
print(s2 - s1)

# Multiplication
print(s1 * s2)

# Division
print(s2 / s1)

# Exponentiation
print(s1 ** 2)
# Output:
# 0     1
# 1     4
# 2     9
# 3    16
# 4    25
# dtype: int64

# Operations with a scalar
print(s1 + 10)
print(s1 * 2)

2. 比较运算

import pandas as pd

s = pd.Series([10, 20, 30, 40, 50])

# Greater than
print(s > 30)
# Output:
# 0    False
# 1    False
# 2    False
# 3     True
# 4     True
# dtype: bool

# Less than or equal
print(s <= 30)

# Equal
print(s == 30)

# Not equal
print(s != 30)

# Filtering with a boolean condition
print(s[s > 30])
# Output:
# 3    40
# 4    50
# dtype: int64

Series 的字符串操作

Series 中的字符串操作需要通过 .str 访问器：

import pandas as pd

s = pd.Series(['apple', 'banana', 'cherry', 'date'])

# Upper case
print(s.str.upper())
# Output:
# 0      APPLE
# 1     BANANA
# 2     CHERRY
# 3       DATE
# dtype: object   (pandas 2.x; newer default string dtype prints str)

# Lower case
print(s.str.lower())

# Length of each string
print(s.str.len())
# Output:
# 0    5
# 1    6
# 2    6
# 3    4
# dtype: int64

# Capitalize the first letter
print(s.str.capitalize())

# Capitalize the first letter of every word
print(s.str.title())

# Replace a substring
print(s.str.replace('a', 'A'))

# Check whether each string contains a substring
print(s.str.contains('a'))
# Output:
# 0     True
# 1     True
# 2    False
# 3     True
# dtype: bool

# Split strings on a separator
s2 = pd.Series(['apple,banana', 'cherry,date'])
print(s2.str.split(','))

实际应用示例

示例 1：股票价格分析

import pandas as pd

# Daily closing prices, labelled by date string
prices = pd.Series(
    [100, 102, 101, 105, 107, 106, 108],
    index=[
        '2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
        '2024-01-05', '2024-01-06', '2024-01-07',
    ],
)

# Day-over-day change (first entry is NaN: there is no previous day)
price_change = prices.diff()
print("每日价格变化：")
print(price_change)

# Day-over-day change in percent
price_pct_change = prices.pct_change() * 100
print("\n每日价格变化百分比：")
print(price_pct_change)

# 3-day moving average (first two entries are NaN: window not yet full)
ma3 = prices.rolling(window=3).mean()
print("\n3日移动平均：")
print(ma3)

# Summary statistics
print(f"\n最高价格: {prices.max()}")
print(f"最低价格: {prices.min()}")
print(f"平均价格: {prices.mean():.2f}")
print(f"价格波动（标准差）: {prices.std():.2f}")

示例 2：学生成绩管理

import pandas as pd

# Student scores, labelled by student name
scores = pd.Series(
    [85, 92, 78, 96, 88, 90, 82],
    index=['张三', '李四', '王五', '赵六', '孙七', '周八', '吴九'],
)

# Top students (score >= 90)
excellent = scores[scores >= 90]
print("优秀学生：")
print(excellent)

# Students who need to improve (score < 85)
need_improve = scores[scores < 85]
print("\n需要提高的学生：")
print(need_improve)

# Summary: summing a boolean mask counts the True entries
print(f"\n平均分: {scores.mean():.2f}")
print(f"及格人数: {(scores >= 60).sum()}")
print(f"优秀人数: {(scores >= 90).sum()}")
print(f"最高分: {scores.max()}, 学生: {scores.idxmax()}")
print(f"最低分: {scores.min()}, 学生: {scores.idxmin()}")

示例 3：数据清洗

import pandas as pd
import numpy as np

# Data containing missing values (NaN) and an outlier (100)
s = pd.Series([1, 2, np.nan, 4, 5, 100, 7, 8, np.nan, 10])

print("原始数据：")
print(s)

# Count missing values
print(f"\n缺失值数量: {s.isna().sum()}")

# Fill missing values with the mean (mean() skips NaN by default)
s_filled = s.fillna(s.mean())
print("\n填充缺失值后：")
print(s_filled)

# Drop missing values
s_dropped = s.dropna()
print("\n删除缺失值后：")
print(s_dropped)

# Detect outliers with the IQR rule: anything further than 1.5 * IQR
# outside the quartiles is flagged.
Q1 = s.quantile(0.25)
Q3 = s.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = s[(s < lower_bound) | (s > upper_bound)]
print(f"\n异常值: {outliers.tolist()}")