Solving the Multi-Armed Bandit Problem with the ε-Greedy Algorithm
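The script below compares a purely greedy learner with an ε-greedy learner (ε = 0.01) on a testbed of 10 slot machines whose expected rewards are drawn from a standard normal distribution, averaged over 1000 independent runs. At every time step the ε-greedy learner picks the arm with the highest estimated reward with probability 1 − ε, and explores a uniformly random arm with probability ε; each estimate is the sample average of the rewards that arm has returned so far.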
1. Code
# 4-2 Solving the multi-armed bandit problem with the ε-greedy algorithm
import numpy as np
import matplotlib.pyplot as plt
# Parameter settings
c = 10  # number of bandit arms
l = 501  # final time step
runs = 1000  # number of independent runs
epsilon = 0.01  # the ε in ε-greedy
# Create a random number generator with a fixed seed
rng = np.random.default_rng(1)
# Initialize the reward sum for each time step
timestep_rewards = np.zeros(l)  # array of size l, one entry per time step (greedy)
timestep_rewards_epsilon = np.zeros(l)  # array of size l, one entry per time step (ε-greedy)
# Loop over runs
for run in range(runs):
    # Per-run initialization
    occ_actions = np.zeros(c)  # number of times each action has been selected
    acc_rewards = np.zeros(c)  # accumulated reward for each action
    estimated_rewards = np.zeros(c)  # estimated expected reward of each machine
    occ_actions_epsilon = np.zeros(c)  # number of times each action has been selected (ε-greedy)
    acc_rewards_epsilon = np.zeros(c)  # accumulated reward for each action (ε-greedy)
    estimated_rewards_epsilon = np.zeros(c)  # estimated expected reward of each machine (ε-greedy)
    means_bandits = rng.normal(0, 1, c)  # expected reward of each machine for this run
    # From time step 1 to time step l-1
    for t in range(l - 1):
        # Compute the estimated expected reward of each machine
        for i in range(c):
            # Unvisited arms keep an estimate of 0 (guard against division by zero)
            estimated_rewards[i] = 0 if occ_actions[i] == 0 else acc_rewards[i] / occ_actions[i]
            estimated_rewards_epsilon[i] = 0 if occ_actions_epsilon[i] == 0 else acc_rewards_epsilon[i] / occ_actions_epsilon[i]
        # Select the greedy action
        a_t = np.argmax(estimated_rewards).item()
        # ε-greedy
        if rng.random() > epsilon:
            # Select the greedy action
            a_t_epsilon = np.argmax(estimated_rewards_epsilon).item()
        else:
            # Select an action uniformly at random
            a_t_epsilon = rng.integers(0, c)
        # Reward received (at the next time step) after taking the action
        r_tp1 = rng.normal(means_bandits[a_t], 1)
        r_tp1_epsilon = rng.normal(means_bandits[a_t_epsilon], 1)
        # Accumulate the reward for the selected action
        occ_actions[a_t] += 1
        acc_rewards[a_t] += r_tp1
        occ_actions_epsilon[a_t_epsilon] += 1
        acc_rewards_epsilon[a_t_epsilon] += r_tp1_epsilon
        # Accumulate the reward received at each time step
        timestep_rewards[t + 1] += r_tp1
        timestep_rewards_epsilon[t + 1] += r_tp1_epsilon
# Plot the average reward curves
plt.figure()
plt.plot(np.arange(1, l), timestep_rewards[1:] / runs, linewidth=2)
plt.plot(np.arange(1, l), timestep_rewards_epsilon[1:] / runs, 'r', linewidth=2)
plt.ylabel('Averaged rewards')
plt.xlabel('Time steps')
plt.title('Greedy vs. Epsilon-greedy')
plt.legend(['Greedy', 'Epsilon-greedy'])
plt.show()
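The inner loop above recomputes every arm's estimate from the accumulated sums at each time step. An equivalent formulation updates only the selected arm's estimate incrementally via Q ← Q + (R − Q) / n, which avoids the per-step loop over all arms. Below is a minimal sketch of one such step; the helper name epsilon_greedy_step and its signature are illustrative, not part of the original program.

import numpy as np

def epsilon_greedy_step(rng, q, n, means, epsilon):
    # One ε-greedy step with an incremental sample-average update
    if rng.random() > epsilon:
        a = int(np.argmax(q))  # exploit: arm with the highest current estimate
    else:
        a = int(rng.integers(0, len(q)))  # explore: uniform random arm
    r = rng.normal(means[a], 1)  # reward drawn from the chosen arm
    n[a] += 1
    q[a] += (r - q[a]) / n[a]  # Q <- Q + (R - Q)/n, identical to the sample average
    return a, r

Called once per time step with q = np.zeros(c) and n = np.zeros(c), this reproduces the ε-greedy learner above without maintaining a separate acc_rewards array.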