
Reinforcement Learning 3: Q-learning

Q-learning is a model-free reinforcement learning algorithm: it does not need to know the environment's transition probabilities P(s'|s,a) in advance (i.e. the probability of landing in state s' after taking action a in state s). It updates its Q-values using only the state-action-reward-next-state tuples (s, a, r, s') observed during actual interaction.


  1. Why no transition probabilities are needed

    • Q-learning directly learns the expected cumulative reward of each state-action pair (its Q-value) instead of modelling the environment's transition rules — the update rule is spelled out right after this list.
    • By interacting with the environment repeatedly, it uses the actually observed next state s' in place of a probability-weighted sum over all possible next states, which simplifies learning.
  2. The only information it relies on

    • The current state s and the available actions a (obtained via get_legal_actions).
    • The immediate reward r after taking the action (environment feedback).
    • The actual next state s' reached after taking the action (environment feedback).
  3. Difference from model-based algorithms

    • Model-based algorithms (such as dynamic programming) need P(s'|s,a) and the reward function R(s,a) up front in order to compute the optimal policy.
    • Q-learning needs none of this prior knowledge; it learns purely by trial and error from interaction, which makes it a better fit when the environment model is unknown.
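
Concretely, after each observed transition (s, a, r, s'), Q-learning performs the tabular update below, where α is the learning rate and γ the discount factor; this is exactly the rule implemented in the agent's update method later in this post:

$$Q(s,a) \leftarrow (1-\alpha)\,Q(s,a) + \alpha\bigl(r + \gamma \max_{a'} Q(s',a')\bigr)$$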

First, install the environment dependencies.

!pip3 install -q gymnasium[classic-control]

Next, define the reinforcement learning scaffolding. It is mainly used to train, evaluate, and visualize the performance of different agents in a given environment.

import os
import random
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output
from tqdm import tqdm, trange

%matplotlib inline

SEED = 42

if "google.colab" in sys.modules and not os.path.exists(".setup_complete"):
    !wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash
    !touch .setup_complete

# This code creates a virtual display to draw game images on.
# It will have no effect if your machine has a monitor.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ["DISPLAY"] = ":1"


def moving_average(x, span=100):
    return pd.DataFrame({"x": np.asarray(x)}).x.ewm(span=span).mean().values


def seed_everything(env, seed=None):
    if seed is None:
        seed = SEED
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    env.reset(seed=seed)


# Visualize the agent's behavior in the environment, showing the state and
# cumulative reward at every step.
def visualize_agent(env, agent, max_steps=100, delay=0.1):
    """Visualize the agent's behavior in the environment.

    Args:
        env: The environment
        agent: The trained agent
        max_steps: Maximum number of steps to take
        delay: Time delay between steps for visualization
    """
    s, _ = env.reset()
    total_reward = 0

    for step in range(max_steps):
        # Render the environment
        clear_output(True)
        plt.figure(figsize=(8, 6))
        plt.imshow(env.render())
        plt.title(f"Step: {step}, Total Reward: {total_reward:.2f}")
        plt.axis("off")
        plt.show()

        # Get action from the agent
        a = agent.get_best_action(s)  # Use best action for visualization

        # Take a step in the environment
        next_s, r, done, _, _ = env.step(a)

        # Update state and reward
        s = next_s
        total_reward += r

        # Add delay for better visualization
        time.sleep(delay)

        if done:
            # Show final state
            clear_output(True)
            plt.figure(figsize=(8, 6))
            plt.imshow(env.render())
            plt.title(f"Final State - Steps: {step + 1}, Total Reward: {total_reward:.2f}")
            plt.axis("off")
            plt.show()
            break


# Train several different agents side by side and compare their performance.
# Supports multiple random seeds so that the results are stable.
# Plots the average reward curve with confidence bands (± one standard deviation).
def benchmark_agents(
    exp_setups,
    num_episodes=1000,
    plot_every=100,
    t_max=10000,
    span=100,
    patch_every=None,
    patch_foo=None,
    num_seeds=3,
):
    all_rewards = {}
    envs = {exp_setup["name"]: exp_setup["env"]() for exp_setup in exp_setups}
    agent_builders = {exp_setup["name"]: exp_setup["agent_builder"] for exp_setup in exp_setups}
    train_foo = {exp_setup["name"]: exp_setup["train_foo"] for exp_setup in exp_setups}

    for seed in range(num_seeds):
        SEED = seed + 42  # Using different seeds
        agents = {agent_name: agent() for agent_name, agent in agent_builders.items()}

        # Create a separate environment for each agent using the env function
        for agent_name, agent in agents.items():
            agents[agent_name].env = envs[agent_name]

        seed_rewards = {agent_name: [] for agent_name in agent_builders}

        # Seed each environment separately
        for agent_name in agents:
            seed_everything(envs[agent_name], seed=SEED)

        tbar = trange(num_episodes)
        tbar.set_description(f"Seed {seed + 1}/{num_seeds}")
        for i in tbar:
            for agent_name, agent in agents.items():
                seed_rewards[agent_name].append(train_foo[agent_name](envs[agent_name], agent))
            if i % 10 == 0:
                tbar.set_postfix({agent_name: seed_rewards[agent_name][-1] for agent_name in agents}, refresh=True)

        # Store rewards for this seed
        for agent_name, rewards_list in seed_rewards.items():
            if agent_name not in all_rewards:
                all_rewards[agent_name] = []
            all_rewards[agent_name].append(rewards_list)

    # Average rewards across seeds
    avg_rewards = {agent_name: np.mean(np.array(seed_results), axis=0) for agent_name, seed_results in all_rewards.items()}
    # Calculate standard deviation for confidence intervals
    std_rewards = {agent_name: np.std(np.array(seed_results), axis=0) for agent_name, seed_results in all_rewards.items()}

    # Plot average performance across seeds with confidence tubes
    clear_output(True)
    plt.figure(figsize=(10, 6))

    for agent_name, rewards_list in avg_rewards.items():
        mean_rewards = moving_average(rewards_list, span=span)
        std_rewards_smoothed = moving_average(std_rewards[agent_name], span=span)

        # Plot mean line
        plt.plot(mean_rewards, label=f"{agent_name} (avg of {num_seeds} seeds)")

        # Plot confidence tubes (mean ± std)
        plt.fill_between(
            range(len(mean_rewards)),
            mean_rewards - std_rewards_smoothed,
            mean_rewards + std_rewards_smoothed,
            alpha=0.2,
        )

        # Draw contour lines for the confidence tube borders
        plt.plot(range(len(mean_rewards)), mean_rewards - std_rewards_smoothed, "--", color="gray", alpha=0.7)
        plt.plot(range(len(mean_rewards)), mean_rewards + std_rewards_smoothed, "--", color="gray", alpha=0.7)

    plt.title(f"{envs[list(envs.keys())[0]].spec.id} - Average performance across {num_seeds} seeds with confidence intervals")
    plt.legend()
    plt.show()

    return avg_rewards
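
To show how this framework is meant to be wired together, here is a rough sketch of a benchmark_agents call. It is illustrative only: the dictionary keys (name, env, agent_builder, train_foo) are the ones the function reads, while QLearningAgent and play_and_train are only defined further down in this post, and the hyperparameters are arbitrary.

import gymnasium as gym

exp_setups = [
    {
        "name": "q_learning",
        "env": lambda: gym.make("Taxi-v3", render_mode="rgb_array"),
        # benchmark_agents overwrites agent.env after construction, so env=None is fine here
        "agent_builder": lambda: QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99, env=None),
        "train_foo": play_and_train,  # train_foo(env, agent) must return one episode's total reward
    },
]

avg_rewards = benchmark_agents(exp_setups, num_episodes=1000, num_seeds=3)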

Implementing the Q-learning agent

import math
import random
from collections import defaultdict

import numpy as np


class QLearningAgent:
    def __init__(self, alpha, epsilon, discount, env):
        """Q-Learning Agent
        based on https://inst.eecs.berkeley.edu/~cs188/sp19/projects.html

        Instance variables you have access to
          - self.epsilon (exploration prob)
          - self.alpha (learning rate)
          - self.discount (discount rate aka gamma)

        Functions you should use
          - self.get_legal_actions(state) {state, hashable -> list of actions, each is hashable}
            which returns legal actions for a state
          - self.get_qvalue(state, action)
            which returns Q(state, action)
          - self.set_qvalue(state, action, value)
            which sets Q(state, action) := value

        !!!Important!!!
        Note: please avoid using self._qValues directly.
        There's a special self.get_qvalue/set_qvalue for that.
        """
        self.env = env
        self._qvalues = defaultdict(lambda: defaultdict(lambda: 0))
        self.alpha = alpha
        self.epsilon = epsilon
        self.discount = discount

    def get_legal_actions(self, _state):
        return list(range(self.env.action_space.n))

    def get_qvalue(self, state, action):
        """Returns Q(state, action)"""
        return self._qvalues[state][action]

    def set_qvalue(self, state, action, value):
        """Sets the Q-value for [state, action] to the given value"""
        self._qvalues[state][action] = value

    # ---------------------START OF YOUR CODE---------------------#

    def get_value(self, state):
        """
        Compute your agent's estimate of V(s) using current q-values:
        V(s) = max_over_action Q(state, action) over possible actions.
        Note: please take into account that q-values can be negative.
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return 0.0
        if len(possible_actions) == 0:
            return 0.0

        # <YOUR CODE HERE>
        value = max(self.get_qvalue(state, action) for action in possible_actions)
        # </END OF YOUR CODE>

        return value

    def update(self, state, action, reward, next_state, *args, **kwargs):
        """
        You should do your Q-Value update here:
        Q(s,a) := (1 - alpha) * Q(s,a) + alpha * (r + gamma * V(s'))
        """
        # agent parameters
        gamma = self.discount
        learning_rate = self.alpha

        # <YOUR CODE HERE>
        current_q = self.get_qvalue(state, action)
        next_v = self.get_value(next_state)
        new_q_value = (1 - learning_rate) * current_q + learning_rate * (reward + gamma * next_v)
        # </END OF YOUR CODE>

        self.set_qvalue(state, action, new_q_value)

    def get_best_action(self, state):
        """
        Compute the best action to take in a state (using current q-values).
        """
        possible_actions = self.get_legal_actions(state)

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # <YOUR CODE HERE>
        max_q = max(self.get_qvalue(state, action) for action in possible_actions)
        best_actions = [action for action in possible_actions if self.get_qvalue(state, action) == max_q]
        best_action = best_actions[0]
        # </END OF YOUR CODE>

        return best_action

    def get_action(self, state):
        """
        Compute the action to take in the current state, including exploration.
        With probability self.epsilon, we should take a random action,
        otherwise - the best policy action (self.get_best_action).

        Note: To pick randomly from a list, use random.choice(list).
              To pick True or False with a given probability, generate a uniform number in [0, 1]
              and compare it with your probability.
        """
        # Pick Action
        possible_actions = self.get_legal_actions(state)
        action = None

        # If there are no legal actions, return None
        if len(possible_actions) == 0:
            return None

        # agent parameters:
        epsilon = self.epsilon

        # Tip: Use self.env.np_random.random() to generate a random number
        # <YOUR CODE HERE>
        if self.env.np_random.random() < epsilon:
            chosen_action = random.choice(possible_actions)
        else:
            chosen_action = self.get_best_action(state)
        # </END OF YOUR CODE>

        return chosen_action
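
As a quick, illustrative sanity check of the implementation above (the names _demo_env and _demo_agent are placeholders for this sketch only): a fresh agent has every Q-value at 0, so a single update with reward -1 and alpha = 0.5 should move Q(s, a) to -0.5.

import gymnasium as gym

_demo_env = gym.make("Taxi-v3")
_demo_agent = QLearningAgent(alpha=0.5, epsilon=0.1, discount=0.99, env=_demo_env)
_s, _ = _demo_env.reset(seed=0)

assert _demo_agent.get_value(_s) == 0.0               # all Q-values start at 0
_demo_agent.update(_s, 0, reward=-1.0, next_state=_s)
print(_demo_agent.get_qvalue(_s, 0))                  # 0.5 * 0 + 0.5 * (-1 + 0.99 * 0) = -0.5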

Define the Taxi-v3 environment.

import gymnasium as gym

env = gym.make("Taxi-v3", render_mode="rgb_array")
n_actions = env.action_space.n
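
Taxi-v3 is a small, fully discrete environment, which is what makes a plain tabular Q-table feasible here; for reference:

print(env.observation_space)  # Discrete(500): 25 taxi positions x 5 passenger locations x 4 destinations
print(env.action_space)       # Discrete(6): south, north, east, west, pickup, dropoff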

Visualize the initial state.

s, _ = env.reset(seed=SEED)
plt.imshow(env.render())
plt.show()

The training loop.

def play_and_train(env, agent, t_max=10**4):
    """
    This function should
    - run a full game, actions given by agent's e-greedy policy
    - train agent using agent.update(...) whenever it is possible
    - return total reward
    """
    total_reward = 0.0
    s, _ = env.reset()

    for t in range(t_max):
        # get agent to pick action given state s.
        # <YOUR CODE HERE>
        a = agent.get_action(s)
        # </END OF YOUR CODE>

        next_s, r, terminated, truncated, _ = env.step(a)
        done = terminated

        # train (update) agent for state s
        # <YOUR CODE HERE>
        agent.update(s, a, r, next_s)
        # </END OF YOUR CODE>

        s = next_s
        total_reward += r
        if done:
            break

    return total_reward

Now train the agent with Q-learning to learn the optimal policy in the Taxi-v3 environment, monitoring progress as training runs. The exploration rate (epsilon) is decayed by a factor of 0.99 per episode, so random exploration fades out and the agent increasingly picks the actions it currently believes are best (after 1000 episodes, epsilon ≈ 0.25 × 0.99^1000 ≈ 1e-5, i.e. the policy is essentially greedy).

from IPython.display import clear_output

agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99, env=env)

rewards = []
seed_everything(env)

for i in range(1000):
    rewards.append(play_and_train(env, agent))
    agent.epsilon *= 0.99

    if i % 100 == 0:
        clear_output(True)
        plt.title("eps = {:e}, mean reward = {:.1f}".format(agent.epsilon, np.mean(rewards[-10:])))
        plt.plot(rewards)
        plt.plot(moving_average(rewards))
        plt.show()

assert env.unwrapped.spec.id == "Taxi-v3" and np.mean(rewards[-100:]) >= 4.5, (
    "Please make sure that your agent is able to learn the optimal policy"
)

The resulting training curve is shown below.
[Figure: training reward curve for Q-learning on Taxi-v3]
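
With training finished, the learned greedy policy can also be replayed step by step using the visualize_agent helper defined earlier (a minimal sketch; it relies on env having been created with render_mode="rgb_array", as above, and the max_steps/delay values here are arbitrary):

visualize_agent(env, agent, max_steps=50, delay=0.2)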
