
Video walkthrough:

DQN Plays 2048 in Practice | Episode 3: Optimizing the Network, Using the GPU, and Improving the Env Reward

1. Only local merge rewards are considered: the reward is currently just the score gained when tiles merge, so it reflects only the immediate, per-step merge gain and offers no extra incentive for actually reaching 2048. Without a sufficiently strong reward signal for hitting 2048, the agent may never treat it as an important goal (a short numeric illustration follows this list).

2. Hardware is under-utilized: training does not use the GPU for acceleration and runs on the CPU by default, which is slow.
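To make point 1 concrete, here is a toy calculation (illustrative numbers only, not taken from a real run) of the TD target that train() later computes as target = reward + gamma * (1 - done) * max_a Q(next_state, a), with and without the terminal bonus:

gamma = 0.99
merge_reward = 128                                                 # hypothetical score from a single merge step
done = 1.0                                                         # reaching 2048 ends the episode, so the bootstrap term vanishes
target_plain = merge_reward + gamma * (1 - done) * 0.0             # 128.0
target_bonus = (merge_reward + 10000) + gamma * (1 - done) * 0.0   # 10128.0
print(target_plain, target_bonus)

The +10000 makes the transition that creates the 2048 tile stand out by roughly two orders of magnitude, which is exactly the goal signal point 1 asks for.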

The code changes are as follows:

In the step function, the state gains one extra input dimension: max_tile, the largest number currently on the board. Together with the bonus for reaching 2048, the end of step now looks like this:

if 2048 in self.board:
    reward += 10000
    done = True
state = self.board.flatten()
max_tile = np.max(self.board)
state = np.append(state, max_tile)
return state, reward, done
Accordingly, the network input size in train() grows from 16 to 17 (16 flattened board cells plus max_tile):

input_size = 17
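As a quick sanity check (a sketch that assumes the Game2048 class from the full listing further down), the returned state now has 17 entries:

env = Game2048()
state = env.reset()
print(state.shape)   # (17,)  -- 16 board cells plus the max_tile feature
print(state[-1])     # the current largest tile, e.g. 2 or 4 right after a reset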

Check whether a GPU is available on the system; if it is, use it for computation, otherwise fall back to the CPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
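An optional quick check, run right after the line above, to confirm which device was picked (nothing in the script depends on it):

print(device)                             # "cuda" if an NVIDIA GPU is visible to PyTorch, otherwise "cpu"
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))  # the GPU that will actually be used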

In train(), after creating the model instances, move them onto the chosen device (GPU or CPU) with .to(device):

model = DQN(input_size, output_size).to(device)
target_model = DQN(input_size, output_size).to(device)
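If you want to verify that the weights really moved, one way (assuming model and target_model from the lines above) is to inspect the device of a parameter:

print(next(model.parameters()).device)         # cuda:0 when a GPU is available, otherwise cpu
print(next(target_model.parameters()).device)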

During training and inference, the input data (states, actions, rewards, and so on) also has to be moved to the same device:

state = torch.FloatTensor(state).unsqueeze(0).to(device)
next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
states = torch.FloatTensor(states).to(device)
actions = torch.LongTensor(actions).to(device)
rewards = torch.FloatTensor(rewards).to(device)
next_states = torch.FloatTensor(next_states).to(device)
dones = torch.FloatTensor(dones).to(device)
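The reason every tensor needs the .to(device) call is that PyTorch will not mix CPU and GPU tensors inside one operation. A minimal self-contained illustration (hypothetical shapes, not part of the training script):

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
layer = nn.Linear(17, 4).to(device)  # weights live on the GPU when one is available
batch = torch.randn(32, 17)          # a freshly created tensor starts on the CPU
# layer(batch)                       # with a GPU present, this raises a RuntimeError about tensors being on different devices
out = layer(batch.to(device))        # moving the input first keeps everything on one device
print(out.device)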

Before storing a transition in the replay buffer, move state and next_state back to the CPU with .cpu() and then convert them to NumPy arrays with .numpy():

replay_buffer.add(state.cpu().squeeze(0).numpy(), action, reward, next_state.cpu().squeeze(0).numpy(), done)

Without this change you get the error: TypeError: can't convert cuda:0 device type tensor to numpy
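A minimal reproduction of the error and of the fix (it only actually triggers on a machine where device is cuda):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
t = torch.tensor([2.0, 4.0], device=device)
# t.numpy()            # on a CUDA tensor this raises the TypeError quoted above
arr = t.cpu().numpy()  # moving to the CPU first always works
print(arr)             # [2. 4.]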

The full code is as follows:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib.table import Table

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# 2048 game environment class
class Game2048:
    def __init__(self):
        self.board = np.zeros((4, 4), dtype=int)
        self.add_random_tile()
        self.add_random_tile()

    def add_random_tile(self):
        empty_cells = np.argwhere(self.board == 0)
        if len(empty_cells) > 0:
            index = random.choice(empty_cells)
            self.board[index[0], index[1]] = 2 if random.random() < 0.9 else 4

    def move_left(self):
        reward = 0
        new_board = np.copy(self.board)
        for row in range(4):
            line = new_board[row]
            non_zero = line[line != 0]
            merged = []
            i = 0
            while i < len(non_zero):
                # Merge two equal neighbours and collect the merge score as reward
                if i + 1 < len(non_zero) and non_zero[i] == non_zero[i + 1]:
                    merged.append(2 * non_zero[i])
                    reward += 2 * non_zero[i]
                    i += 2
                else:
                    merged.append(non_zero[i])
                    i += 1
            new_board[row] = np.pad(merged, (0, 4 - len(merged)), 'constant')
        if not np.array_equal(new_board, self.board):
            self.board = new_board
            self.add_random_tile()
        return reward

    def move_right(self):
        self.board = np.fliplr(self.board)
        reward = self.move_left()
        self.board = np.fliplr(self.board)
        return reward

    def move_up(self):
        self.board = self.board.T
        reward = self.move_left()
        self.board = self.board.T
        return reward

    def move_down(self):
        self.board = self.board.T
        reward = self.move_right()
        self.board = self.board.T
        return reward

    def step(self, action):
        if action == 0:
            reward = self.move_left()
        elif action == 1:
            reward = self.move_right()
        elif action == 2:
            reward = self.move_up()
        elif action == 3:
            reward = self.move_down()
        # Game over when the board is full and no neighbouring tiles can merge
        done = (not np.any(self.board == 0)
                and all([np.all(self.board[:, i] != self.board[:, i + 1]) for i in range(3)])
                and all([np.all(self.board[i, :] != self.board[i + 1, :]) for i in range(3)]))
        if 2048 in self.board:
            reward += 10000
            done = True
        state = self.board.flatten()
        max_tile = np.max(self.board)
        state = np.append(state, max_tile)  # 17th feature: the largest tile on the board
        return state, reward, done

    def reset(self):
        self.board = np.zeros((4, 4), dtype=int)
        self.add_random_tile()
        self.add_random_tile()
        state = self.board.flatten()
        max_tile = np.max(self.board)
        state = np.append(state, max_tile)
        return state


# Deep Q-network class
class DQN(nn.Module):
    def __init__(self, input_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)


# Experience replay buffer class
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return np.array(states), np.array(actions), np.array(rewards), np.array(next_states), np.array(dones)

    def __len__(self):
        return len(self.buffer)


# Board visualization function
def visualize_board(board, ax):
    ax.clear()
    table = Table(ax, bbox=[0, 0, 1, 1])
    nrows, ncols = board.shape
    width, height = 1.0 / ncols, 1.0 / nrows
    # Define the color mapping
    cmap = mcolors.LinearSegmentedColormap.from_list("", ["white", "yellow", "orange", "red"])
    for (i, j), val in np.ndenumerate(board):
        color = cmap(np.log2(val + 1) / np.log2(2048 + 1)) if val > 0 else "white"
        table.add_cell(i, j, width, height, text=val if val > 0 else "",
                       loc='center', facecolor=color)
    ax.add_table(table)
    ax.set_axis_off()
    plt.draw()
    plt.pause(0.1)


# Training function
def train():
    env = Game2048()
    input_size = 17
    output_size = 4
    model = DQN(input_size, output_size).to(device)
    target_model = DQN(input_size, output_size).to(device)
    target_model.load_state_dict(model.state_dict())
    target_model.eval()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    replay_buffer = ReplayBuffer(capacity=10000)
    batch_size = 32
    gamma = 0.99
    epsilon = 1.0
    epsilon_decay = 0.995
    epsilon_min = 0.01
    update_target_freq = 10
    num_episodes = 1000
    fig, ax = plt.subplots()
    for episode in range(num_episodes):
        state = env.reset()
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        done = False
        total_reward = 0
        while not done:
            visualize_board(env.board, ax)
            if random.random() < epsilon:
                action = random.randint(0, output_size - 1)
            else:
                q_values = model(state)
                action = torch.argmax(q_values, dim=1).item()
            next_state, reward, done = env.step(action)
            next_state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
            replay_buffer.add(state.cpu().squeeze(0).numpy(), action, reward, next_state.cpu().squeeze(0).numpy(), done)
            if len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)
                states = torch.FloatTensor(states).to(device)
                actions = torch.LongTensor(actions).to(device)
                rewards = torch.FloatTensor(rewards).to(device)
                next_states = torch.FloatTensor(next_states).to(device)
                dones = torch.FloatTensor(dones).to(device)
                q_values = model(states)
                q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
                next_q_values = target_model(next_states)
                next_q_values = next_q_values.max(1)[0]
                target_q_values = rewards + gamma * (1 - dones) * next_q_values
                loss = criterion(q_values, target_q_values)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            state = next_state
            total_reward += reward
        if episode % update_target_freq == 0:
            target_model.load_state_dict(model.state_dict())
        epsilon = max(epsilon * epsilon_decay, epsilon_min)
        print(f"Episode {episode}: Total Reward = {total_reward}, Epsilon = {epsilon}")
    plt.close()


if __name__ == "__main__":
    train()
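Once training has finished, a possible follow-up (a sketch, not part of the original script; it assumes the classes, the device variable, and a trained model from the code above) is to let the agent play one greedy episode with exploration switched off:

def play_greedy(model, max_steps=2000):
    env = Game2048()
    state = torch.FloatTensor(env.reset()).unsqueeze(0).to(device)
    done = False
    steps = 0
    # Cap the number of steps: an action that changes nothing yields reward 0 and no new tile,
    # so a purely greedy policy could otherwise loop forever on the same move.
    while not done and steps < max_steps:
        with torch.no_grad():
            action = torch.argmax(model(state), dim=1).item()
        next_state, _, done = env.step(action)
        state = torch.FloatTensor(next_state).unsqueeze(0).to(device)
        steps += 1
    print("max tile reached:", int(np.max(env.board)))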
