Implementing a Reinforcement-Learning (Q-learning) Model for Frequency Decisions
The following is an example of a Q-learning-based dynamic frequency-decision model, with some proprietary details removed:
import numpy as np
import random
from collections import deque
import gym
from gym import spaces


class FrequencyDecisionEnv(gym.Env):
    """Custom RL environment: dynamic update-frequency decision"""

    def __init__(self):
        super().__init__()
        # Observation space: [market regime (3), volatility (3), population diversity (3)]
        self.observation_space = spaces.Box(low=0, high=2, shape=(3,), dtype=np.int32)
        # Action space: update-frequency levels (minutes)
        self.action_space = spaces.Discrete(3)  # 0: 1 min, 1: 5 min, 2: 30 min
        # Reward weights
        self.alpha = 0.6  # return weight
        self.beta = 0.3   # risk weight
        self.gamma = 0.1  # compute-cost weight
        # State-transition parameters
        self.volatility_transition = [0.2, 0.6, 0.2]  # probabilities of moving down / staying / moving up

    def reset(self):
        """Reset the environment to its initial state"""
        self.state = np.array([0, 0, 0])  # initial state: bull market / low volatility / high diversity
        return self.state

    def step(self, action):
        """Execute an action and return the new state, reward, and done flag"""
        # Map the action to an update frequency
        current_freq = self._get_freq(action)
        # Simulate state transitions for each dimension
        new_market_state = self._transition_state(self.state[0])
        new_vol_state = self._transition_state(self.state[1])
        new_div_state = self._transition_state(self.state[2])
        new_state = np.array([new_market_state, new_vol_state, new_div_state])
        # Compute the reward
        reward = self._calculate_reward(current_freq)
        # Compute-resource consumption (returned in the info dict)
        compute_cost = self._get_compute_cost(action)
        # Advance the state
        self.state = new_state
        # Termination condition (e.g. end of the trading day)
        done = False  # adjust to actual requirements
        return new_state, reward, done, {"compute_cost": compute_cost}

    def _transition_state(self, current_state):
        """Probabilistic state-transition model"""
        candidates = [current_state - 1, current_state, current_state + 1]
        new_state = np.random.choice(candidates, p=self.volatility_transition)
        return int(np.clip(new_state, 0, 2))  # keep the level within [0, 2]

    def _calculate_reward(self, freq):
        """Composite reward function"""
        # Simulated return and risk metrics (replace with real calculations)
        sharpe_ratio = 1.5 - 0.5 * freq
        max_drawdown = 0.1 + 0.05 * freq
        compute_cost = 0.01 * (3 - freq)  # longer update intervals cost less
        return (self.alpha * sharpe_ratio
                + self.beta * (1 / max_drawdown)
                - self.gamma * compute_cost)

    def _get_compute_cost(self, action):
        """Compute-resource consumption model"""
        return [0.6, 0.3, 0.1][action]  # 1-minute updates cost the most

    def _get_freq(self, action):
        """Map actions to update frequencies (minutes)"""
        return [1, 5, 30][action]


class QLearningAgent:
    """Q-learning decision agent"""

    def __init__(self, state_space, action_space):
        self.state_space = state_space
        self.action_space = action_space
        # Q-table indexed as [market regime][volatility][diversity][action]
        n_levels = int(state_space.high.max() - state_space.low.min()) + 1
        self.q_table = np.zeros((n_levels,) * state_space.shape[0] + (action_space.n,))
        # Hyperparameters
        self.alpha = 0.1            # learning rate
        self.gamma = 0.9            # discount factor
        self.epsilon = 1.0          # initial exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def choose_action(self, state):
        """Epsilon-greedy action selection"""
        if np.random.rand() < self.epsilon:
            return self.action_space.sample()                   # explore
        else:
            return int(np.argmax(self.q_table[tuple(state)]))   # exploit

    def learn(self, state, action, reward, next_state):
        """Q-value update"""
        predict = self.q_table[tuple(state)][action]
        target = reward + self.gamma * np.max(self.q_table[tuple(next_state)])
        self.q_table[tuple(state)][action] += self.alpha * (target - predict)

    def decay_epsilon(self):
        """Exploration-rate decay"""
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
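Before training, a quick sanity check of the environment interface (a sketch, not part of the original flow) confirms that reset and step behave as expected:

# Sanity-check sketch: one environment step outside of training
check_env = FrequencyDecisionEnv()
state = check_env.reset()
next_state, reward, done, info = check_env.step(1)  # action 1 -> 5-minute update interval
print(state, next_state, round(reward, 3), done)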
# Training loop
env = FrequencyDecisionEnv()
agent = QLearningAgent(env.observation_space, env.action_space)
EPISODES = 5000
RENDER_INTERVAL = 500
MAX_STEPS = 100  # illustrative episode length; needed because the demo environment never sets done=True

for episode in range(EPISODES):
    state = env.reset()
    total_reward = 0
    for _ in range(MAX_STEPS):
        # Select an action
        action = agent.choose_action(state)
        # Execute the action
        next_state, reward, done, _ = env.step(action)
        # Learning update
        agent.learn(state, action, reward, next_state)
        # State update
        state = next_state
        total_reward += reward
        if done:
            break
    # Exploration-rate decay
    agent.decay_epsilon()
    # Logging
    if episode % RENDER_INTERVAL == 0:
        print(f"Episode: {episode:04d} | "
              f"State: {state} | "
              f"Action: {action}({env._get_freq(action)}min) | "
              f"Total Reward: {total_reward:.2f} | "
              f"Epsilon: {agent.epsilon:.4f}")

# Policy visualization
print("\nOptimal Policy:")
print("Market State | Volatility | Diversity | Best Frequency")
for m in range(3):
    for v in range(3):
        for d in range(3):
            action = int(np.argmax(agent.q_table[m, v, d]))
            print(f"{m:11} | {v:10} | {d:9} | {env._get_freq(action)}min")

Code Walkthrough
1. Environment modeling
State space: a three-dimensional state represents market dynamics
Market regime (0: bull / 1: sideways / 2: bear)
Volatility level (0: low / 1: medium / 2: high)
Population diversity (0: low / 1: medium / 2: high)
Action space: discretized update-frequency levels (1 / 5 / 30 minutes)
Reward function: combines return, risk, and compute cost (a worked sketch follows the formula below)
reward = α*Sharpe + β*(1/Drawdown) - γ*ComputeCost
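To make the trade-off concrete, a small sketch (using the placeholder metrics from _calculate_reward, not real market data) prints the reward for each frequency level:

# Sketch: evaluate the placeholder reward for each frequency level
env = FrequencyDecisionEnv()
for action in range(env.action_space.n):
    freq = env._get_freq(action)
    print(f"{freq:>2}min -> reward {env._calculate_reward(freq):.3f}")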
2. Agent design
Q-table structure:
[market regime][volatility][diversity][action]
Exploration strategy: ε-greedy with dynamic decay
epsilon = max(0.01, epsilon*0.995)
Learning parameters (a single-step update sketch follows this list):
Learning rate α = 0.1
Discount factor γ = 0.9
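The update is the standard tabular Q-learning step, Q(s,a) ← Q(s,a) + α[r + γ·max Q(s',·) − Q(s,a)]. The sketch below (illustrative state and reward values, not training data) runs a single update on a fresh agent:

# Sketch: one manual Q-update with the agent defined above (illustrative values)
env = FrequencyDecisionEnv()
agent = QLearningAgent(env.observation_space, env.action_space)
state, action, reward, next_state = (0, 0, 0), 0, 2.6, (0, 1, 0)
agent.learn(state, action, reward, next_state)
print(f"{agent.q_table[state][action]:.2f}")  # 0.1 * (2.6 + 0.9*0 - 0) = 0.26 on a zero-initialized table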
3. Training optimization
State-transition simulation: market dynamics are driven by a transition probability vector (checked empirically in the sketch below)
self.volatility_transition = [0.2, 0.6, 0.2]  # 20% move down / 60% stay / 20% move up
Compute-cost model: higher update frequencies consume more resources
compute_cost = 0.01*(3 - freq)  # 1-minute updates carry the highest cost
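A quick empirical check (a sketch; the sample size is arbitrary) confirms the transition probabilities behave as intended when starting from the middle level:

# Sketch: sample transitions from volatility level 1 and count the outcomes
env = FrequencyDecisionEnv()
samples = [env._transition_state(1) for _ in range(10_000)]
print({level: samples.count(level) for level in (0, 1, 2)})  # expect roughly 2000 / 6000 / 2000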
Extension directions
State feature engineering
Add an order-flow imbalance indicator (see the sketch after this block)
Use a machine-learning model to predict the market regime, for example:

# Example: predict the market regime with an LSTM
from keras.models import Sequential
from keras.layers import LSTM, Dense

n_steps, n_features = 30, 5  # illustrative lookback window and feature count
model = Sequential()
model.add(LSTM(50, input_shape=(n_steps, n_features)))
model.add(Dense(3, activation='softmax'))  # three regimes: bull / sideways / bear
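For the order-flow imbalance feature mentioned above, a minimal sketch could discretize the imbalance into an extra state dimension; the formula and thresholds below are common simple choices, not taken from the original:

import numpy as np

def order_flow_imbalance(bid_volume, ask_volume):
    """Simple OFI in [-1, 1]: positive values indicate dominant buying pressure."""
    total = bid_volume + ask_volume
    return 0.0 if total == 0 else (bid_volume - ask_volume) / total

ofi = order_flow_imbalance(bid_volume=1200, ask_volume=800)  # illustrative volumes
ofi_level = int(np.digitize(ofi, bins=[-0.2, 0.2]))          # 0: selling, 1: balanced, 2: buying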
Deep Q-Network (DQN)
Addresses high-dimensional state spaces that a tabular Q-table cannot handle; a minimal network skeleton:

import torch
import torch.nn as nn

class DQNAgent(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, action_dim)

    def forward(self, x):
        # Map a state vector to one Q-value per action
        return self.fc2(torch.relu(self.fc1(x)))
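A usage sketch for the skeleton above; the dimensions are assumptions that mirror the tabular setup (three state features, three actions):

import torch

net = DQNAgent(state_dim=3, action_dim=3)
q_values = net(torch.tensor([[0.0, 1.0, 2.0]]))  # batch of one state: [market, volatility, diversity]
action = int(q_values.argmax(dim=1))             # greedy action from the network's Q-values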
Use the MADDPG algorithm to coordinate multiple trading strategies

from pettingzoo.butterfly import prison_v3
env = prison_v3.env()
Real-time monitoring module
Add TensorBoard visualization

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('logs')
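Inside the training loop, per-episode metrics could then be recorded with the writer created above (the tag names are illustrative):

# Sketch: log per-episode metrics (place inside the training loop)
writer.add_scalar('train/total_reward', total_reward, episode)
writer.add_scalar('train/epsilon', agent.epsilon, episode)
writer.flush()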
Sample run output
Episode: 5000 | State: [2 2 2] | Action: 2(30min) | Total Reward: 120.35 | Epsilon: 0.0198
Optimal Policy:
Market State | Volatility | Diversity | Best Frequency
          0 |          0 |         0 | 1min
          0 |          0 |         1 | 1min
          0 |          0 |         2 | 5min
...(full policy matrix)
In practical use, the model must be connected to a real-time market data stream and complemented with a risk-control module (such as a stop-loss mechanism, sketched below) and a model-monitoring system.
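As an illustration of the kind of risk control intended here, a minimal stop-loss check might look like the following; the threshold is an assumption, not a recommendation from the original:

def should_stop_loss(entry_price, current_price, max_loss_pct=0.02):
    """Return True when the unrealized loss exceeds the allowed percentage (illustrative threshold)."""
    return (entry_price - current_price) / entry_price >= max_loss_pct

# Example: exit a long position after a 3% drop
print(should_stop_loss(entry_price=100.0, current_price=97.0))  # True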
