1. Environment Adjustments
- Data collection: `RecordEpisodeStatistics`
- Skip the first n frames at the start: `baseSkipFrame`
- Record the end of a single life as done: `EpisodicLifeEnv`
- Clip the score to 0 or 1: `ClipRewardEnv`
- Frame stacking: `FrameStack`
- These are the basic preprocessing steps for image environments, making it easier for the CNN to capture the agent's behavior (see the wrapper-chain sketch after the `spSyncVectorEnv` code below).
- Fix the reset behavior of the vectorized environment `gym.vector.SyncVectorEnv`: in the original code the reset is random. The subclass `spSyncVectorEnv` overrides it so that every sub-environment can be reset with the same seed, which makes training under a fixed seed reproducible.
```python
from copy import deepcopy
from typing import Any, Callable, Iterable, Tuple

import numpy as np
import gym
from gym import Env, Space
from gym.vector.utils import concatenate
from numpy.typing import NDArray


class spSyncVectorEnv(gym.vector.SyncVectorEnv):
    """SyncVectorEnv whose step_wait resets a terminated sub-environment with a fixed seed."""

    def __init__(
        self,
        env_fns: Iterable[Callable[[], Env]],
        observation_space: Space = None,
        action_space: Space = None,
        copy: bool = True,
        random_reset: bool = False,
        seed: int = None,
    ):
        super().__init__(env_fns, observation_space, action_space, copy)
        self.random_reset = random_reset
        self.seed = seed

    def step_wait(self) -> Tuple[Any, NDArray[Any], NDArray[Any], NDArray[Any], dict]:
        """Steps through each of the environments returning the batched results.

        Returns:
            The batched environment step results
        """
        observations, infos = [], {}
        for i, (env, action) in enumerate(zip(self.envs, self._actions)):
            (
                observation,
                self._rewards[i],
                self._terminateds[i],
                self._truncateds[i],
                info,
            ) = env.step(action)
            if self._terminateds[i]:
                old_observation, old_info = observation, info
                if self.random_reset:
                    observation, info = env.reset(seed=np.random.randint(0, 999999))
                else:
                    # Reset with the fixed seed so every sub-environment stays reproducible.
                    observation, info = env.reset() if self.seed is None else env.reset(seed=self.seed)
                info["final_observation"] = old_observation
                info["final_info"] = old_info
            observations.append(observation)
            infos = self._add_info(infos, info, i)
        self.observations = concatenate(
            self.single_observation_space, observations, self.observations
        )
        return (
            deepcopy(self.observations) if self.copy else self.observations,
            np.copy(self._rewards),
            np.copy(self._terminateds),
            np.copy(self._truncateds),
            infos,
        )
```
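For reference, the wrapper chain listed above can be sketched roughly as follows. This is a minimal, illustrative stand-in: `make_env` is a hypothetical helper, and the repo's own `make_atari_env` (with its custom `baseSkipFrame`, `EpisodicLifeEnv`, `ClipRewardEnv` wrappers and extra options such as `max_no_reward_count` and `resize_inner_area`) is what is actually used in the training script below.

```python
import gym
from gym.wrappers import (FrameStack, GrayScaleObservation,
                          RecordEpisodeStatistics, ResizeObservation)


def make_env(env_name: str, episodic_life: bool = True, clip_reward: bool = True):
    """Hypothetical stand-in for make_atari_env: returns a thunk that builds one env."""
    def thunk():
        env = gym.make(env_name)
        env = RecordEpisodeStatistics(env)   # episode return/length collected into `info`
        # The repo's custom wrappers would sit here in the chain:
        #   env = baseSkipFrame(env, skip=4)                 # skip the first n frames
        #   if episodic_life: env = EpisodicLifeEnv(env)     # one lost life recorded as done
        #   if clip_reward:   env = ClipRewardEnv(env)       # reward clipped to 0 or 1
        env = GrayScaleObservation(env)      # image preprocessing for the CNN
        env = ResizeObservation(env, (84, 84))
        env = FrameStack(env, 4)             # stack 4 frames so motion is visible
        return env
    return thunk
```

A list of such thunks is exactly what `spSyncVectorEnv` consumes, e.g. `spSyncVectorEnv([make_env(env_name) for _ in range(num_envs)], seed=202404)`.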
2. PyTorch in Practice
2.1 Building and Training the Agent
For details see GitHub: test_ppo_atari.Breakout_v5_ppo2_test
After adjusting the vectorized environment's reset, the following changes were made:

- The actor and critic share the same CNN layers for feature extraction (`PPOSharedCNN`).
- `eps` was reduced to `eps=0.165`, so that each policy update stays within a smaller range.
- Learning-rate annealing was turned off.
- Different `ent_coef` values were tried; a slightly larger value increases the agent's exploration (see the loss sketch after this list):
  - `ent_coef=0.015` & `batch_size=256+128`: reward drops sharply and recovers slowly
  - `ent_coef=0.025` & `batch_size=256`: sharp drop then recovery, final reward=311
  - √ `ent_coef=0.05` & `batch_size=256`: final reward=416 (run `PPO2__AtariEnv instance__20241029__2217`)
  - `ent_coef=0.05` & `batch_size=256+128`
  - `ent_coef=0.1` & `batch_size=256`: improvement too flat
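The sketch below illustrates the two pieces above under explicit assumptions: the shared CNN trunk follows the standard Atari layout that `PPOSharedCNN` presumably uses, and the loss is the usual clipped PPO surrogate with a value term and an entropy bonus. `SharedCNNActorCritic` and `ppo_loss` are illustrative names, not the repo's actual classes.

```python
import torch
import torch.nn as nn


class SharedCNNActorCritic(nn.Module):
    """Actor and critic heads on top of one shared CNN feature extractor (assumed layout)."""
    def __init__(self, action_dim: int, in_channels: int = 4):
        super().__init__()
        # Classic Atari CNN trunk: 4 stacked 84x84 frames -> 512 features.
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels, 32, 8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(),
            nn.Conv2d(64, 64, 3, stride=1), nn.ReLU(),
            nn.Flatten(),
            nn.Linear(64 * 7 * 7, 512), nn.ReLU(),
        )
        self.actor = nn.Linear(512, action_dim)   # policy logits
        self.critic = nn.Linear(512, 1)           # state value

    def forward(self, obs: torch.Tensor):
        feat = self.cnn(obs / 255.0)              # shared features for both heads
        return self.actor(feat), self.critic(feat)


def ppo_loss(new_logp, old_logp, adv, value, v_target, entropy,
             eps=0.165, critic_coef=1.0, ent_coef=0.05):
    """Clipped surrogate objective + value loss - entropy bonus."""
    ratio = (new_logp - old_logp).exp()
    surr1 = ratio * adv
    surr2 = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * adv   # smaller eps -> smaller update range
    policy_loss = -torch.min(surr1, surr2).mean()
    value_loss = 0.5 * (value - v_target).pow(2).mean()
    # A larger ent_coef keeps the policy more stochastic, i.e. more exploration.
    return policy_loss + critic_coef * value_loss - ent_coef * entropy.mean()
```

Plugging in `eps=0.165`, `critic_coef=1.0` and `ent_coef=0.05` matches the hyperparameter choices in the config below.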
```python
# os, gym, and the repo helpers (gym_env_desc, make_atari_env, Config, PPO2, ppo2_train)
# are assumed to be imported from the project referenced above.
env_name = 'ALE/Breakout-v5'
env_name_str = env_name.replace('/', '-')
gym_env_desc(env_name)
print("gym.__version__ = ", gym.__version__)
path_ = os.path.dirname(__file__)
num_envs = 12
episod_life = True
clip_reward = True
resize_inner_area = True   # True
env_pool_flag = False      # True
seed = 202404
envs = spSyncVectorEnv(
    [make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward,
                    ppo_train=True, max_no_reward_count=120, resize_inner_area=resize_inner_area)
     for _ in range(num_envs)],
    random_reset=False,
    seed=202404
)
dist_type = 'norm'
cfg = Config(
    envs,
    save_path=os.path.join(path_, "test_models", f'PPO2_{env_name_str}-2'),
    seed=202404,
    num_envs=num_envs,
    episod_life=episod_life,
    clip_reward=clip_reward,
    resize_inner_area=resize_inner_area,
    env_pool_flag=env_pool_flag,
    # network parameters: Atari-CNN + MLP
    actor_hidden_layers_dim=[512, 256],
    critic_hidden_layers_dim=[512, 128],
    # agent parameters
    actor_lr=4.5e-4,
    gamma=0.99,
    # training parameters
    num_episode=3600,
    off_buffer_size=128,
    max_episode_steps=128,
    PPO_kwargs={
        'cnn_flag': True,
        'clean_rl_cnn': True,
        'share_cnn_flag': True,
        'continue_action_flag': False,
        'lmbda': 0.95,
        'eps': 0.165,          # 0.165
        'k_epochs': 4,         # update_epochs
        'sgd_batch_size': 512,
        'minibatch_size': 256,
        'act_type': 'relu',
        'dist_type': dist_type,
        'critic_coef': 1.0,    # 1.0
        'ent_coef': 0.05,
        'max_grad_norm': 0.5,
        'clip_vloss': True,
        'mini_adv_norm': True,
        'anneal_lr': False,
        'num_episode': 3600,
    }
)
minibatch_size = cfg.PPO_kwargs['minibatch_size']
max_grad_norm = cfg.PPO_kwargs['max_grad_norm']
cfg.trail_desc = f"actor_lr={cfg.actor_lr},minibatch_size={minibatch_size},max_grad_norm={max_grad_norm},hidden_layers={cfg.actor_hidden_layers_dim}"
agent = PPO2(
    state_dim=cfg.state_dim,
    actor_hidden_layers_dim=cfg.actor_hidden_layers_dim,
    critic_hidden_layers_dim=cfg.critic_hidden_layers_dim,
    action_dim=cfg.action_dim,
    actor_lr=cfg.actor_lr,
    critic_lr=cfg.critic_lr,
    gamma=cfg.gamma,
    PPO_kwargs=cfg.PPO_kwargs,
    device=cfg.device,
    reward_func=None
)
agent.train()
ppo2_train(
    envs, agent, cfg,
    wandb_flag=True,
    wandb_project_name=f"PPO2-{env_name_str}-NEW",
    train_without_seed=False,
    test_ep_freq=cfg.off_buffer_size * 10,
    online_collect_nums=cfg.off_buffer_size,
    test_episode_count=10,
    add_max_step_reward_flag=False,
    play_func='ppo2_play',
    ply_env=ply_env   # evaluation env, built elsewhere with the same make_atari_env settings
)
```
2.2 Watching the Trained Agent
Finally, take the best network from training and watch it play:
```python
env = make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward,
                     ppo_train=True, max_no_reward_count=120, resize_inner_area=resize_inner_area,
                     render_mode='human')()
ppo2_play(env, agent, cfg, episode_count=2, play_without_seed=False, render=True, ppo_train=True)
```