Siva Renganathan · dce657f5
--- a/bonus_point_assignment_2/dqn.ipynb

+ 28

− 12
+++ b/bonus_point_assignment_2/dqn.ipynb

+ 28

− 12
 %% Cell type:code id:0d58f9e7 tags:
 ``` python
 import os
 import copy
+%% Output
+    The autoreload extension is already loaded. To reload it, use:
+      %reload_ext autoreload
 %% Cell type:code id:45b9b8fb tags:
 %% Cell type:code id:b80e1114 tags:
    def __init__(self, env):
        super().__init__()
        self.q_network = nn.Sequential(
-            nn.Linear(np.array(env.single_observation_space.shape).prod(), 128),
+            nn.Linear(np.array(env.single_observation_space.shape).prod(), 256),
            nn.LeakyReLU(),
-            nn.Linear(128, 128),
+            nn.Linear(256, 256),
            nn.ReLU(),
-            nn.Linear(128, env.single_action_space.n),
+            nn.Linear(256, env.single_action_space.n),
        )
        self.target_network = copy.deepcopy(self.q_network)
 %% Cell type:code id:53f8844a tags:
 # agent training specific parameters and hyperparameters
 hypp.total_timesteps = 100000  # the training duration in number of time steps
-hypp.learning_rate = 3e-4  # the learning rate for the optimizer
+hypp.learning_rate = 1e-4  # the learning rate for the optimizer
 hypp.gamma = 0.99  # decay factor of future rewards
-hypp.buffer_size = 5000  # the size of the replay memory buffer
+hypp.buffer_size = 50000  # the size of the replay memory buffer
-hypp.target_network_frequency = 500  # the frequency of synchronization with target network
+hypp.target_network_frequency = 1000  # the frequency of synchronization with target network
-hypp.batch_size = 128  # number of samples taken from the replay buffer for one step
+hypp.batch_size = 32  # number of samples taken from the replay buffer for one step
-hypp.start_e = 1  # probability of exploration (epsilon) at timestep 0
+hypp.start_e = 1.0   # probability of exploration (epsilon) at timestep 0
-hypp.end_e = 0.05  # minimal probability of exploration (epsilon)
+hypp.end_e = 0.1  # minimal probability of exploration (epsilon)
-hypp.exploration_fraction = 0.05  # the fraction of total_timesteps it takes to go from start_e to end_e
+hypp.exploration_fraction = 0.0002  # the fraction of total_timesteps it takes to go from start_e to end_e (0.0002 * 100000 = 20 timesteps); NOTE(review): epsilon reaches end_e well before start_learning = 1000 - confirm this is intended
-hypp.start_learning = 5000  # the timestep the learning starts (before that the replay buffer is filled)
+hypp.start_learning = 1000  # the timestep the learning starts (before that the replay buffer is filled)
-hypp.train_frequency = 10  # the frequency of training
+hypp.train_frequency = 1  # the frequency of training
 %% Cell type:markdown id:d0155331-bfae-478c-a71c-4413613f2dd5 tags:
 %% Cell type:code id:0efaad3d-04b1-41f7-8596-a529a2773ad9 tags:
 %% Cell type:code id:2892bcc9-7e04-444f-9e9b-000d58503569 tags:
 %% Cell type:code id:6b8ccd37-8e65-444b-9e0d-0465bae02926 tags:
 %% Cell type:code id:52deb542-1e49-417e-91d3-65ddc32cb2fe tags:
 %% Cell type:code id:3af83dcd-1f86-4309-848b-4756a61b83fc tags:
 %% Cell type:code id:0da39b1c tags:
 ``` python
 # ------------------ RUN INIT - DO NOT EDIT ---------------------- #
 # reinit run_name
+%% Output
 %% Cell type:code id:c49f4c2b tags:
 ``` python
 agent_name = exp.run_name
 agent_exp_type = exp.exp_type  # both are needed to identify the agent location
+%% Output
+    <IPython.core.display.Video object>
 %% Cell type:code id:008addbe tags:
 ``` python
 eval_params = edict()  # eval_params - evaluation settings for trained agent
 eval_params.run_name00 = exp.run_name
+%% Output