This section uses only the SARSA($\lambda$) method to solve a variant of the problem from the previous question.
Consider a variant of the reinforcement learning problem that is identical to the previous one, except that the player does not incur a cost for investing effort; instead, she can invest high effort only if she has sufficient energy.
The player starts with a given energy level $B$, which is decremented by $c_H$ whenever the player invests high effort in a round. If the energy level would become negative after subtracting $c_H$, it is set to $0$.
In each round, an amount of energy $a$ is added to the player's energy level, independently with probability $p$.
The player can invest high effort in a round only if her energy level is at least $c_H$.
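In symbols, if $B_t$ denotes the energy level at the start of round $t$, $u_t \in \{L, H\}$ the effort invested, and $Z_t \sim \mathrm{Bernoulli}(p)$ the energy-arrival indicator, then the energy update used in the implementation below (which also caps the energy at the initial level $B$) is
$$B_{t+1} = \min\bigl(\max(B_t - c_H \mathbf{1}\{u_t = H\},\, 0) + a Z_t,\; B\bigr).$$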
The player receives a prize of value $R$ if she wins the game; this is the only reward received.
Use the following setting of parameters:
$p_H=0.55$
$p_L=0.45$
$c_H=1$
$R=1000$
$B=10$
$a=2$
$p=0.2$
step size $\eta=0.001$
discount parameter $\gamma=0.9$
trace decay parameter $\lambda=0.9$ (set as lam in the code below)
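For reference, the learning rule implemented in main() below is tabular SARSA($\lambda$) with accumulating eligibility traces: after each observed transition $(s, u, r, s', u')$, the TD error is computed, the trace of the visited state-action pair is incremented, and then every action value and every trace is updated:
$$\delta = r + \gamma\, Q(s', u') - Q(s, u), \qquad e(s, u) \leftarrow e(s, u) + 1,$$
$$Q(x, v) \leftarrow Q(x, v) + \eta\, \delta\, e(x, v), \qquad e(x, v) \leftarrow \gamma \lambda\, e(x, v) \quad \text{for all pairs } (x, v).$$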
I build a TwoPlayerGame class that defines the environment: the state is two-dimensional, with the first component being the energy level ($0$ to $10$) and the second the game state ($-3$ to $3$, stored internally as indices $0$ to $6$); a step function that, given an action, returns the next state (drawn according to the win/loss probabilities), the reward, and the done flag; and a reset function that restarts the game. All environment parameters are set below, together with the $\epsilon$-greedy policy function.
import numpy as np

n_states = 7          # game-state indices 0..6 (index 3 is the start, 6 is a win, 0 is a loss)
n_actions = 2         # 0 = low effort, 1 = high effort
low = 0
high = 1
p_h = 0.55            # win probability of a round with high effort
p_l = 0.45            # win probability of a round with low effort
c_h = 1               # energy cost of high effort
RETURN = 1000         # prize R for winning the game
b = 10                # initial (and maximum) energy level B
a = 2                 # energy added with probability p in each round
p = 0.2
step = 0.001          # step size eta
gamma = 0.9           # discount parameter
lam = 0.9             # trace decay parameter lambda
episodes = 500000

def win_or_loss(prob):
    # Return 1 (round won) with probability prob, otherwise 0.
    return np.random.choice([1, 0], p=[prob, 1 - prob])

def epsilon_greedy_policy(q_values, energy, epsilon=0.1):
    # High effort is available only when the energy level is at least c_h.
    if energy < c_h:
        return low
    if np.random.binomial(1, epsilon) == 1:
        return np.random.choice([low, high])
    # Otherwise act greedily, breaking ties uniformly at random.
    return np.random.choice([action_ for action_, value_ in enumerate(q_values)
                             if value_ == np.max(q_values)])

class TwoPlayerGame:
    def __init__(self, initial_state):
        self.initial_state = initial_state
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False

    def step(self, action):
        i, j = self.state  # (energy level, game-state index)
        # Energy: pay c_h for high effort (floored at 0), gain a with probability p (capped at b).
        energy = min(max(i - c_h * action, 0) + a * np.random.binomial(1, p), b)
        if win_or_loss(p_h if action else p_l):    # win the round with probability p_h or p_l
            self.state = (energy, min(j + 1, n_states - 1))
        else:
            self.state = (energy, max(j - 1, 0))
        if self.state[1] == n_states - 1:          # reached +3: game won, prize is paid
            self.reward = RETURN
            self.is_terminal = True
        elif self.state[1] == 0:                   # reached -3: game lost
            self.is_terminal = True
        return self.state, self.reward, self.is_terminal

    def reset(self):
        self.state = self.initial_state
        self.reward = 0.0
        self.is_terminal = False
        return self.state

def main():
    qsa = np.zeros((b + 1, n_states, n_actions))   # action values Q(s, a)
    game = TwoPlayerGame((b, 3))                   # start with full energy in the middle state
    for index in range(episodes):
        esa = np.zeros_like(qsa)                   # reset eligibility traces at episode start
        s = game.reset()
        act = epsilon_greedy_policy(qsa[s], s[0])
        done = False
        while not done:
            s_p, r, done = game.step(act)                         # next state, reward, done
            act_p = epsilon_greedy_policy(qsa[s_p], s_p[0])
            delta = r + gamma * qsa[s_p][act_p] - qsa[s][act]     # TD error
            esa[s][act] += 1                                      # accumulating trace
            qsa += step * delta * esa                             # update all action values
            esa *= gamma * lam                                    # decay all traces
            s = s_p
            act = act_p
        if index % 100000 == 0 and index != 0:
            print('%s Iteration(s):\n' % index, np.argmax(qsa, axis=2))
    print("\nOptimal policy:\n", np.argmax(qsa, axis=2),
          "\nAction values:\n", qsa)
    return qsa    # return the learned action values for inspection

if __name__ == '__main__':
    main()
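As an optional sanity check, the learned policy can be evaluated by running the environment with exploration switched off and counting wins. The helper below is a minimal sketch, not part of the original solution; it reuses the TwoPlayerGame environment and epsilon_greedy_policy defined above and assumes main() returns the learned qsa array as in the listing.

def evaluate_greedy_policy(qsa, n_episodes=10000):
    # Estimate the win probability of the greedy policy derived from qsa (epsilon = 0).
    wins = 0
    game = TwoPlayerGame((b, 3))
    for _ in range(n_episodes):
        s = game.reset()
        done = False
        r = 0.0
        while not done:
            act = epsilon_greedy_policy(qsa[s], s[0], epsilon=0.0)  # purely greedy action
            s, r, done = game.step(act)
        if r == RETURN:  # the prize is paid only when the game is won
            wins += 1
    return wins / n_episodes

# Example usage:
# qsa = main()
# print("Estimated win probability under the greedy policy:", evaluate_greedy_policy(qsa))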
Policy obtained from one run (rows index the energy level 0 to 10, columns the game-state index 0 to 6; an entry of 1 means invest high effort):
[[0 0 0 0 0 0 0]
[0 1 1 1 0 0 0]
[0 0 1 1 1 1 0]
[0 1 1 1 1 0 0]
[0 1 0 1 1 1 0]
[0 1 1 1 1 0 0]
[0 1 0 1 0 1 0]
[0 1 1 0 1 1 0]
[0 1 1 1 1 1 0]
[0 1 1 1 1 1 0]
[0 1 1 1 1 1 0]]
Policy obtained from another run (same layout):
[[0 0 0 0 0 0 0]
[0 1 1 0 0 1 0]
[0 0 1 0 1 0 0]
[0 1 0 1 0 1 0]
[0 0 1 0 1 1 0]
[0 1 0 1 0 1 0]
[0 1 1 0 1 1 0]
[0 1 0 1 1 1 0]
[0 1 1 1 1 1 0]
[0 1 1 1 1 1 0]
[0 1 1 1 1 1 0]]