First, import NumPy.
import numpy as np
Now define the value (utility) array, with one entry per state.
# Current utility estimate for each of the 5 states; every entry starts at zero.
utility = np.zeros(5, dtype=float)
We will use a fixed number of iterations.
# Number of value-iteration sweeps to run (fixed, no convergence check).
iterations = 25
There are two actions available: remove one match or remove two matches.
# Number of matches removed by each available action.
actions = np.array([1, 2])
This function gives the expected utility of each available action in a particular state.
def get_action_utility(state):
    """Return the expected utility of every action when taken in ``state``.

    For each entry ``a`` of the module-level ``actions`` array, the result is
    the mean of the current ``utility`` values at ``state - a`` and
    ``state - a - 1``, so both successor states are weighted equally.
    Successor indices wrap around modulo the number of states
    (``len(utility)``), which maps negative successors back into range.

    Both ``actions`` and ``utility`` are read from module scope at call time.

    Args:
        state: Integer index of the state to evaluate.

    Returns:
        A float array with one expected utility per entry of ``actions``,
        in the same order.
    """
    values = np.asarray(utility)
    n_states = len(values)
    # State reached immediately after each action, before wrap-around.
    successors = state - np.asarray(actions)
    return (
        values[np.mod(successors, n_states)]
        + values[np.mod(successors - 1, n_states)]
    ) / 2
# Value iteration: run a fixed number of sweeps over the non-terminal states,
# printing the utilities held at the start of each sweep.
# NOTE(review): the original indentation was lost; the rebinding of `utility`
# is placed after the state sweep (synchronous update) - confirm.
for iteration in range(iterations):
    print("Iteration : %d" % iteration)
    print(utility)
    # Start from zeros so that state 0, which the sweep skips, keeps utility 0.
    utility_new = np.zeros(len(utility))
    for state in range(1, len(utility)):  # state 0 is terminal and is never updated
        # Step reward of -1 plus the value of the best action under the
        # utilities from the previous sweep.
        utility_new[state] = -1 + np.amax(get_action_utility(state))
    utility = utility_new
Now we can pick the best action for each state.
# For every non-terminal state, print the expected utility of each action
# under the final utilities; the largest entry corresponds to the best action.
for state in range(1, len(utility)):
    print(get_action_utility(state))