First import numpy

In [1]:
import numpy as np

Now define value array

In [2]:
utility = np.zeros(5)

We will use fixed number of iterations

In [3]:
iterations = 25

There will be two actions available - remove one or two matches

In [4]:
actions=np.array([1,2])

This function will give us expected utility for each action available in one particular state

In [5]:
def get_action_utility(state):
    action_utility = np.zeros(len(actions))
    for i in range(0, len(actions)):
        action_utility[i] = (utility[np.mod(state - actions[i], 5)] + utility[np.mod(state - actions[i] - 1, 5)]) / 2
    return action_utility
In [6]:
for iteration in range(0,iterations):
        print("Iteration : %d" % iteration)
        print(utility)
        utility_new = np.zeros(5)
        for state in range(1,5): # terminal state has probability of zero
            utility_new[state] = -1 + np.amax( get_action_utility(state) )
        utility = utility_new
Iteration : 0
[ 0.  0.  0.  0.  0.]
Iteration : 1
[ 0. -1. -1. -1. -1.]
Iteration : 2
[ 0.  -1.5 -1.5 -1.5 -2. ]
Iteration : 3
[ 0.   -2.   -1.75 -1.75 -2.5 ]
Iteration : 4
[ 0.   -2.25 -2.   -2.   -2.75]
Iteration : 5
[ 0.    -2.375 -2.125 -2.125 -3.   ]
Iteration : 6
[ 0.     -2.5    -2.1875 -2.1875 -3.125 ]
Iteration : 7
[ 0.     -2.5625 -2.25   -2.25   -3.1875]
Iteration : 8
[ 0.      -2.59375 -2.28125 -2.28125 -3.25   ]
Iteration : 9
[ 0.       -2.625    -2.296875 -2.296875 -3.28125 ]
Iteration : 10
[ 0.       -2.640625 -2.3125   -2.3125   -3.296875]
Iteration : 11
[ 0.        -2.6484375 -2.3203125 -2.3203125 -3.3125   ]
Iteration : 12
[ 0.         -2.65625    -2.32421875 -2.32421875 -3.3203125 ]
Iteration : 13
[ 0.         -2.66015625 -2.328125   -2.328125   -3.32421875]
Iteration : 14
[ 0.         -2.66210938 -2.33007812 -2.33007812 -3.328125  ]
Iteration : 15
[ 0.         -2.6640625  -2.33105469 -2.33105469 -3.33007812]
Iteration : 16
[ 0.         -2.66503906 -2.33203125 -2.33203125 -3.33105469]
Iteration : 17
[ 0.         -2.66552734 -2.33251953 -2.33251953 -3.33203125]
Iteration : 18
[ 0.         -2.66601562 -2.33276367 -2.33276367 -3.33251953]
Iteration : 19
[ 0.         -2.66625977 -2.33300781 -2.33300781 -3.33276367]
Iteration : 20
[ 0.         -2.66638184 -2.33312988 -2.33312988 -3.33300781]
Iteration : 21
[ 0.         -2.66650391 -2.33319092 -2.33319092 -3.33312988]
Iteration : 22
[ 0.         -2.66656494 -2.33325195 -2.33325195 -3.33319092]
Iteration : 23
[ 0.         -2.66659546 -2.33328247 -2.33328247 -3.33325195]
Iteration : 24
[ 0.         -2.66662598 -2.33329773 -2.33329773 -3.33328247]

Now we can pick the best action

In [7]:
for state in range(1,5):
    print(get_action_utility(state))
[-1.66664886 -2.83330536]
[-1.33332062 -1.66664886]
[-2.49997711 -1.33332062]
[-2.33331299 -2.49997711]