!pip install gymnasium numpy matplotlib
import gymnasium as gym
import numpy as np
import math 
import random
import matplotlib.pyplot as plt
env = gym.make('CartPole-v1')
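
# Observation: [cart position, cart velocity, pole angle, pole angular velocity].
# The two velocity components have infinite bounds in the env, so clamp them to
# finite ranges before discretizing.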
STATE_BOUNDS = list(zip(env.observation_space.low, env.observation_space.high))
STATE_BOUNDS[1] = [-0.5, 0.5]
STATE_BOUNDS[3] = [-math.radians(50), math.radians(50)]
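
# Buckets per state dimension; a single bucket effectively ignores that dimension.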
NUM_BUCKETS = (1, 1, 6, 3)
NUM_ACTIONS = env.action_space.n
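
# Q-table indexed by the discrete state tuple plus the action, initialized to zeros.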
q_table = np.zeros(NUM_BUCKETS + (NUM_ACTIONS,))
def discretize_state(state):
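    """Map a continuous observation to a tuple of bucket indices, one per dimension."""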
    discrete_states = []

    for i in range(len(state)):
        if state[i] <= STATE_BOUNDS[i][0]:
            discrete_state = 0
        elif state[i] >= STATE_BOUNDS[i][1]:
            discrete_state = NUM_BUCKETS[i] - 1
        else:
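            # Linearly map the value onto bucket indices 0..NUM_BUCKETS[i]-1.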
            bound_width = STATE_BOUNDS[i][1] - STATE_BOUNDS[i][0]
            offset = (NUM_BUCKETS[i] - 1) * STATE_BOUNDS[i][0] / bound_width
            scaling = (NUM_BUCKETS[i] - 1) / bound_width
            discrete_state = int(round(scaling * state[i] - offset))
        discrete_states.append(discrete_state)
    return tuple(discrete_states)
MAX_EPISODES = 1000     # hard cap on training episodes
MAX_STEPS = 500         # CartPole-v1 truncates episodes at 500 steps
SUCCESS_STEPS = 450     # an episode counts as a success if it lasts at least this many steps
SUCCESS_STREAK = 100    # stop once this many consecutive successes are reached
EPSILON_MIN = 0.01      # floor for the exploration rate
EPSILON_MAX = 1.0       # ceiling for the exploration rate
ALPHA_MIN = 0.1         # floor for the learning rate
ALPHA_MAX = 0.5         # ceiling for the learning rate
GAMMA = 0.99            # discount factor
DECAY_COEFF = 25        # controls how quickly epsilon and alpha decay
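
# Epsilon and alpha both start high and decay logarithmically with the episode
# index, clipped to their [min, max] ranges.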
def decay_epsilon(step):
    return max(EPSILON_MIN, min(EPSILON_MAX, 1.0-math.log10((step+1)/DECAY_COEFF)))

def decay_alpha(step):
    return max(ALPHA_MIN, min(ALPHA_MAX, 1.0-math.log10((step+1)/DECAY_COEFF)))
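
# Epsilon-greedy action selection over the discretized state.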
def select_action(state, epsilon):
    if random.random() < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table[state])
    return action
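
# Per-episode diagnostics: max Q-value at the terminal state and episode length.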
q_vals = []
success_steps = []

def train():
    successful_episodes = 0
    for episode in range(MAX_EPISODES):
        epsilon = decay_epsilon(episode)
        alpha = decay_alpha(episode)
        observation, _ = env.reset()
        state_discrete = discretize_state(observation)
        for step in range(MAX_STEPS):
            #env.render()
            action = select_action(state_discrete, epsilon)
            observation, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state_discrete = discretize_state(observation)

            # Q-learning update: bootstrap from the best next-state action value,
            # but do not bootstrap past a terminal state (truncation still bootstraps).
            best_q = np.amax(q_table[next_state_discrete])
            target = reward + (0.0 if terminated else GAMMA * best_q)
            q_table[state_discrete + (action,)] += alpha * (target - q_table[state_discrete + (action,)])
            state_discrete = next_state_discrete

            if done:
                episode_steps = step + 1
                q_vals.append(best_q)
                success_steps.append(episode_steps)
                print("Episode %d finished after %d time steps" % (episode, episode_steps))
                print("best q value: %f" % float(best_q))
                if episode_steps >= SUCCESS_STEPS:
                    successful_episodes += 1
                    print("=============SUCCESS=============")
                else:
                    successful_episodes = 0
                    print("=============FAIL=============")
                break

        # Check the success streak once per episode, after the step loop ends.
        if successful_episodes >= SUCCESS_STREAK:
            print("Training successful")
            return

def plot_q():
    plt.plot(q_vals)
    plt.title('Terminal-state Q-value per training episode')
    plt.xlabel('Training episode')
    plt.ylabel('Q-value')
    plt.show()

def plot_steps():
    plt.plot(success_steps)
    plt.title('Steps survived per training episode')
    plt.xlabel('Training episode')
    plt.ylabel('Steps')
    plt.show()
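
# Run training, then inspect the learned Q-table and the per-episode diagnostics.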

train()
env.close()
print(q_table)

plot_q()
plot_steps()