In [None]:
!pip install gymnasium==1.2.0

In [29]:
import numpy as np
# Compatibility shim: if the installed NumPy does not expose `np.bool8`,
# alias it to `np.bool_`. This runs before gymnasium is imported --
# presumably because some code path there still looks up `np.bool8`
# (TODO confirm the shim is still needed with the pinned gymnasium version).
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_
import gymnasium as gym

In [None]:
# Sanity check: the reported version should match the one pinned in the install cell.
print(gym.__version__)

In [None]:
# Build the Taxi environment with text ("ansi") rendering and the
# rainy / fickle-passenger options switched on.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
    is_rainy=True,
    fickle_passenger=True,
)
P = env.unwrapped.P

# Print the transition table of the first three states so the
# non-deterministic outcomes can be checked by eye.
for s in list(P)[:3]:
    print(f"State {s}:")
    for a, trans in P[s].items():
        print(f"  Action {a}: {trans}")

We will be using Gymnasium (the maintained successor to OpenAI Gym). In Gymnasium, every environment has an action space and an observation (state) space, accessible via `env.action_space` and `env.observation_space`.

The underlying source code for the environment is available at: https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/envs/toy_text/taxi.py

In [None]:
# Cache the sizes of the action and observation spaces for the cells below.
action_size, state_size = env.action_space.n, env.observation_space.n
print(f"Action size: {action_size}")
print(f"State size: {state_size}")

In [None]:
# The action space is represented by a Discrete object.
print(env.action_space)

In [57]:
def get_random_trajectory(render=False):
    """Roll out one episode of the notebook-level ``env`` with random actions.

    Args:
        render: When true, print the environment's rendering after every
            step (including the final one).

    Returns:
        A tuple ``(rewards, traj_length)`` where ``rewards`` is the list of
        per-step rewards and ``traj_length`` is the number of steps taken.
    """
    # reset() returns (observation, info); neither is needed for a random rollout.
    env.reset()
    rewards = []
    while True:
        action = env.action_space.sample()
        _, reward, terminated, truncated, _ = env.step(action)
        rewards.append(reward)
        if render:
            # Text render modes (e.g. "ansi") return the frame instead of
            # drawing it, so it has to be printed explicitly.
            frame = env.render()
            if frame is not None:
                print(frame)
        # An episode ends either by reaching a terminal state or by being
        # cut off (time limit); both must stop the rollout.
        if terminated or truncated:
            break
    return rewards, len(rewards)

In [None]:
# Roll out one random episode without rendering and report its total reward.
rewards, traj_length = get_random_trajectory(render=False)
print(np.asarray(rewards).sum())


To implement value iteration and policy iteration, we need the underlying transition distributions. These are in general not available in Gym environments, but we can access them for Taxi.

`env.unwrapped.P` is a dictionary containing the underlying transition and reward dynamics for the environment.

In [None]:
# Keep a handle on the dynamics table, then list the keys it is indexed by.
transition_dict = env.unwrapped.P
print(transition_dict.keys())

We can view the transition distribution for a single state (state 0 below). Each key corresponds to an action, and each value lists that action's possible transitions as tuples of the form `(probability, next_state, reward, done)`: the probability of the transition, the next state, the reward, and whether the episode terminates.

Actions are 0-indexed and correspond to the following:

- 0: move south
- 1: move north
- 2: move east
- 3: move west
- 4: pickup passenger
- 5: drop off passenger

In [None]:
# Human-readable action names, indexed by action id.
actions = ["South", "North", "East", "West", "Pickup", "Dropoff"]

# Show the outcome tuples available from state 0, one action per line.
init_state = transition_dict[0]
for k in init_state.keys():
    print(f"action {k}: transitions (p, ns, r, d):  {init_state[k]}")

In [None]:
# helper function to print policy
def print_policy(policy, shape=(6, 6)):
    """Print a policy as a grid of action names.

    Args:
        policy: Flat sequence (list or array) of action ids; each id is used
            as an index into the notebook-level ``actions`` name list.
        shape: ``(rows, cols)`` layout to print the policy in. Defaults to
            the 6x6 grid this helper originally hard-coded; ``rows * cols``
            must equal the number of entries in ``policy``.

    Raises:
        ValueError: If ``policy`` cannot be reshaped to ``shape``.
    """
    grid = np.asarray(policy).reshape(shape)
    for row in grid:
        # Leading space plus " | " separators between the cells of a row.
        print(" " + " | ".join(actions[int(action_id)] for action_id in row))


In [None]:
# Draw one action id in [0, 6) for each of 36 entries and display the result.
policy = np.random.randint(low=0, high=6, size=36)
print("Random policy: ")
print_policy(policy)


We have provided some possible functions and their signatures, though you are certainly free to modify as you see fit.

In [None]:
def value_iteration(values, gamma, iterations=100, termination=1e-4, transitions=None):
    """Run value iteration and return the greedy policy with its values.

    Values are updated in place during a sweep (asynchronous updates), so
    later states in a sweep already see the refreshed values of earlier ones.

    Args:
        values: Initial state-value estimates, indexable by state id. The
            input is copied; the caller's array is not modified.
        gamma: Discount factor.
        iterations: Maximum number of sweeps over all states.
        termination: Stop early once the largest per-state change within a
            sweep drops below this threshold.
        transitions: Dynamics table in the layout
            ``transitions[state][action] -> [(p, next_state, reward, done), ...]``
            with states keyed ``0..n-1``. Defaults to the notebook-level
            ``transition_dict``.

    Returns:
        A tuple ``(policy, values)``: an integer array with the greedy action
        per state and a float array with the estimated state values.
    """
    if transitions is None:
        transitions = transition_dict
    values = np.array(values, dtype=float)  # work on a private copy
    n_states = len(transitions)

    def action_values(state):
        # One-step lookahead; transitions flagged `done` do not bootstrap
        # from the value of the state they end in.
        return {
            action: sum(
                p * (r + gamma * values[ns] * (not done))
                for p, ns, r, done in outcomes
            )
            for action, outcomes in transitions[state].items()
        }

    for _ in range(iterations):
        max_update = 0.0
        for state in range(n_states):
            best = max(action_values(state).values())
            max_update = max(max_update, abs(best - values[state]))
            values[state] = best
        # terminate if values don't change much
        if max_update < termination:
            break

    # Extract the greedy policy; ties go to the first action listed.
    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        q = action_values(state)
        policy[state] = max(q, key=q.get)
    return policy, values

# estimate values
def policy_evaluation(policy, init_values, gamma, termination=1e-4,
                      transitions=None, max_iterations=10_000):
    """Iteratively estimate the state values of a fixed deterministic policy.

    Args:
        policy: Sequence giving the action id taken in each state.
        init_values: Starting value estimates, indexable by state id. The
            input is copied; the caller's array is not modified.
        gamma: Discount factor.
        termination: Stop once the largest per-state change within a sweep
            drops below this threshold.
        transitions: Dynamics table in the layout
            ``transitions[state][action] -> [(p, next_state, reward, done), ...]``
            with states keyed ``0..n-1``. Defaults to the notebook-level
            ``transition_dict``.
        max_iterations: Upper bound on the number of sweeps, so evaluation
            also returns when the values never settle (e.g. ``gamma == 1``
            with a policy that never terminates).

    Returns:
        Float array of estimated state values under ``policy``.
    """
    if transitions is None:
        transitions = transition_dict
    values = np.array(init_values, dtype=float)  # work on a private copy
    n_states = len(transitions)

    for _ in range(max_iterations):
        max_update = 0.0
        for state in range(n_states):
            outcomes = transitions[state][int(policy[state])]
            # Transitions flagged `done` do not bootstrap from the next state.
            new_value = sum(
                p * (r + gamma * values[ns] * (not done))
                for p, ns, r, done in outcomes
            )
            max_update = max(max_update, abs(new_value - values[state]))
            values[state] = new_value
        if max_update < termination:
            break
    return values

# update actions
def policy_improvement(values, gamma, transitions=None):
    """Return the policy that is greedy with respect to ``values``.

    Args:
        values: State-value estimates, indexable by state id.
        gamma: Discount factor.
        transitions: Dynamics table in the layout
            ``transitions[state][action] -> [(p, next_state, reward, done), ...]``
            with states keyed ``0..n-1``. Defaults to the notebook-level
            ``transition_dict``.

    Returns:
        Integer array holding, for each state, the action with the highest
        one-step lookahead value (ties go to the first action listed).
    """
    if transitions is None:
        transitions = transition_dict
    n_states = len(transitions)
    policy = np.zeros(n_states, dtype=int)
    for state in range(n_states):
        # Transitions flagged `done` do not bootstrap from the next state.
        q = {
            action: sum(
                p * (r + gamma * values[ns] * (not done))
                for p, ns, r, done in outcomes
            )
            for action, outcomes in transitions[state].items()
        }
        policy[state] = max(q, key=q.get)
    return policy







