Experiment 3 - Balancing the pole
Contents
import pathlib
from typing import List, Tuple, Dict
import gym
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import HTML
from lcs.agents.acs2 import Configuration, ACS2
from lcs.metrics import population_metrics
from lcs.strategies.action_selection import EpsilonGreedy, ActionDelay, KnowledgeArray
from myst_nb import glue
from tabulate import tabulate
from src.bayes_estimation import bayes_estimate
from src.commons import NUM_EXPERIMENTS
from src.decorators import repeat, get_from_cache_or_run
from src.metrics import parse_experiments_results
from src.utils import build_plots_dir_path, build_cache_dir_path
from src.visualization import biased_exploration_colors, PLOT_DPI
# Line colours used by plot_cp, looked up by strategy key ('eg', 'ad', 'ka', 'oiq')
COLORS = biased_exploration_colors()

# Figures are built and saved explicitly, so interactive plotting is switched off
plt.ioff()

# Working directory of the notebook and the directory three levels above it
cwd_dir = pathlib.Path.cwd()
root_dir = cwd_dir.parent.parent.parent

# Output locations, each ending with the name of the working directory
plot_dir = build_plots_dir_path(root_dir) / cwd_dir.name
cache_dir = build_cache_dir_path(root_dir) / cwd_dir.name
def run_experiment(env_provider, explore_trials, exploit_trials, **conf):
    """Run one explore-then-exploit ACS2 experiment and return its parsed metrics.

    Args:
        env_provider: zero-argument callable returning the environment to use.
        explore_trials: number of trials passed to ``explore``.
        exploit_trials: number of trials passed to ``exploit``.
        **conf: keyword arguments forwarded verbatim to ``Configuration``.

    Returns:
        The result of ``parse_experiments_results`` for both metric sets
        (a DataFrame, according to the callers in this file).
    """
    env = env_provider()
    env.reset()

    cfg = Configuration(**conf)

    # Learning phase
    explorer = ACS2(cfg)
    metrics_explore = explorer.explore(env, explore_trials)

    # Exploitation phase: run on a dedicated agent seeded with the learned
    # population. Previously this agent was created but the call went to
    # ``explorer`` instead, leaving ``exploiter`` unused.
    exploiter = ACS2(cfg, explorer.population)
    metrics_exploit = exploiter.exploit(env, exploit_trials)

    # Parse results into DataFrame
    return parse_experiments_results(metrics_explore, metrics_exploit, cfg.metrics_trial_frequency)
def average_experiment_runs(runs_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average the metrics of several runs per (trial, phase) pair.

    The frames are stacked, grouped by ``trial`` and ``phase`` and averaged;
    ``phase`` is then moved back from the index into a regular column, so the
    result is indexed by ``trial`` only.
    """
    stacked = pd.concat(runs_dfs)
    per_trial_mean = stacked.groupby(['trial', 'phase']).mean()
    return per_trial_mean.reset_index(level='phase')
def plot_cp(epsilon_greedy_df, action_delay_df, knowledge_array_df, op_initial_df, explore_trials, buckets, plot_filename=None):
    """Draw the two-panel CartPole figure comparing four exploration strategies.

    The upper panel shows the moving average of ``steps_in_trial`` and the
    lower panel the moving average of ``reliable`` for each data frame. Both
    panels get a dashed red vertical line at ``explore_trials``; the upper one
    also gets a dashed black horizontal line at 195 steps.

    Args:
        epsilon_greedy_df: metrics plotted as 'Epsilon Greedy'.
        action_delay_df: metrics plotted as 'Action Delay'.
        knowledge_array_df: metrics plotted as 'Knowledge Array'.
        op_initial_df: metrics plotted as 'Optimistic Initial Quality'.
        explore_trials: x position of the vertical marker line.
        buckets: discretization shown in the figure title.
        plot_filename: when truthy, the figure is also saved to this path.

    Returns:
        The created figure.
    """
    ma_window = 5  # moving average window

    # (data frame, legend label, key into COLORS) in drawing order
    strategies = (
        (epsilon_greedy_df, 'Epsilon Greedy', 'eg'),
        (action_delay_df, 'Action Delay', 'ad'),
        (knowledge_array_df, 'Knowledge Array', 'ka'),
        (op_initial_df, 'Optimistic Initial Quality', 'oiq'),
    )

    def draw_smoothed(column, ax):
        # One moving-average line per strategy for the requested metric
        for df, label, color_key in strategies:
            df[column].rolling(window=ma_window).mean().plot(label=label, c=COLORS[color_key], ax=ax)

    fig = plt.figure(figsize=(14, 10))

    # Plots layout: two stacked panels
    grid = fig.add_gridspec(2, 1, hspace=.4)
    ax1 = fig.add_subplot(grid[0])
    ax2 = fig.add_subplot(grid[1])

    # Global title
    fig.suptitle(f'Performance of CartPole environment discretized with {buckets} buckets', fontsize=24)

    # Steps in trial
    draw_smoothed('steps_in_trial', ax1)
    ax1.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
    ax1.axhline(y=195, color='black', linewidth=1, linestyle="--")
    ax1.set_xlabel('Trial')
    ax1.set_ylabel('Steps')
    ax1.set_title('Steps in each trial')
    ax1.set_ylim(0, 200)

    # Population
    draw_smoothed('reliable', ax2)
    ax2.axvline(x=explore_trials, color='red', linewidth=1, linestyle="--")
    ax2.set_xlabel('Trial')
    ax2.set_ylabel('Classifiers')
    ax2.set_title('Reliable classifiers')

    # Single legend for the whole figure, taken from the lower panel
    handles, labels = ax2.get_legend_handles_labels()
    fig.legend(handles, labels, loc='lower center', ncol=4)

    if plot_filename:
        fig.savefig(plot_filename, dpi=PLOT_DPI)

    return fig
# settings
USE_RAY = True  # forwarded to the `repeat` decorator as `use_ray`
explore_trials, exploit_trials = 500, 500  # trials in the explore / exploit phase of every run
# Bucket configurations: number of bins per observation attribute
buckets_v1 = (1, 1, 6, 6)
buckets_v2 = (4, 4, 4, 4)
buckets_v3 = (2, 2, 6, 6)
buckets_v4 = (1, 2, 4, 4)
buckets_v5 = (1, 1, 8, 8)
# Expose the trial counts to the book text
glue('41_e3_explore_trials', explore_trials, display=False)
glue('41_e3_exploit_trials', exploit_trials, display=False)
Experiment 3 - Balancing the pole¶
The challenging part about the Cart Pole problem is that attributes from the perception vector are described with different scales. Moreover, two of them range to infinity. This situation might occur when applying the ALCS agent to the real-world domain.
Splitting each attribute into a fixed number of buckets is infeasible when its range is unbounded. The proposed solution assigns the maximum experienced values as limits for both the cart velocity \(\sigma_1\) and the pole velocity \(\sigma_3\). In this case:
cart velocity \(\sigma_1 \in [-0.5, 0.5]\),
pole velocity at tip \(\sigma_3 \in [-3500, 3500]\)
Additionally, a specific discretizer was used to divide each attribute into a predefined number of bins. Because this procedure requires extra precautions when performing the cross-over operation, cross-over was disabled.
The experiment analyzes both the impact of selecting the granularity of the discretization scheme and the biased exploration technique. The ACS2 agent first executes 500 explore trials using a specific method and then tries to use the gained knowledge by selecting the best action in a further 500 exploit trials.
Five arbitrarily chosen discretization schemes, each defining the number of bins per attribute, are listed below:
- 1, 1, 6, 6
- 4, 4, 4, 4
- 2, 2, 6, 6
- 1, 2, 4, 4
- 1, 1, 8, 8
The metrics of reliable population size and actual performance were both depicted in Figure 4.6 and estimated probabilistically for the above-mentioned schemes.
class CartPoleObservationWrapper(gym.ObservationWrapper):
    """Discretize every observation attribute into a fixed number of buckets.

    Each attribute is linearly rescaled from ``[low, high]`` to a bucket index
    in ``[0, buckets[i] - 1]``; values outside the range are clamped to the
    first or last bucket. Attributes 0 and 2 take their bounds from the wrapped
    environment's observation space, attributes 1 and 3 use the fixed limits
    +/-0.5 and +/-3500.
    """
    # https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947
    # Alternative bounds (not used):
    # _high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], math.radians(50)]
    # _low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -math.radians(50)]

    def __init__(self, env, buckets):
        """Store the per-attribute bounds and bucket counts.

        Args:
            env: environment to wrap; its ``observation_space`` supplies the
                bounds of attributes 0 and 2.
            buckets: number of buckets for each attribute, indexed like the
                observation.
        """
        super().__init__(env)
        self._high = [env.observation_space.high[0], 0.5, env.observation_space.high[2], 3500]
        self._low = [env.observation_space.low[0], -0.5, env.observation_space.low[2], -3500]
        self._buckets = buckets

    def observation(self, obs):
        """Return the bucket index of every attribute of ``obs`` as a string."""
        discretized = []
        for i, value in enumerate(obs):
            low, high, n_buckets = self._low[i], self._high[i], self._buckets[i]
            # Position of the value inside [low, high]. Subtracting ``low``
            # (rather than adding ``abs(low)``) stays correct for positive
            # lower bounds and is identical for the non-positive ones used here.
            ratio = (value - low) / (high - low)
            bucket = int(round((n_buckets - 1) * ratio))
            # Clamp out-of-range values into the first/last bucket
            bucket = min(n_buckets - 1, max(0, bucket))
            discretized.append(str(bucket))
        return discretized
def cp_env_provider(buckets: Tuple[int, ...]):
    """Create a 'CartPole-v0' environment wrapped with the bucket discretizer.

    Args:
        buckets: number of buckets per observation attribute (the
            configurations in this file all provide four values).

    Returns:
        A ``CartPoleObservationWrapper`` around a new ``gym.make('CartPole-v0')``.
    """
    return CartPoleObservationWrapper(gym.make('CartPole-v0'), buckets)
def cp_metrics(agent, env):
    """Return a new dict holding the population metrics of ``agent`` for ``env``."""
    return dict(population_metrics(agent.population, env))
# ACS2 settings shared by every CartPole run in this file; each run merges in
# its own `action_selector` (and, for some, `biased_exploration_prob`).
cp_base_params = {
    "classifier_length": 4,  # presumably one symbol per observation attribute - confirm
    "number_of_possible_actions": 2,
    "epsilon": 0.9,  # \epsilon in the parameter list of the Results section
    "beta": 0.01,  # \beta
    "gamma": 0.995,  # \gamma
    "initial_q": 0.5,
    "theta_exp": 50,  # \theta_{exp}
    "theta_ga": 50,  # \theta_{GA}
    "do_ga": True,
    "chi": 0.0,  # \chi; the text states that cross-over was disabled
    "mu": 0.03,
    "metrics_trial_frequency": 1,  # also passed to parse_experiments_results by run_experiment
    "user_metrics_collector_fcn": cp_metrics
}
def buckets_to_str(buckets, delimiter='_'):
    """Join the bucket counts into one string separated by ``delimiter``."""
    return delimiter.join(str(count) for count in buckets)
def run_cart_pole_biased_exploration(buckets):
    """Run one experiment per exploration strategy for a given discretization.

    Every run uses ``cp_base_params`` merged with strategy-specific overrides
    and the module-level ``explore_trials`` / ``exploit_trials``.

    Args:
        buckets: bucket counts handed to ``cp_env_provider``.

    Returns:
        Tuple ``(eg, ad, ka, oiq)`` with the ``run_experiment`` result of each run.
    """
    def env_provider():
        return cp_env_provider(buckets)

    def run_with(overrides):
        params = cp_base_params | overrides
        return run_experiment(env_provider, explore_trials, exploit_trials, **params)

    eg = run_with({'action_selector': EpsilonGreedy})
    ad = run_with({'action_selector': ActionDelay, 'biased_exploration_prob': 0.5})
    ka = run_with({'action_selector': KnowledgeArray, 'biased_exploration_prob': 0.5})
    # NOTE(review): the 'oiq' run differs from 'eg' only by
    # biased_exploration_prob=0.8 - confirm this is the intended way to enable
    # Optimistic Initial Quality.
    oiq = run_with({'action_selector': EpsilonGreedy, 'biased_exploration_prob': 0.8})

    return eg, ad, ka, oiq
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v1)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v1():
    """Run the four-strategy comparison for the ``buckets_v1`` discretization.

    NOTE(review): presumably ``repeat`` executes this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` stores/loads the collected runs at ``cache_path``
    - confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v1)
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v2)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v2():
    """Run the four-strategy comparison for the ``buckets_v2`` discretization.

    NOTE(review): presumably ``repeat`` executes this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` stores/loads the collected runs at ``cache_path``
    - confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v2)
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v3)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v3():
    """Run the four-strategy comparison for the ``buckets_v3`` discretization.

    NOTE(review): presumably ``repeat`` executes this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` stores/loads the collected runs at ``cache_path``
    - confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v3)
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v4)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v4():
    """Run the four-strategy comparison for the ``buckets_v4`` discretization.

    NOTE(review): presumably ``repeat`` executes this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` stores/loads the collected runs at ``cache_path``
    - confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v4)
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/{buckets_to_str(buckets_v5)}.dill')
@repeat(num_times=NUM_EXPERIMENTS, use_ray=USE_RAY)
def cp_buckets_v5():
    """Run the four-strategy comparison for the ``buckets_v5`` discretization.

    NOTE(review): presumably ``repeat`` executes this NUM_EXPERIMENTS times and
    ``get_from_cache_or_run`` stores/loads the collected runs at ``cache_path``
    - confirm in src.decorators.
    """
    return run_cart_pole_biased_exploration(buckets_v5)
def extract(experiment_runs):
    """Regroup per-run 4-tuples into four per-strategy lists.

    Args:
        experiment_runs: iterable of ``(eg, ad, ka, oiq)`` tuples, one per run.

    Returns:
        Tuple of four lists ``(eg_dfs, ad_dfs, ka_dfs, oiq_dfs)``, each holding
        the corresponding element of every run in the original order.
    """
    per_strategy = ([], [], [], [])
    for eg_df, ad_df, ka_df, oiq_df in experiment_runs:
        for bucket_list, run_df in zip(per_strategy, (eg_df, ad_df, ka_df, oiq_df)):
            bucket_list.append(run_df)
    return per_strategy
# Run the calculations: one call per bucket configuration, each regrouped into
# four per-strategy lists (epsilon greedy, action delay, knowledge array,
# optimistic initial quality).
cp_bv1_eg_dfs, cp_bv1_ad_dfs, cp_bv1_ka_dfs, cp_bv1_oiq_dfs = extract(cp_buckets_v1())
cp_bv2_eg_dfs, cp_bv2_ad_dfs, cp_bv2_ka_dfs, cp_bv2_oiq_dfs = extract(cp_buckets_v2())
cp_bv3_eg_dfs, cp_bv3_ad_dfs, cp_bv3_ka_dfs, cp_bv3_oiq_dfs = extract(cp_buckets_v3())
cp_bv4_eg_dfs, cp_bv4_ad_dfs, cp_bv4_ka_dfs, cp_bv4_oiq_dfs = extract(cp_buckets_v4())
cp_bv5_eg_dfs, cp_bv5_ad_dfs, cp_bv5_ka_dfs, cp_bv5_oiq_dfs = extract(cp_buckets_v5())
# Plot visualization: only the buckets_v1 results are plotted, averaged over the runs
glue('41-e3-cartpole-fig',
     plot_cp(
         average_experiment_runs(cp_bv1_eg_dfs),
         average_experiment_runs(cp_bv1_ad_dfs),
         average_experiment_runs(cp_bv1_ka_dfs),
         average_experiment_runs(cp_bv1_oiq_dfs),
         explore_trials=explore_trials,
         buckets=buckets_v1,
         plot_filename=f'{plot_dir}/cartpole-performance.png'),
     display=False)
Results¶
ACS2 parameters
\(\beta=0.01\), \(\gamma = 0.995\), \(\theta_r = 0.9\), \(\theta_i=0.1\), \(\epsilon = 0.9\), \(\theta_{GA} = 50\), \(\theta_{AS}=20\), \(\theta_{exp}=50\), \(\mu=0.03\), \(u_{max}=4\), \(\chi=0.0\).
Statistical verification¶
To statistically assess the population size, the posterior data distribution was modelled using 50 metric values collected in the last trial and then sampled with 100,000 draws. For the obtained reward, the average value from exploit trials is considered a representative state of algorithm performance.
# Results keyed by bucket configuration; every value lists the per-strategy run
# collections in the fixed order: epsilon greedy, action delay, knowledge
# array, optimistic initial quality.
experiments_data = {
    buckets_v1: [cp_bv1_eg_dfs, cp_bv1_ad_dfs, cp_bv1_ka_dfs, cp_bv1_oiq_dfs],
    buckets_v2: [cp_bv2_eg_dfs, cp_bv2_ad_dfs, cp_bv2_ka_dfs, cp_bv2_oiq_dfs],
    buckets_v3: [cp_bv3_eg_dfs, cp_bv3_ad_dfs, cp_bv3_ka_dfs, cp_bv3_oiq_dfs],
    buckets_v4: [cp_bv4_eg_dfs, cp_bv4_ad_dfs, cp_bv4_ka_dfs, cp_bv4_oiq_dfs],
    buckets_v5: [cp_bv5_eg_dfs, cp_bv5_ad_dfs, cp_bv5_ka_dfs, cp_bv5_oiq_dfs]
}
def train_bayes_model(dfs, query_condition, field):
    """Run the Bayesian estimation on one metric gathered from several runs.

    Args:
        dfs: data frames to stack.
        query_condition: ``DataFrame.query`` expression selecting the rows.
        field: column whose values are passed to ``bayes_estimate``.

    Returns:
        Tuple with the ``'mu'`` and ``'std'`` entries of the estimate.
    """
    stacked = pd.concat(dfs)
    samples = stacked.query(query_condition)[field].to_numpy()
    model = bayes_estimate(samples)
    return model['mu'], model['std']
def build_models(dfs: Dict, field: str, query_condition: str):
    """Train one Bayesian model per strategy for every bucket configuration.

    Args:
        dfs: mapping of bucket configuration to a list of per-strategy run
            collections.
        field: column passed on to ``train_bayes_model``.
        query_condition: row filter passed on to ``train_bayes_model``.

    Returns:
        Dict with the same keys, each mapped to the list of
        ``train_bayes_model`` results in the original strategy order.
    """
    return {
        bucket: [train_bayes_model(strategy_runs, query_condition, field) for strategy_runs in per_strategy]
        for bucket, per_strategy in dfs.items()
    }
def print_bayes_table(data):
    """Render per-bucket, per-strategy values as an HTML table.

    Args:
        data: mapping of bucket configuration to a list with one value per
            strategy, in the order of the column headers below.

    Returns:
        ``HTML`` object wrapping the table produced by ``tabulate``.
    """
    rows = []
    for bucket, rewards in data.items():
        # First column: the bucket configuration written as "a,b,c,d"
        rows.append([buckets_to_str(bucket, ',')] + rewards)

    column_names = ['', 'Epsilon Greedy', 'Action Delay', 'Knowledge Array', 'Optimistic Initial Quality']
    rendered = tabulate(rows, headers=column_names, tablefmt="html", stralign='right', floatfmt=".2f")
    return HTML(rendered)
def print_row(r):
    """Format the first element of ``r`` as 'mean ± std', rounded to two decimals.

    Only ``r[0]`` is used; it must provide ``mean()`` and ``std()``.
    """
    samples = r[0]
    center = round(samples.mean(), 2)
    spread = round(samples.std(), 2)
    return f'{center} ± {spread}'
# Average Steps in exploit phase
def avg_reward(dfs):
    """Return the mean ``steps_in_trial`` over the exploit-phase rows of all frames."""
    stacked = pd.concat(dfs)
    exploit_rows = stacked.query('phase == "exploit"')
    return exploit_rows['steps_in_trial'].mean()
# Average exploit-phase steps per bucket configuration, one value per strategy
average_rewards_data = {}
for bucket, dfs in experiments_data.items():
    average_rewards_data[bucket] = [avg_reward(strategy_runs) for strategy_runs in dfs]
# reliable classifiers
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/bayes/reliable.dill')
def build_reliable_models(dfs: Dict):
    """Build the models of the ``reliable`` metric at the last explore trial.

    Rows are selected with ``trial == explore_trials - 1``.
    """
    last_explore_trial = explore_trials - 1
    return build_models(dfs, field='reliable', query_condition=f'trial == {last_explore_trial}')
# run computations
reliable_data = build_reliable_models(experiments_data)

# One formatted "mean ± std" cell per strategy for every bucket configuration
reliable_table_data = {}
for bucket, models in reliable_data.items():
    reliable_table_data[bucket] = [print_row(model) for model in models]

# Add glue objects
glue('average_steps', print_bayes_table(average_rewards_data), display=False)
glue('bayes_reliable_classifies', print_bayes_table(reliable_table_data), display=False)
| Buckets | Epsilon Greedy | Action Delay | Knowledge Array | Optimistic Initial Quality |
|---|---|---|---|---|
| 1,1,6,6 | 178.40 | 138.69 | 175.72 | 171.20 |
| 4,4,4,4 | 18.85 | 19.14 | 20.34 | 18.56 |
| 2,2,6,6 | 59.73 | 44.58 | 95.68 | 60.72 |
| 1,2,4,4 | 133.93 | 150.62 | 128.70 | 132.36 |
| 1,1,8,8 | 181.61 | 154.09 | 172.75 | 176.42 |
| Buckets | Epsilon Greedy | Action Delay | Knowledge Array | Optimistic Initial Quality |
|---|---|---|---|---|
| 1,1,6,6 | 8.0 ± 0.0 | 9.39 ± 0.18 | 9.01 ± 0.23 | 8.0 ± 0.0 |
| 4,4,4,4 | 6.33 ± 0.26 | 4.38 ± 0.19 | 5.65 ± 0.23 | 6.06 ± 0.23 |
| 2,2,6,6 | 6.99 ± 0.25 | 7.29 ± 0.25 | 8.27 ± 0.33 | 7.48 ± 0.31 |
| 1,2,4,4 | 11.23 ± 0.18 | 9.83 ± 0.18 | 10.01 ± 0.18 | 10.86 ± 0.22 |
| 1,1,8,8 | 9.14 ± 0.12 | 8.92 ± 0.12 | 10.42 ± 0.21 | 9.18 ± 0.14 |
Observations¶
Surprisingly, when using the discretization of 1, 1, 6, 6, the agent can keep the pole upright for about 175 steps in each trial after performing just 500 learning trials. This score was possible for every method except the AD. On the other hand, AD created more reliable classifiers more quickly than the other methods.
The experiment’s performance turned out to be very sensitive to the discretization bins chosen. For example, a slightly larger number of bins for pole angle and velocity (eight bins in both cases) increased the number of upright steps. In official terms, the environment is still not solved. However, it turned out that the number of reliable classifiers required to obtain such a score is less than 10. That allows a very compact and human-readable form of storing knowledge (see Table below for example).
@get_from_cache_or_run(cache_path=f'{cache_dir}/cart_pole/epsilon_greedy_single_run.dill')
def cp_single_run():
    """Train a single Epsilon-Greedy ACS2 agent on ``buckets_v1`` and return it.

    Only the explore phase is executed (``explore_trials`` trials); the caller
    is interested in the resulting population.
    """
    params = cp_base_params | {'action_selector': EpsilonGreedy}
    agent = ACS2(Configuration(**params))

    env = cp_env_provider(buckets_v1)
    agent.explore(env, explore_trials)

    return agent
# execute run
cp_agent = cp_single_run()

# Print the reliable classifiers, highest fitness first
reliable = [cl for cl in cp_agent.population if cl.is_reliable()]
for cl in sorted(reliable, key=lambda cl: cl.fitness, reverse=True):
    print(
        f'[{cl.condition} {cl.action} {cl.effect}]\t\tmark: {cl.mark}\tquality: {cl.q:.2f}\treward: {cl.r:.2f}\tnumerosity: {cl.num}')
[##23 0 ####] mark: 00## quality: 0.96 reward: 3.34 numerosity: 1
[##32 1 ####] mark: 00## quality: 0.96 reward: 3.24 numerosity: 1
[##22 1 ####] mark: 00## quality: 0.98 reward: 2.78 numerosity: 1
[##33 0 ####] mark: 00## quality: 0.95 reward: 2.23 numerosity: 3
[##12 0 ####] mark: 00## quality: 0.98 reward: 1.44 numerosity: 1
[##12 1 ####] mark: empty quality: 1.00 reward: 1.36 numerosity: 20
[##43 1 ####] mark: 00## quality: 0.97 reward: 1.32 numerosity: 6
[##43 0 ####] mark: empty quality: 1.00 reward: 1.22 numerosity: 20
It can be seen that the majority of reliable classifiers are marked on the first two attributes, meaning that they are too general and therefore should be made more distinguishable (for example, by increasing the discretization granularity). In order to set the bins properly, a dedicated hyper-parameter optimization process is advised.
Despite fragility, the obtained result is auspicious, showing that ALCS methods can be compared to other highly sophisticated black-box approaches and maintain a highly verbose problem model.
Software packages used
# Show the versions of the packages used in this session
import session_info
session_info.show()
Click to view session information
----- gym 0.21.0 lcs NA matplotlib 3.5.1 myst_nb 0.13.1 pandas 1.4.0 session_info 1.0.0 src (embedded book's utils module) tabulate 0.8.9 -----
Click to view modules imported as dependencies
PIL 8.4.0 arviz 0.11.2 asttokens NA attr 21.4.0 babel 2.9.1 backcall 0.2.0 beta_ufunc NA binom_ufunc NA brotli NA cachetools 5.0.0 certifi 2021.10.08 cffi 1.15.0 cftime 1.5.2 charset_normalizer 2.0.10 click 7.1.2 cloudpickle 2.0.0 colorama 0.4.4 colorful 0.5.4 colorful_orig 0.5.4 cryptography 36.0.1 cycler 0.10.0 cython_runtime NA databricks_cli NA dateutil 2.8.2 debugpy 1.5.1 decorator 5.1.1 defusedxml 0.7.1 dill 0.3.4 docutils 0.16 entrypoints 0.3 executing 0.8.2 fastprogress 0.2.7 filelock 3.4.2 google NA greenlet 1.1.2 grpc 1.43.0 hiredis 2.0.0 idna 3.3 imagesize NA importlib_metadata NA ipykernel 6.7.0 ipython_genutils 0.2.0 ipywidgets 7.6.5 jedi 0.18.1 jinja2 3.0.3 jsonschema 3.2.0 jupyter_cache 0.4.3 jupyter_sphinx 0.3.2 jupyterlab_pygments 0.1.2 kiwisolver 1.3.2 linkify_it 1.0.3 markdown_it 1.1.0 markupsafe 2.0.1 matplotlib_inline NA mdit_py_plugins 0.2.8 mistune 0.8.4 mlflow 1.23.1 mpl_toolkits NA msgpack 1.0.3 myst_parser 0.15.2 nbclient 0.5.10 nbconvert 6.4.1 nbformat 5.1.3 nbinom_ufunc NA netCDF4 1.5.8 numpy 1.22.1 packaging 21.3 pandocfilters NA parso 0.8.3 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prompt_toolkit 3.0.26 psutil 5.9.0 ptyprocess 0.7.0 pure_eval 0.2.2 pvectorc NA pydev_ipython NA pydevconsole NA pydevd 2.6.0 pydevd_concurrency_analyser NA pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pygments 2.11.2 pylab NA pymc3 3.11.4 pyparsing 3.0.7 pyrsistent NA pytz 2021.3 ray 1.9.2 redis 4.1.2 requests 2.27.1 scipy 1.7.3 semver 2.13.0 setproctitle 1.2.2 setuptools 60.5.0 six 1.16.0 socks 1.7.1 sphinx 4.4.0 sphinxcontrib NA sqlalchemy 1.4.31 stack_data 0.1.4 testpath 0.5.0 theano 1.1.2 tornado 6.1 tqdm 4.62.3 traitlets 5.1.1 typing_extensions NA uc_micro 1.0.1 unicodedata2 NA urllib3 1.26.8 wcwidth 0.2.5 xarray 0.21.0 yaml 6.0 zipp NA zmq 22.3.0
----- IPython 8.0.1 jupyter_client 7.1.2 jupyter_core 4.9.1 notebook 6.4.8 ----- Python 3.9.10 | packaged by conda-forge | (main, Feb 1 2022, 21:24:11) [GCC 9.4.0] Linux-5.13.0-30-generic-x86_64-with-glibc2.31 ----- Session information updated at 2022-02-24 17:06