{
"cells": [
{
"cell_type": "markdown",
"id": "57963548",
"metadata": {},
"source": [
"# Improved PPO Training for Maglev Pod\n",
"\n",
"## Major Changes to Enable Learning:\n",
"\n",
"### 1. **Reward Function (lev_pod_env.py)**\n",
"**Problem**: Squared penalties created rewards of -7000 to -8200, making learning impossible\n",
"- `(gap_error * 100)²` could reach 10,000+ for small errors\n",
"- +1.0 survival bonus was meaningless compared to penalties\n",
"\n",
"**Solution**: Exponential reward shaping with reasonable scales\n",
"- Gap reward: `exp(-0.5 * (error/3mm)²)` → smooth 0 to 1.0 range\n",
"- Small linear penalties for orientation (~0.02/degree)\n",
"- Success bonus: +2.0 for excellent hovering (gap < 1mm, angles < 2°)\n",
"- **New reward range: -10 to +3 per step** (was -8200 to +1 total)\n",
"\n",
"### 2. **Network Architecture**\n",
"**Changes**:\n",
"- Increased hidden units: 128 → 256\n",
"- Added LayerNorm for training stability\n",
"- Deeper shared layers (3 layers instead of 2)\n",
"- Better initialization for exploration\n",
"\n",
"### 3. **Training Hyperparameters**\n",
"**Changes**:\n",
"- Policy LR: 3e-4 → 5e-4 (faster learning)\n",
"- Value LR: 3e-4 → 1e-3 (even faster value updates)\n",
"- Entropy coefficient: 0.01 → 0.02 (more exploration)\n",
"- Added gradient clipping (max norm 0.5)\n",
"- GAE lambda: 0.97 → 0.95 (less biased advantage estimates)\n",
"- Episodes: 1000 → 2000\n",
"\n",
"### 4. **Termination Conditions**\n",
"**Tightened for safety**:\n",
"- Gap bounds: 2-40mm → 3-35mm\n",
"- Angle tolerance: 20° → 15°\n",
"- Failure penalty: -50 → -10 (scaled with new rewards)\n",
"\n",
"## Expected Behavior:\n",
"- **Rewards should be positive or mildly negative** during good episodes\n",
"- **Average gap should move** from the initial 14mm toward the 16.49mm target (i.e. gap error should steadily decrease)\n",
"- **Episodes that reach 500 steps** indicate successful hovering\n",
"- **Look for improvement over first 500 episodes**, then fine-tuning after"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f28b2866",
"metadata": {},
"outputs": [],
"source": [
"import gymnasium as gym\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import torch\n",
"from torch import nn\n",
"from torch import optim\n",
"from torch.distributions import Normal\n",
"from lev_pod_env import LevPodEnv"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "c49c95b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Torch Version: 2.8.0\n",
"CUDA Available: True\n",
"Using device: cuda\n"
]
}
],
"source": [
"print(\"Torch Version:\", torch.__version__)\n",
"print(\"CUDA Available:\", torch.cuda.is_available())\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"Using device:\", device)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "70bb54d4",
"metadata": {},
"outputs": [],
"source": [
"class ActorCriticNetwork(nn.Module):\n",
"    \"\"\"Shared-trunk actor-critic for continuous control.\n",
"\n",
"    A three-layer trunk (LayerNorm after the first two linear layers)\n",
"    feeds both a tanh-bounded policy-mean head and a scalar value head.\n",
"    The action log-std is a state-independent learnable parameter.\n",
"    \"\"\"\n",
"    def __init__(self, state_dim, action_dim, hidden_dim=256):\n",
"        super().__init__()\n",
"        # Larger network with layer normalization for better learning\n",
"        self.shared_layers = nn.Sequential(\n",
"            nn.Linear(state_dim, hidden_dim),\n",
"            nn.LayerNorm(hidden_dim),\n",
"            nn.ReLU(),\n",
"            nn.Linear(hidden_dim, hidden_dim),\n",
"            nn.LayerNorm(hidden_dim),\n",
"            nn.ReLU(),\n",
"            nn.Linear(hidden_dim, hidden_dim // 2),\n",
"            nn.ReLU()\n",
"        )\n",
"        # Policy outputs mean and log_std for continuous actions\n",
"        self.policy_mean = nn.Sequential(\n",
"            nn.Linear(hidden_dim // 2, hidden_dim // 2),\n",
"            nn.ReLU(),\n",
"            nn.Linear(hidden_dim // 2, action_dim),\n",
"            nn.Tanh()  # Constrain to [-1, 1] range\n",
"        )\n",
"        # Initialize log_std to encourage exploration initially\n",
"        self.policy_log_std = nn.Parameter(torch.ones(action_dim) * -0.5)\n",
"\n",
"        self.value_layers = nn.Sequential(\n",
"            nn.Linear(hidden_dim // 2, hidden_dim // 2),\n",
"            nn.ReLU(),\n",
"            nn.Linear(hidden_dim // 2, 1),\n",
"        )\n",
"\n",
"    def value(self, observation):\n",
"        \"\"\"Return the state-value estimate V(s).\"\"\"\n",
"        features = self.shared_layers(observation)\n",
"        return self.value_layers(features)\n",
"\n",
"    def policy(self, observation):\n",
"        \"\"\"Return (mean, std) of the Gaussian action distribution.\"\"\"\n",
"        features = self.shared_layers(observation)\n",
"        action_mean = self.policy_mean(features)\n",
"        action_std = torch.exp(self.policy_log_std)\n",
"        return action_mean, action_std\n",
"\n",
"    def forward(self, state):\n",
"        \"\"\"Return (mean, std, value) from a single trunk pass.\"\"\"\n",
"        features = self.shared_layers(state)\n",
"        action_mean = self.policy_mean(features)\n",
"        action_std = torch.exp(self.policy_log_std)\n",
"        state_value = self.value_layers(features)\n",
"        return action_mean, action_std, state_value"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2ed37deb",
"metadata": {},
"outputs": [],
"source": [
"class PPOTrainer():\n",
"    \"\"\"Proximal Policy Optimization updater.\n",
"\n",
"    Runs clipped-surrogate policy updates (early-stopped on a KL estimate)\n",
"    and MSE value-function updates with two separate Adam optimizers.\n",
"    Note: the shared trunk of the actor-critic is updated by BOTH\n",
"    optimizers, so effective trunk step size is larger than either LR.\n",
"    \"\"\"\n",
"    def __init__(self, actor_critic, ppo_clip_val=0.2, target_kl_div=0.02,\n",
"                 max_policy_train_iters=40, value_train_iters=40, policy_lr=5e-4, value_lr=1e-3, \n",
"                 entropy_coef=0.02):\n",
"        self.ac = actor_critic\n",
"        self.ppo_clip_val = ppo_clip_val\n",
"        self.target_kl_div = target_kl_div\n",
"        self.max_policy_train_iters = max_policy_train_iters\n",
"        self.value_train_iters = value_train_iters\n",
"        self.entropy_coef = entropy_coef\n",
"\n",
"        # Policy optimizer trains the shared trunk, the mean head and log_std.\n",
"        policy_params = list(self.ac.shared_layers.parameters()) + \\\n",
"                        list(self.ac.policy_mean.parameters()) + \\\n",
"                        [self.ac.policy_log_std]\n",
"        self.policy_optimizer = optim.Adam(policy_params, lr=policy_lr)\n",
"\n",
"        value_params = list(self.ac.shared_layers.parameters()) + \\\n",
"                       list(self.ac.value_layers.parameters())\n",
"        self.value_optimizer = optim.Adam(value_params, lr=value_lr)\n",
"\n",
"    def train_policy(self, obs, acts, old_log_probs, gaes):\n",
"        \"\"\"Run up to max_policy_train_iters clipped-surrogate updates,\n",
"        stopping early once the approximate KL exceeds target_kl_div.\"\"\"\n",
"        for _ in range(self.max_policy_train_iters):\n",
"            self.policy_optimizer.zero_grad()\n",
"\n",
"            new_mean, new_std = self.ac.policy(obs)\n",
"            new_dist = Normal(new_mean, new_std)\n",
"            new_log_probs = new_dist.log_prob(acts).sum(dim=-1)\n",
"\n",
"            policy_ratio = torch.exp(new_log_probs - old_log_probs)\n",
"            clipped_ratio = policy_ratio.clamp(1 - self.ppo_clip_val, 1 + self.ppo_clip_val)\n",
"\n",
"            clipped_loss = clipped_ratio * gaes\n",
"            unclipped_loss = policy_ratio * gaes\n",
"\n",
"            # Pessimistic (min) surrogate, negated for gradient descent.\n",
"            policy_loss = -torch.min(clipped_loss, unclipped_loss).mean()\n",
"\n",
"            # Increased entropy bonus to encourage more exploration\n",
"            entropy = new_dist.entropy().mean()\n",
"            policy_loss = policy_loss - self.entropy_coef * entropy\n",
"\n",
"            policy_loss.backward()\n",
"            # Gradient clipping for stability\n",
"            torch.nn.utils.clip_grad_norm_(self.policy_optimizer.param_groups[0]['params'], 0.5)\n",
"            self.policy_optimizer.step()\n",
"\n",
"            # Simple KL estimate; early-stop if the policy moved too far.\n",
"            kl_div = (old_log_probs - new_log_probs).mean()\n",
"            if kl_div > self.target_kl_div:\n",
"                break\n",
"\n",
"    def train_value(self, obs, returns):\n",
"        \"\"\"Fit the value head to the discounted returns with an MSE loss.\"\"\"\n",
"        for _ in range(self.value_train_iters):\n",
"            self.value_optimizer.zero_grad()\n",
"\n",
"            # BUG FIX: value() outputs shape (N, 1) while `returns` is (N,).\n",
"            # `(returns - values)` then broadcast to an (N, N) matrix, so the\n",
"            # loss was a wrong outer-difference MSE. Squeeze the trailing\n",
"            # dimension so the error is computed element-wise.\n",
"            values = self.ac.value(obs).squeeze(-1)\n",
"            value_loss = (returns - values).pow(2).mean()\n",
"\n",
"            value_loss.backward()\n",
"            # Gradient clipping for stability\n",
"            torch.nn.utils.clip_grad_norm_(self.value_optimizer.param_groups[0]['params'], 0.5)\n",
"            self.value_optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6f4f9f4b",
"metadata": {},
"outputs": [],
"source": [
"def discount_rewards(rewards, gamma=0.99):\n",
"    \"\"\"Discounted reward-to-go at every timestep (terminal value 0).\"\"\"\n",
"    discounted = np.zeros_like(rewards, dtype=np.float32)\n",
"    running = 0\n",
"    for t in range(len(rewards) - 1, -1, -1):\n",
"        running = rewards[t] + gamma * running\n",
"        discounted[t] = running\n",
"    return discounted\n",
"\n",
"def calculate_gaes(rewards, values, gamma=0.99, lam=0.95):\n",
"    \"\"\"Generalized Advantage Estimation over one finished episode.\"\"\"\n",
"    # Bootstrap value of 0 for the terminal state\n",
"    next_values = np.append(values[1:], 0)\n",
"    # GAE is a discounted sum of TD errors with factor gamma * lam\n",
"    deltas = rewards + gamma * next_values - values\n",
"    return discount_rewards(deltas, gamma * lam)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "d7b17705",
"metadata": {},
"outputs": [],
"source": [
"def rollout(model, env, max_steps=500):\n",
"    \"\"\"Collect one episode of experience with the current policy.\n",
"\n",
"    Returns (train_data, ep_reward, avg_gap_error) where train_data is\n",
"    [obs, actions, rewards, GAE advantages, log_probs] as float32 arrays.\n",
"    \"\"\"\n",
"    train_data = [[],[],[],[],[]] # obs, actions, rewards, values, log_probs\n",
"    gap_heights = [] # Track gap heights during episode\n",
"    obs, _ = env.reset() # Gymnasium returns (obs, info)\n",
"\n",
"    ep_reward = 0\n",
"    for _ in range(max_steps):\n",
"        with torch.no_grad(): # No gradients needed during rollout\n",
"            # FIX: build one ndarray first — torch.tensor on a list of\n",
"            # numpy arrays is extremely slow (see the UserWarning PyTorch\n",
"            # emitted during the first rollout).\n",
"            obs_t = torch.as_tensor(np.asarray(obs, dtype=np.float32), device=device).unsqueeze(0)\n",
"            mean, std, val = model(obs_t)\n",
"\n",
"        # Sample continuous action from Normal distribution\n",
"        act_distribution = Normal(mean, std)\n",
"        act = act_distribution.sample()\n",
"        act_log_prob = act_distribution.log_prob(act).sum(dim=-1)\n",
"\n",
"        # Convert to numpy array for environment\n",
"        act_np = act.squeeze(0).cpu().numpy()\n",
"        next_obs, reward, terminated, truncated, _ = env.step(act_np)\n",
"\n",
"        # Extract gap heights from observation (first 4 values are normalized gaps)\n",
"        # Denormalize gaps: multiply by gap_scale (0.015m = 15mm)\n",
"        gap_heights.append(obs[:4] * env.gap_scale * 1000) # Convert to mm\n",
"\n",
"        # Store as Python scalars (moving to CPU only when necessary)\n",
"        for i, item in enumerate([obs, act_np, reward, val.item(), act_log_prob.item()]):\n",
"            train_data[i].append(item)\n",
"\n",
"        obs = next_obs\n",
"        ep_reward += reward\n",
"        done = terminated or truncated\n",
"\n",
"        if done:\n",
"            break\n",
"\n",
"    train_data = [np.array(x, dtype=np.float32) for x in train_data]\n",
"    # Slot 3 held raw value estimates; replace them with GAE advantages.\n",
"    train_data[3] = calculate_gaes(rewards=train_data[2], values=train_data[3])\n",
"\n",
"    # Calculate average gap height error\n",
"    gap_heights = np.array(gap_heights) # Shape: (steps, 4 sensors)\n",
"    avg_gap_per_step = gap_heights.mean(axis=1) # Average across 4 sensors\n",
"    target_gap_mm = 16.491741\n",
"    avg_gap_error = np.abs(avg_gap_per_step - target_gap_mm).mean()\n",
"\n",
"    return train_data, ep_reward, avg_gap_error"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7ee8fb34",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading maglev model from maglev_model.pkl...\n",
"Model loaded. Degree: 6\n",
"Force R2: 1.0000\n",
"Torque R2: 0.9999\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\pulip\\AppData\\Local\\Temp\\ipykernel_32220\\3744901434.py:9: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at C:\\bld\\libtorch_1762089177580\\work\\torch\\csrc\\utils\\tensor_new.cpp:256.)\n",
" mean, std, val = model(torch.tensor([obs], dtype=torch.float32, device=device))\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reward: 1990.308654679309\n",
"Episode length: 500\n",
"Reward per step: 3.980617309358618\n",
"Average gap error: 19.005 mm\n",
"\n",
"Theoretical Maximum:\n",
" Best per step: 3.00\n",
" Realistic good per step: 0.80\n",
" Best total (500 steps): 1500\n",
" Current % of realistic: 497.6%\n"
]
}
],
"source": [
"environ = LevPodEnv(use_gui=False, initial_gap_mm=14, max_steps=500) # Start below target\n",
"model = ActorCriticNetwork(environ.observation_space.shape[0], environ.action_space.shape[0]).to(device)\n",
"train_data, reward, gap_error = rollout(model, environ)\n",
"print(\"Reward:\", reward)\n",
"print(\"Episode length:\", len(train_data[0]))\n",
"print(\"Reward per step:\", reward / len(train_data[0]))\n",
"print(f\"Average gap error: {gap_error:.3f} mm\")\n",
"\n",
"# Calculate theoretical maximum reward for reference (NEW REWARD STRUCTURE)\n",
"max_steps = 500\n",
"# Best case: gap_reward=1.0, no penalties, success_bonus=2.0\n",
"theoretical_max_per_step = 3.0 # 1.0 (gap) + 2.0 (success bonus) + 0 (no penalties)\n",
"# Realistic good case: gap_reward~0.9, small penalties\n",
"realistic_good_per_step = 0.8\n",
"theoretical_max_total = theoretical_max_per_step * max_steps\n",
"realistic_good_total = realistic_good_per_step * max_steps\n",
"\n",
"print(f\"\\nTheoretical Maximum:\")\n",
"print(f\" Best per step: {theoretical_max_per_step:.2f}\")\n",
"print(f\" Realistic good per step: {realistic_good_per_step:.2f}\")\n",
"print(f\" Best total (500 steps): {theoretical_max_total:.0f}\")\n",
"print(f\" Current % of realistic: {(reward/realistic_good_total)*100:.1f}%\")"
]
},
{
"cell_type": "markdown",
"id": "1b8b1ca2",
"metadata": {},
"source": [
"## Key Improvements Made:\n",
"\n",
"1. **Better Reward Scaling**: Changed from squared penalties (up to 10,000+) to exponential rewards (~0 to 1) with smaller linear penalties\n",
"2. **Larger Network**: Increased from 128 to 256 hidden units with LayerNorm for better learning capacity\n",
"3. **More Exploration**: Increased entropy coefficient from 0.01 to 0.02, initialized log_std higher\n",
"4. **Gradient Clipping**: Added to prevent exploding gradients\n",
"5. **Higher Learning Rates**: Increased policy LR to 5e-4 and value LR to 1e-3\n",
"6. **Tighter Termination**: Reduced angle tolerance to 15° and gap bounds to 3-35mm\n",
"7. **Success Bonus**: Added +2.0 reward for excellent hovering (gap < 1mm, angles < 2°)\n",
"\n",
"Expected improvements:\n",
"- Rewards should now be in range [-10, +3] instead of [-8000, +1]\n",
"- Model should learn meaningful distinctions between good and bad states\n",
"- Training should show steady improvement in gap error over episodes"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e6f27ed4",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA90AAAJOCAYAAACqS2TfAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAAwm5JREFUeJzs3XdYFFfbBvB7C7B0kaYoUlQUu2KJFbtBYzQae+9GY2IvMbHkM9GoUaOxxMSoSTSJGjWJJXax9xILsYJYUBAUpMPufH/sy8jSZGGX2YX7d11zOTv1mXOWdZ89Z87IBEEQQEREREREREQGJ5c6ACIiIiIiIqLiikk3ERERERERkZEw6SYiIiIiIiIyEibdREREREREREbCpJuIiIiIiIjISJh0ExERERERERkJk24iIiIiIiIiI2HSTURERERERGQkTLqJiIiIiIiIjIRJNxFRCeHt7Q2ZTAaZTIY5c+ZIHY7JYjkVTxs2bBDrVSaTSR0OGVnLli3Fuh48eHChj5f5vbNhw4ZCH4+IShYm3UTF3NGjR3W+LCiVSoSEhOhsEx8fr7ONuSQamWPOmORyOWxtbVGlShUMGTIEly9fljpMs5Y1UcltMpf3TElJqKOjo/HJJ5+gdu3asLe3h6WlJdzd3VG9enW8//77+OKLLxAVFSV1mAYjVUKdObHLawoLCyuymEoCQyfUxjJnzpxs/z9ZWVnB2dkZ1apVQ7du3fD9998jISFB6lCJyMiUUgdAREVLrVbj008/xR9//CF1KEYhCAISExNx+/Zt3L59G5s2bcLOnTvRsWNHqUMjMzFz5kzExsYCAJo0aSJxNPoLCwtDs2bN8PjxY53lkZGRiIyMxM2bN/HHH38gMDAQrq6uEkVJZFwffPAB3nnnHQBAjRo1JI5GSxAEpKamIiYmBjExMQgJCcGOHTswY8YMrF+/Hp07d5Y6RCIyEibdRCXQ9u3bcf78eTRo0EDqUAymfv366NWrF5KTk3H69Gns2bMHAJCWloZPP/20WCfdycnJUCgUsLCwMPq5Ro8ejYoVK2Zbbo7JaW5GjBghdQiFMm3aNDHhtrS0RK9eveDn54fExEQ8ePAA58+fx507dySOsvhxcnLCJ598kuO60qVLF3E01KtXL6lDyOaTTz6Bo6MjoqKicPz4cZw9exaAtmdKly5d8Pvvv6NHjx4SR0lERiEQUbF25MgRAUC2qW3btuI2r1690lk3e/bsbMe5c+eOMGbMGKFKlSqCtbW1YG1tLdSoUUOYNWuW8PLlS51tu3TpIh5r5MiR4vIXL14IcrlcACAolUohISFBXNe/f39xnx49euTr2jLHPGjQIJ11jRo1EtdZWVnluL9U17Rlyxahb9++QvXq1QVXV1fBwsJCsLW1FapVqyZ8+OGHQmhoaLZYAwMDda710qVLQlBQkFCqVCkBgM4+a9euFWrUqCFYWVkJ5cqVEyZOnCjExcUJXl5eedZxTtavX69TzkeOHNFr+8xCQ0NzPdbs2bPF5V5eXsKLFy+E8ePHC+XLlxcsLS0FPz8/YdWqVTmeMzU1VVi7dq3Qpk0bwcXFRbCwsBBcXV2FJk2aCAsXLhQEQRAGDRqU499BTrG+qZzOnTsn9O/fX/Dy8hIsLS0FOzs7oVatWsKMGTOEyMjIbNtnPd7Zs2eFt99+W7C3txdsbW2Ftm3bClevXs22X+aYAwMD8yz3zJycnMT9Pv/88xy3uXr1qvDkyROdZVnfY2fPnhXatGkj2NraCm5ubsKYMWOEV69eCYIgCFu3bhXq1asnqFQqwcPDQ5g4caKQnJyc47m2bt0qBAUFCW5uboJSqRScnJyE5s2bCytXrhRSUlJy3Oe///4TRo0aJVSqVElQqVSCjY2NUKVKFWHcuHE67/Ws76mcpow6zPreTE1NFb788kuhUqVKgqWlpeDl5SXMnTtXUKvV+S7rzGXm5eX1xu1XrVolbm9hYSFcuXJFXHfz5k3Bys
pKXL948eIcr/HIkSPCTz/9JJa/q6urMGzYMOHZs2c5nrOo3q+CoN9nakHOlflzIrcp4/2R9f2c2RdffCG8++67QqVKlQQnJydBqVQKpUqVEho2bCh88cUXQnx8fLZYM59j/fr1OV5/VlnjzfrZ/tdffwkqlUpcb29vL0RFRWU7zo4dO4R33nlHKFOmjGBhYSE4OTkJbdu2Ff74449cz33jxg1h9OjRQpUqVQQbGxvB2tpaqFixotC/f3/h+vXr4nYHDhwQhgwZItSpU0dwd3cXLC0tBWtra6FSpUrCkCFDhH///VfcduPGjWKstra2QlxcnM45nz9/LiiVSnGbP//8M1/lRFQSMOkmKuayJt1lypQR5w8dOiQIwpuT7j/++EOwtrbO9UtOxYoVhQcPHojbf/PNN+I6f39/cfmuXbt09ss4vyDofvnKLbnKKq+ku1u3buI6Dw+PbPtKeU2dOnXK80ujg4ODzhcdQdD9Alm3bl3BxsYmxy9z06dPz/GY9evXF9zd3fNMJnMiRdLt7OwsVK1aNcfrWLt2rc4xo6KihICAgFzLMiMRMlTSvXTpUvFHlpwmd3d34dKlSzr7ZD5ew4YNdb6UZkylS5cWnj59qrNfQZNue3t7cb+ePXsKiYmJ+dov83usevXqOglgxtSyZUvh66+/zvHaBwwYoHO89PR0oWfPnnmWecOGDbMlY7///rtOIpJ1sre3F/bt2ycIQuGS7vbt2+e4/SeffJLvstY36RYE3R/w6tSpI6Smpgrp6elCw4YNxeXt2rUTNBpNjtfYunXrHOOuVKmS8Pz5c51zFeX7Vd/P1IKcy1BJt62tbZ7HqFmzpvgDU4bM6w2VdAuCkO3vaf78+eI6tVot9O3bN89YM/8InOG7774TLCwsct0nc/xjx47N8/iWlpbCgQMHBEEQhOTkZMHV1VVc99133+mcd+3ateI6Nzc3IS0tLV/lRFQSsHs5UQkzffp0TJkyBWlpafjkk09w5syZPLe/f/8++vXrh+TkZABArVq10LVrV6SmpuLnn3/G48ePce/ePfTp0wcnT54EALRq1UrcPyQkBM+fP4eLiwtOnDihc+zjx4+jdevWePjwIR48eCAuz7y/vpKTk3Hq1CkcOHBAXNazZ0+TuiYnJye8/fbbqFKlCpycnGBpaYlnz55h+/btePjwIeLi4jBt2jSxi3xWly9fhoWFBQYPHoyKFSvixo0bsLCwwPnz5/HVV1+J25UpUwYDBw5EfHw81q1bh5SUFH2LM5vff/8dFy5cyLZ85MiRcHBwKPTxAW1Xy5cvX2Lo0KFwdnbGypUrkZiYCABYvHixTvfvAQMG4OLFi+Lr6tWrIygoCEqlEhcuXMC9e/cAAL1790aNGjXw5Zdf4sWLFwCAdu3aoX379vmOKzg4GBMnToQgCAAAHx8f9O7dGzExMVi/fj1SU1Px7NkzvPfee7h16xasrKyyHePcuXPw8vJCnz59cOPGDfz9998AgJiYGPz444+YMWOGnqWVXZ06dXD8+HEAwJYtW7Bnzx689dZbqFevHpo0aYI2bdrAzs4uz2PcuHEDXl5e6NevH86ePYtDhw4B0A7MePToUdSsWRNdu3bF33//jStXrgAANm3ahAULFsDDwwMA8MUXX2DLli3iMZs2bYo2bdrgypUr+Ouvv8TyGDVqFH777TcAwJ07dzBw4EDxverq6opBgwYhPT0dP/74I+Li4vDq1Sv06NEDt2/fRunSpbFo0SJcuHABv//+u3iuRYsWifO53fqwf/9+9OjRA5UqVcK6desQGRkJAFixYgVmz54NS0vL/BX4/8TFxWHx4sXZlnt6eup0dV63bh3Onz+PJ0+e4MqVK/jiiy9gZWWFc+fOAQBcXFywcePGXAeEO3z4MFq1aoXmzZvj5MmTYt3cvXsX06ZNww8//ACgaN+vBflMLci52rdvDzs7O6xevRr3798H8PrWogz56cpfoUIF1KhRAxUqVICTkxMEQUBoaC
h+//13JCQk4Nq1a1i1ahWmTp36xmMV1pAhQzB58mSxng4fPozp06cDABYsWIDNmzcDAORyOXr06IEaNWrgzp072LR
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reward examples at different gap errors:\n",
" 0mm error → 1.0000 reward\n",
" 1mm error → 0.9460 reward\n",
" 3mm error → 0.6065 reward\n",
" 5mm error → 0.2494 reward\n",
" 10mm error → 0.0039 reward\n",
" 15mm error → 0.0000 reward\n",
"\n",
"Compare to old reward: (error*100)² would be:\n",
" 1mm error → -10,000 penalty\n",
" 3mm error → -90,000 penalty\n",
" 5mm error → -250,000 penalty\n",
" 10mm error → -1,000,000 penalty\n"
]
}
],
"source": [
"# Visualize the new reward function\n",
"gap_errors_mm = np.linspace(0, 20, 100)\n",
"gap_rewards = np.exp(-0.5 * (gap_errors_mm / 3.0)**2)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"plt.plot(gap_errors_mm, gap_rewards, linewidth=2)\n",
"plt.axvline(x=1.0, color='g', linestyle='--', label='Success bonus threshold (1mm)')\n",
"plt.axvline(x=3.0, color='orange', linestyle='--', label='1 std dev (3mm)')\n",
"plt.xlabel('Gap Error (mm)', fontsize=12)\n",
"plt.ylabel('Gap Reward Component', fontsize=12)\n",
"plt.title('New Reward Function: Smooth Exponential Decay', fontsize=14, fontweight='bold')\n",
"plt.grid(True, alpha=0.3)\n",
"plt.legend()\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"print(\"Reward examples at different gap errors:\")\n",
"# FIX: use a distinct local name — the old loop variable `reward` silently\n",
"# shadowed the episode reward returned by the rollout cell above.\n",
"for err in [0, 1, 3, 5, 10, 15]:\n",
"    gap_reward = np.exp(-0.5 * (err / 3.0)**2)\n",
"    print(f\" {err:2d}mm error → {gap_reward:.4f} reward\")\n",
"print(\"\\nCompare to old reward: (error*100)² would be:\")\n",
"for err in [1, 3, 5, 10]:\n",
"    old_penalty = (err * 100)**2\n",
"    print(f\" {err:2d}mm error → -{old_penalty:,} penalty\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "fb554183",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logging to: RL_Trials/training_log_20251211_191801.txt\n",
"Plot will be saved to: RL_Trials/gap_error_plot_20251211_191801.png\n"
]
}
],
"source": [
"import os\n",
"from datetime import datetime\n",
"\n",
"# Create RL_Trials folder if it doesn't exist\n",
"os.makedirs('RL_Trials', exist_ok=True)\n",
"\n",
"# Create timestamped log file\n",
"timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
"log_file_path = f'RL_Trials/training_log_{timestamp}.txt'\n",
"plot_file_path = f'RL_Trials/gap_error_plot_{timestamp}.png'\n",
"\n",
"# Define training params\n",
"num_episodes = 2000 # Increased for more learning time\n",
"print_freq = 20 # Print less frequently\n",
"gui_freq = 100 # Show GUI every 100 episodes\n",
"\n",
"# Create PPO trainer with improved hyperparameters\n",
"ppo = PPOTrainer(\n",
" model, \n",
" policy_lr=5e-4, # Higher learning rate\n",
" value_lr=1e-3, # Even higher for value function\n",
" target_kl_div=0.02, # Allow more policy updates\n",
" max_policy_train_iters=40,\n",
" value_train_iters=40,\n",
" entropy_coef=0.02 # More exploration\n",
")\n",
"\n",
"# Open log file\n",
"log_file = open(log_file_path, 'w')\n",
"log_file.write(f\"Training Started: {timestamp}\\n\")\n",
"log_file.write(f\"Number of Episodes: {num_episodes}\\n\")\n",
"log_file.write(f\"Print Frequency: {print_freq}\\n\")\n",
"log_file.write(f\"Target Gap Height: {16.491741} mm\\n\")\n",
"log_file.write(f\"Network: 256 hidden units with LayerNorm\\n\")\n",
"log_file.write(f\"Policy LR: 5e-4, Value LR: 1e-3, Entropy: 0.02\\n\")\n",
"log_file.write(\"=\"*70 + \"\\n\\n\")\n",
"log_file.flush()\n",
"\n",
"print(f\"Logging to: {log_file_path}\")\n",
"print(f\"Plot will be saved to: {plot_file_path}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "64994dcf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading maglev model from maglev_model.pkl...\n",
"Model loaded. Degree: 6\n",
"Force R2: 1.0000\n",
"Torque R2: 0.9999\n"
]
}
],
"source": [
"environ = LevPodEnv(use_gui=True, initial_gap_mm=14, max_steps=500) # Start below target"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "c3353cf5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Ep 20 | R: 755.5 | Len: 204 | R/s: 3.70 (462.3%) | Gap: 16.74mm (min:14.73) | Best: 14.73mm\n",
"Ep 40 | R: 407.2 | Len: 116 | R/s: 3.52 (439.9%) | Gap: 15.56mm (min:13.80) | Best: 13.80mm\n",
"Ep 60 | R: 157.4 | Len: 57 | R/s: 2.78 (347.5%) | Gap: 14.87mm (min:13.91) | Best: 13.80mm\n",
"Ep 80 | R: 182.7 | Len: 61 | R/s: 2.98 (372.0%) | Gap: 15.06mm (min:14.21) | Best: 13.80mm\n",
"Ep 100 | R: 487.2 | Len: 134 | R/s: 3.65 (455.9%) | Gap: 16.10mm (min:14.31) | Best: 13.80mm\n",
"Ep 120 | R: 1113.1 | Len: 297 | R/s: 3.75 (468.3%) | Gap: 17.64mm (min:15.95) | Best: 13.80mm\n",
"Ep 140 | R: 1434.7 | Len: 385 | R/s: 3.72 (465.6%) | Gap: 18.21mm (min:16.13) | Best: 13.80mm\n",
"Ep 160 | R: 641.4 | Len: 172 | R/s: 3.72 (464.8%) | Gap: 16.69mm (min:15.38) | Best: 13.80mm\n",
"Ep 180 | R: 1029.0 | Len: 274 | R/s: 3.76 (469.7%) | Gap: 17.43mm (min:14.60) | Best: 13.80mm\n",
"Ep 200 | R: 287.0 | Len: 85 | R/s: 3.39 (424.0%) | Gap: 15.61mm (min:14.18) | Best: 13.80mm\n",
"Ep 220 | R: 330.9 | Len: 94 | R/s: 3.52 (440.4%) | Gap: 15.85mm (min:14.93) | Best: 13.80mm\n",
"Ep 240 | R: 336.4 | Len: 103 | R/s: 3.28 (409.7%) | Gap: 15.15mm (min:13.83) | Best: 13.80mm\n",
"Ep 260 | R: 128.2 | Len: 50 | R/s: 2.58 (321.9%) | Gap: 14.51mm (min:13.90) | Best: 13.80mm\n",
"Ep 280 | R: 116.0 | Len: 46 | R/s: 2.51 (313.3%) | Gap: 14.30mm (min:13.49) | Best: 13.49mm\n",
"Ep 300 | R: 95.0 | Len: 39 | R/s: 2.45 (306.0%) | Gap: 13.85mm (min:13.19) | Best: 13.19mm\n",
"Ep 320 | R: 772.1 | Len: 200 | R/s: 3.86 (482.9%) | Gap: 16.77mm (min:15.05) | Best: 13.19mm\n",
"Ep 340 | R: 152.7 | Len: 54 | R/s: 2.84 (354.5%) | Gap: 14.81mm (min:13.96) | Best: 13.19mm\n",
"Ep 360 | R: 118.1 | Len: 47 | R/s: 2.52 (315.6%) | Gap: 14.36mm (min:13.50) | Best: 13.19mm\n",
"Ep 380 | R: 290.8 | Len: 81 | R/s: 3.60 (450.2%) | Gap: 15.56mm (min:14.06) | Best: 13.19mm\n",
"Ep 400 | R: 230.0 | Len: 69 | R/s: 3.35 (418.5%) | Gap: 15.38mm (min:14.74) | Best: 13.19mm\n",
"Ep 420 | R: 305.9 | Len: 85 | R/s: 3.58 (447.5%) | Gap: 15.82mm (min:15.08) | Best: 13.19mm\n",
"Ep 440 | R: 450.6 | Len: 116 | R/s: 3.90 (487.0%) | Gap: 16.25mm (min:14.81) | Best: 13.19mm\n",
"Ep 460 | R: 624.2 | Len: 161 | R/s: 3.89 (486.0%) | Gap: 16.65mm (min:15.01) | Best: 13.19mm\n",
"Ep 480 | R: 710.6 | Len: 192 | R/s: 3.70 (462.6%) | Gap: 16.62mm (min:14.71) | Best: 13.19mm\n",
"Ep 500 | R: 131.1 | Len: 49 | R/s: 2.65 (331.8%) | Gap: 14.44mm (min:13.45) | Best: 13.19mm\n",
"Ep 520 | R: 169.4 | Len: 58 | R/s: 2.90 (362.5%) | Gap: 14.97mm (min:14.22) | Best: 13.19mm\n",
"Ep 540 | R: 929.9 | Len: 263 | R/s: 3.53 (441.4%) | Gap: 16.99mm (min:14.49) | Best: 13.19mm\n",
"Ep 560 | R: 1760.6 | Len: 500 | R/s: 3.52 (440.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 580 | R: 1763.0 | Len: 500 | R/s: 3.53 (440.7%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 600 | R: 1775.4 | Len: 500 | R/s: 3.55 (443.8%) | Gap: 18.99mm (min:18.91) | Best: 13.19mm\n",
"Ep 620 | R: 1298.7 | Len: 355 | R/s: 3.66 (457.5%) | Gap: 17.94mm (min:14.49) | Best: 13.19mm\n",
"Ep 640 | R: 1576.3 | Len: 438 | R/s: 3.60 (450.3%) | Gap: 18.63mm (min:16.35) | Best: 13.19mm\n",
"Ep 660 | R: 1762.6 | Len: 500 | R/s: 3.53 (440.7%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 680 | R: 1761.3 | Len: 500 | R/s: 3.52 (440.3%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 700 | R: 1761.0 | Len: 500 | R/s: 3.52 (440.2%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 720 | R: 1754.8 | Len: 500 | R/s: 3.51 (438.7%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 740 | R: 1755.3 | Len: 500 | R/s: 3.51 (438.8%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 760 | R: 1756.6 | Len: 500 | R/s: 3.51 (439.2%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 780 | R: 1759.2 | Len: 500 | R/s: 3.52 (439.8%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 800 | R: 1756.9 | Len: 500 | R/s: 3.51 (439.2%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 820 | R: 1759.2 | Len: 500 | R/s: 3.52 (439.8%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 840 | R: 1593.2 | Len: 436 | R/s: 3.65 (456.7%) | Gap: 18.62mm (min:16.57) | Best: 13.19mm\n",
"Ep 860 | R: 1209.1 | Len: 334 | R/s: 3.62 (452.2%) | Gap: 17.92mm (min:15.21) | Best: 13.19mm\n",
"Ep 880 | R: 509.8 | Len: 149 | R/s: 3.43 (429.0%) | Gap: 16.16mm (min:14.21) | Best: 13.19mm\n",
"Ep 900 | R: 496.0 | Len: 148 | R/s: 3.36 (419.9%) | Gap: 15.86mm (min:14.56) | Best: 13.19mm\n",
"Ep 920 | R: 1770.0 | Len: 500 | R/s: 3.54 (442.5%) | Gap: 18.99mm (min:18.97) | Best: 13.19mm\n",
"Ep 940 | R: 1763.3 | Len: 500 | R/s: 3.53 (440.8%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 960 | R: 1753.9 | Len: 500 | R/s: 3.51 (438.5%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 980 | R: 1751.9 | Len: 500 | R/s: 3.50 (438.0%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1000 | R: 1756.6 | Len: 500 | R/s: 3.51 (439.1%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1020 | R: 1754.6 | Len: 500 | R/s: 3.51 (438.7%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1040 | R: 1759.2 | Len: 500 | R/s: 3.52 (439.8%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1060 | R: 1756.7 | Len: 500 | R/s: 3.51 (439.2%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1080 | R: 1758.8 | Len: 500 | R/s: 3.52 (439.7%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1100 | R: 1756.2 | Len: 500 | R/s: 3.51 (439.1%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1120 | R: 1756.5 | Len: 500 | R/s: 3.51 (439.1%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1140 | R: 1760.5 | Len: 500 | R/s: 3.52 (440.1%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1160 | R: 1760.5 | Len: 500 | R/s: 3.52 (440.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1180 | R: 1756.5 | Len: 500 | R/s: 3.51 (439.1%) | Gap: 19.00mm (min:18.99) | Best: 13.19mm\n",
"Ep 1200 | R: 1760.0 | Len: 500 | R/s: 3.52 (440.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1220 | R: 1758.7 | Len: 500 | R/s: 3.52 (439.7%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1240 | R: 1760.4 | Len: 500 | R/s: 3.52 (440.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1260 | R: 1753.5 | Len: 500 | R/s: 3.51 (438.4%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1280 | R: 1753.9 | Len: 500 | R/s: 3.51 (438.5%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1300 | R: 1758.0 | Len: 500 | R/s: 3.52 (439.5%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1320 | R: 1762.7 | Len: 500 | R/s: 3.53 (440.7%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1340 | R: 1693.0 | Len: 459 | R/s: 3.69 (460.9%) | Gap: 18.61mm (min:16.25) | Best: 13.19mm\n",
"Ep 1360 | R: 713.0 | Len: 181 | R/s: 3.94 (492.2%) | Gap: 16.38mm (min:15.05) | Best: 13.19mm\n",
"Ep 1380 | R: 2118.4 | Len: 486 | R/s: 4.36 (545.4%) | Gap: 18.38mm (min:17.32) | Best: 13.19mm\n",
"Ep 1400 | R: 2157.4 | Len: 495 | R/s: 4.36 (544.9%) | Gap: 18.50mm (min:18.01) | Best: 13.19mm\n",
"Ep 1420 | R: 1181.5 | Len: 262 | R/s: 4.50 (563.0%) | Gap: 16.90mm (min:15.79) | Best: 13.19mm\n",
"Ep 1440 | R: 1332.5 | Len: 298 | R/s: 4.46 (558.1%) | Gap: 17.08mm (min:15.65) | Best: 13.19mm\n",
"Ep 1460 | R: 1496.5 | Len: 332 | R/s: 4.51 (563.8%) | Gap: 17.27mm (min:15.62) | Best: 13.19mm\n",
"Ep 1480 | R: 1545.4 | Len: 339 | R/s: 4.56 (570.1%) | Gap: 17.26mm (min:15.87) | Best: 13.19mm\n",
"Ep 1500 | R: 862.8 | Len: 201 | R/s: 4.29 (536.3%) | Gap: 16.17mm (min:14.88) | Best: 13.19mm\n",
"Ep 1520 | R: 809.8 | Len: 193 | R/s: 4.20 (524.6%) | Gap: 16.03mm (min:14.74) | Best: 13.19mm\n",
"Ep 1540 | R: 861.1 | Len: 204 | R/s: 4.22 (527.7%) | Gap: 16.25mm (min:14.93) | Best: 13.19mm\n",
"Ep 1560 | R: 1445.2 | Len: 329 | R/s: 4.40 (549.4%) | Gap: 17.24mm (min:15.19) | Best: 13.19mm\n",
"Ep 1580 | R: 1993.4 | Len: 486 | R/s: 4.11 (513.2%) | Gap: 18.55mm (min:16.26) | Best: 13.19mm\n",
"Ep 1600 | R: 1985.4 | Len: 500 | R/s: 3.97 (496.4%) | Gap: 18.75mm (min:18.57) | Best: 13.19mm\n",
"Ep 1620 | R: 1776.8 | Len: 500 | R/s: 3.55 (444.2%) | Gap: 18.97mm (min:18.91) | Best: 13.19mm\n",
"Ep 1640 | R: 1755.2 | Len: 500 | R/s: 3.51 (438.8%) | Gap: 18.99mm (min:18.97) | Best: 13.19mm\n",
"Ep 1660 | R: 1751.1 | Len: 500 | R/s: 3.50 (437.8%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1680 | R: 1746.6 | Len: 500 | R/s: 3.49 (436.7%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1700 | R: 1746.2 | Len: 500 | R/s: 3.49 (436.5%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1720 | R: 1747.8 | Len: 500 | R/s: 3.50 (437.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1740 | R: 1743.0 | Len: 500 | R/s: 3.49 (435.8%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1760 | R: 1743.4 | Len: 500 | R/s: 3.49 (435.8%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1780 | R: 1744.3 | Len: 500 | R/s: 3.49 (436.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1800 | R: 1744.0 | Len: 500 | R/s: 3.49 (436.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1820 | R: 1739.4 | Len: 500 | R/s: 3.48 (434.8%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1840 | R: 1736.2 | Len: 500 | R/s: 3.47 (434.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1860 | R: 1732.7 | Len: 500 | R/s: 3.47 (433.2%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1880 | R: 1732.1 | Len: 500 | R/s: 3.46 (433.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1900 | R: 1732.2 | Len: 500 | R/s: 3.46 (433.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1920 | R: 1729.1 | Len: 500 | R/s: 3.46 (432.3%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1940 | R: 1728.6 | Len: 500 | R/s: 3.46 (432.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1960 | R: 1728.0 | Len: 500 | R/s: 3.46 (432.0%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 1980 | R: 1728.5 | Len: 500 | R/s: 3.46 (432.1%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n",
"Ep 2000 | R: 1726.8 | Len: 500 | R/s: 3.45 (431.7%) | Gap: 19.00mm (min:19.00) | Best: 13.19mm\n"
]
}
],
"source": [
"# Training Loop with Better Monitoring\n",
"#\n",
"# For each episode: collect one rollout, shuffle the collected transitions,\n",
"# build training tensors, normalize advantages, then run one PPO policy\n",
"# update and one value-function update. Aggregated statistics are printed\n",
"# and appended to log_file every print_freq episodes.\n",
"#\n",
"# Relies on names defined in earlier cells: rollout, model, environ, ppo,\n",
"# discount_rewards, num_episodes, print_freq, device, log_file.\n",
"ep_rewards = []\n",
"ep_lengths = []\n",
"ep_gap_errors = []\n",
"best_gap_error = float('inf')\n",
"# NOTE(review): gui_env is never reassigned inside the loop below, so the\n",
"# 'if gui_env is not None' close() guard after training is currently dead\n",
"# code — confirm whether a GUI environment was meant to be created here.\n",
"gui_env = None\n",
"\n",
"for episode_idx in range(num_episodes):\n",
"    # train_data layout (per the indexing below): [0]=observations,\n",
"    # [1]=actions, [2]=rewards, [3]=GAE advantages, [4]=action log-probs.\n",
"    train_data, ep_reward, gap_error = rollout(model, environ)\n",
"    \n",
"    ep_length = len(train_data[0])\n",
"    ep_rewards.append(ep_reward)\n",
"    ep_lengths.append(ep_length)\n",
"    ep_gap_errors.append(gap_error)\n",
"    \n",
"    # Track best performance\n",
"    if gap_error < best_gap_error:\n",
"        best_gap_error = gap_error\n",
"\n",
"    # Data Formatting\n",
"    # Shuffle the batch so the updates do not see transitions in\n",
"    # time-correlated order.\n",
"    permute_idxs = np.random.permutation(len(train_data[0]))\n",
"    obs = torch.tensor(train_data[0][permute_idxs], dtype=torch.float32, device=device)\n",
"    acts = torch.tensor(train_data[1][permute_idxs], dtype=torch.float32, device=device)\n",
"    gaes = torch.tensor(train_data[3][permute_idxs], dtype=torch.float32, device=device)\n",
"    act_log_probs = torch.tensor(train_data[4][permute_idxs], dtype=torch.float32, device=device)\n",
"\n",
"    # Discounted returns are the regression targets for the value network;\n",
"    # apply the same permutation so rows stay aligned with obs/acts/gaes.\n",
"    returns = discount_rewards(train_data[2])[permute_idxs]\n",
"    returns = torch.tensor(returns, dtype=torch.float32, device=device)\n",
"\n",
"    # Normalize GAEs for stable training\n",
"    gaes = (gaes - gaes.mean()) / (gaes.std() + 1e-8)\n",
"\n",
"    ppo.train_policy(obs, acts, act_log_probs, gaes)\n",
"    ppo.train_value(obs, returns)\n",
"\n",
"    if (episode_idx + 1) % print_freq == 0:\n",
"        # Aggregate statistics over the most recent print_freq episodes.\n",
"        avg_reward = np.mean(ep_rewards[-print_freq:])\n",
"        avg_length = np.mean(ep_lengths[-print_freq:])\n",
"        avg_gap_error = np.mean(ep_gap_errors[-print_freq:])\n",
"        min_gap_error = np.min(ep_gap_errors[-print_freq:])\n",
"        avg_reward_per_step = avg_reward / avg_length if avg_length > 0 else 0\n",
"        \n",
"        # Updated for new reward scale (realistic good is ~0.8/step)\n",
"        realistic_good_per_step = 0.8\n",
"        percent_of_realistic = (avg_reward_per_step / realistic_good_per_step) * 100\n",
"        \n",
"        output_line = (f\"Ep {episode_idx + 1:4d} | R: {avg_reward:6.1f} | Len: {avg_length:3.0f} | \"\n",
"                       f\"R/s: {avg_reward_per_step:5.2f} ({percent_of_realistic:5.1f}%) | \"\n",
"                       f\"Gap: {avg_gap_error:5.2f}mm (min:{min_gap_error:5.2f}) | Best: {best_gap_error:5.2f}mm\")\n",
"        \n",
"        print(output_line)\n",
"        log_file.write(output_line + \"\\n\")\n",
"        # Flush each report so progress survives a kernel interrupt mid-run.\n",
"        log_file.flush()\n",
"\n",
"# Close GUI environment if created\n",
"if gui_env is not None:\n",
"    gui_env.close()\n",
"\n",
"# Close log file\n",
"log_file.write(\"\\n\" + \"=\"*70 + \"\\n\")\n",
"log_file.write(f\"Training Completed: {datetime.now().strftime('%Y%m%d_%H%M%S')}\\n\")\n",
"log_file.write(f\"Best Gap Error Achieved: {best_gap_error:.3f} mm\\n\")\n",
"log_file.close()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "3678193c",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAABKUAAAJOCAYAAABm7rQwAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQABAABJREFUeJzsvXe85VS5//9Jstvp/UyvDAwzwAxDVYpUQUQEUREvIigKygUUhJ8XvhYsiILitSt6UdQrIHq96AUbSJPmDAwDAwxMhWHa6XW3lPX7IzvJSnayd3bP3ud5v15nJmUlWdlJVvms53mWwBhjIAiCIAiCIAiCIAiCIIgqItY6AwRBEARBEARBEARBEMTMg0QpgiAIgiAIgiAIgiAIouqQKEUQBEEQBEEQBEEQBEFUHRKlCIIgCIIgCIIgCIIgiKpDohRBEARBEARBEARBEARRdUiUIgiCIAiCIAiCIAiCIKoOiVIEQRAEQRAEQRAEQRBE1SFRiiAIgiAIgiAIgiAIgqg6JEoRBEEQBEEQBEEQBEEQVYdEKYIgCILIIAiC+feLX/yi5PPdeOON5vkWL15c8vkIImiU+x3fsWOH7Tt85JFHSj4nQRAEQRDBhUQpgiCIBmV4eBjf/OY3cfrpp2Pu3LloampCe3s7DjjgAJx44om48cYb8a9//QuMsVpnNYtf/OIXeTumjzzySNlFpKDgvP9C4YWCXH+N9Js52bBhAy6//HIccsgh6OzsRCQSwaxZs3DyySfjm9/8JsbHx2udxaLx82ydfzt27Kh1tokaUGpZUu846wm3v2XLlrkeOzY2hs997nM4+OCD0dLSgvb2dhx++OG45ZZbkEwmPa/597//He9+97sxa9YsRKNRzJs3D+effz7WrVtXqdskCIKoa0K1zgBBEARRfv7rv/4Ln/70pzE1NWXbnkwmMTk5ic2bN+PRRx/Fl770JWzfvp2seDLceuut5vKRRx5Zw5wQxaIoCj7zmc/gu9/9bta+gYEBDAwM4OGHH8Y3vvEN/Pd//zdOO+20GuSycTjttNPQ2toKAOjo6Cj5fN3d3bbvcL/99iv5nARRKNu3b8dJJ52E119/3bb9ueeew3PPPYe7774bf//739HT02Pb/4UvfAFf+cpXbNt2796Ne+65B/feey9+8pOf4GMf+1jF808QBFFPkChFEATRYHz729/GNddcY64LgoCTTjoJRx99NNrb2zEyMoINGzbg8ccfRyKRqGFOg8e1115b6yxUhBtuuAFdXV1Z2wsR3iYnJ9HW1pYzTTKZhCRJCIfDBecxH37PfeWVV+LHP/6xuT5v3jycd9556OnpwcaNG3HvvfdCVVUMDQ3hrLPOwj/+8Q8ce+yxZc9vKeT7rXnRBgC2bt1qu+cPfOADOOKII2xpuru7i75eLo455hgcc8wxRR3rRnt7e8N+h0RtOeKII/CBD3wga7uzbNQ0Deeff74pSHV3d+PSSy9FMpnET37yEyQSCaxfvx6f+MQncO+995rH/d///Z9NkHrHO96B4447Dvfffz+eeuopaJqGT37ykzjyyCOxevXqCt0lQRBEHcIIgiCIhuGVV15hoVCIAWAAWG9vL3vqqadc005PT7Pbb7+dDQ4OmtsSiQS74YYb2Omnn86WLFnC2tvbWSgUYj09Pez4449n3/ve95gsy7bzbN++3bweAPbwww+zX/7yl+ywww5jsViM9fX1sUsuuYTt27fP9338/Oc/zzqnk4cfftiW5uc//3lWmueee45dfPHFbMmSJSwajbLW1lZ2xBFHsG9961sskUhkpc91vunpafYf//EfbMGCBSwajbIVK1aw733ve2zbtm2eef3iF79obl+0aBGbnJxk1113HVuwYAGLRCLsgAMOYD/84Q89f0u3vy9+8Yt5fz/+ugDY9u3b8x7j/M0nJyfZZz7zGbZw4UImSZJ53RNOOMFMc9FFF7HnnnuOnXHGGayzszPrWps2bWKXXX
YZW7ZsGYvFYqy5uZktX76cXXnlla55KuTcbjz55JO2ezjiiCPYxMSELc3DDz/MJEky0xx00EFMVVWmqipbuHChuf0rX/lK1vmvuuoqc//BBx9s27d582Z2+eWXs+XLl7OmpibW1NTEDj74YPaFL3yBjY2NZZ1r0aJFtmf64IMPsre97W2sra2NFdo8y/ctOPe/9tpr7Ctf+Qrbf//9WTgcZhdddBFjjLG1a9eyyy67jB155JFs7ty5LBaLsVgsxhYtWsQ+8IEPsMcffzzr2s53PNc9PvPMM+wd73gHa2trYy0tLezUU09lGzZssB3jVp54XWt0dJR9+tOfZvPnz3f9nnheeOEF9q53vYu1tbWxtrY2dtppp7F169blzH+xON/jTZs2sfe9732sq6uLxWIx9pa3vMW1TPPLfffdx04//XTW39/PQqEQa2trY0uXLmVnn302+9rXvsZUVS2qLHn44YfZ+9//fvP3bG9vZ8cddxz72c9+xlRVtaUtpdz3k/9ywb/7xnuejwceeMB2b3/729/Mfbfffrtt3yuvvGLuO+qoo8ztxx57rLk9lUqxJUuWmPs+8IEPlO3+CIIgGgESpQiCIBqIT3ziE7YG8+9///uCjh8cHMzbkTn11FOZoijmMc7Oycknn+x63LJly9jQ0JCvfJRDlPre975nEx+cf0ceeWSWWOB1vnQ6zY4//njX85x11lm+OtF9fX3s0EMPdT3H7bff7vpb1lKUOvbYY12vy3e416xZw5qbm12vdc8997BYLOZ5H21tbeyvf/2rLQ9+z+3FRRddZEv/j3/8wzXdBRdcYEv3yCOPMMYY+/znP29uW7lype0YVVXZnDlzzP233Xabue/3v/89a2pq8rzX/fbbj73++uu28/GCzVve8pasd7UQChWlnM/W6KzfeuutOd89QRCyzu1XlDrqqKNsgrnx193dzfbu3Wse41eU6unpYQceeGDO78lg7dq1rLW1NStdLBZjp556qmf+i4V/j1etWuV67UgkwjZu3FjwuZ3fqdtfIpEouCz57Gc/mzPtmWeeydLptJm+2HLfb/4LSc//nXDCCbbfi3/3u7u7WU9PDwuHw2zWrFnsrLPOYn/+85+zfmO+Hm1vb2eappn7hoeHbdf7xje+wRhjbO/evbbt3/rWt2znvPLKK819LS0tZRXeCIIg6h1y3yMIgmgg/vGPf5jLXV1deM973lPQ8UbQ16OPPhpz585FV1cXZFnGpk2bcO+990JRFDz44IP4/e9/j/POO88zDyeddBKOP/54PPHEE3jooYcAAFu2bMFnP/tZ/OxnPyv4vu65556sILFbt271TP/EE0/gqquuMoO4H3fccTj11FMxNjaGO++8E6Ojo1i7di0++clP4je/+U3e63/nO9/B448/bq6vWrUKZ599NjZs2IA//vGPvu5hcHAQIyMj+OhHP4qenh784Ac/QDweBwB885vfxMc//nEzns66detwzz33mMfy7lrFuEr99Kc/dXXfy+Um9cQTT+DYY4/FKaecgsnJScyfPz8rzfr16xEOh3HxxRdjv/32w0svvYRwOIzNmzfjwx/+MFKpFACgr68PF110ERRFwR133IGJiQlMTk7i/e9/P1577TXMmjXL97lzwT+j7u5unHTSSa7p3v/+9+O///u/bcedcMIJuPjii/HVr34VjDG8/PLL2LBhg+lm88gjj2DPnj0AgHA4jA996EMAgG3btuGCCy4wAx+vWrUK55xzDtLpNH71q19h165d2Lp1Kz74wQ/iiSeecM3P008/jba2NlxwwQWYO3duxQMiP/HEE1i1ahXOPPNMaJpmxoKKxWJ461vfikMPPRQ9PT1oaWnB+Pg4HnroIaxduxaMMXzmM5/BBz7wATQ1NRV0zX/9619YtGgRPvjBD+Kll17Cn/70JwDAyMgI7rjjDlx//fUFnW94eBhjY2M5vyeDj3zkI7b4eh/84AexdOlS/Pa3v8WDDz5Y0HUL5YUXXk
Bvby8+8YlPYN++ffjVr34FAEin0/jud7+Ln/zkJwWd70c/+pG5fOSRR+Jd73oXFEXBzp078cwzz+CVV14BgILKkt/
"text/plain": [
"<Figure size 1200x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"✓ Training log saved to: RL_Trials/training_log_20251211_191801.txt\n",
"✓ Gap error plot saved to: RL_Trials/gap_error_plot_20251211_191801.png\n"
]
}
],
"source": [
"# Create and save gap error plot\n",
"# Raw per-episode gap errors are drawn faintly (alpha=0.3); a moving\n",
"# average over print_freq episodes is overlaid to show the trend. The\n",
"# figure is written to plot_file_path, then displayed inline.\n",
"plt.figure(figsize=(12, 6))\n",
"plt.plot(ep_gap_errors, alpha=0.3, label='Per Episode')\n",
"\n",
"# Calculate moving average (window size = print_freq)\n",
"# np.convolve with mode='valid' produces len(ep_gap_errors) - window_size + 1\n",
"# points, so the x-range starts at window_size - 1 to keep it aligned with\n",
"# the raw series.\n",
"window_size = print_freq\n",
"if len(ep_gap_errors) >= window_size:\n",
"    moving_avg = np.convolve(ep_gap_errors, np.ones(window_size)/window_size, mode='valid')\n",
"    plt.plot(range(window_size-1, len(ep_gap_errors)), moving_avg, linewidth=2, label=f'{window_size}-Episode Moving Average')\n",
"\n",
"plt.xlabel('Episode', fontsize=12)\n",
"plt.ylabel('Average Gap Height Error (mm)', fontsize=12)\n",
"plt.title('Gap Height Error Over Training, n_steps=500', fontsize=14, fontweight='bold')\n",
"plt.legend(fontsize=11)\n",
"plt.grid(True, alpha=0.3)\n",
"plt.tight_layout()\n",
"# Persist the figure to disk before showing it.\n",
"plt.savefig(plot_file_path, dpi=150, bbox_inches='tight')\n",
"plt.show()\n",
"\n",
"print(f\"\\n✓ Training log saved to: {log_file_path}\")\n",
"print(f\"✓ Gap error plot saved to: {plot_file_path}\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "7c225451",
"metadata": {},
"outputs": [],
"source": [
"# Done training: close the simulation environment.\n",
"environ.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97e51696",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "LevSim",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}