From 093219b687604353e990dfe12deeed0a23ecd4ef Mon Sep 17 00:00:00 2001
From: Patrick Virie
Date: Sun, 12 Jan 2025 17:22:52 +0700
Subject: [PATCH] new best-target compute scheme
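
Replace the pairwise squared-distance matching in update_best_so_far with a
per-axis extreme update: each target goal axis now takes the rollout minimum,
maximum, or zero depending on the sign of the corresponding entry in
context.goals, so compute_goal_diff and the best_target_diffs bookkeeping are
no longer needed. Also drop the epsilon-greedy random-sampling branch during
rollouts; actions now always come from model.react with uniform noise scaled
by epsilon.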
---
tasks/rl_hopper.py | 49 ++++++++++++++++++----------------------------
1 file changed, 19 insertions(+), 30 deletions(-)
diff --git a/tasks/rl_hopper.py b/tasks/rl_hopper.py
index a92cddf..f4805ed 100644
--- a/tasks/rl_hopper.py
+++ b/tasks/rl_hopper.py
@@ -192,29 +192,24 @@ def states_to_expectation(states, rewards):
return np.stack([rewards, vx, vz], axis=1)
- def compute_goal_diff(base, operand):
-     # base has shape (n, dim), operand has shape (m, dim)
-     base_ = np.tile(np.expand_dims(base, axis=1), (1, operand.shape[0], 1))
-     operand_ = np.tile(np.expand_dims(operand, axis=0), (base.shape[0], 1, 1))
-     raw_diff = (base_ - operand_) ** 2
-     diff = np.sum(raw_diff, axis=-1, keepdims=False)
-     return diff
- def update_best_so_far(last_pivots, best_targets, best_target_diffs):
-     last_goals = last_pivots[:, 1:]
-     diffs = compute_goal_diff(np.array([g[0] for g in context.goals]), last_goals)
-     # diffs has shape (num_goals, num_states)
-     min_indices = np.argmin(diffs, axis=1, keepdims=True)
-     min_scores = np.take_along_axis(diffs, min_indices, axis=1)
-     min_goals = np.take_along_axis(last_goals, min_indices, axis=0)
-     update_flags = min_scores < best_target_diffs
-     best_target_diffs = np.where(update_flags, min_scores, best_target_diffs)
-     best_targets = np.where(update_flags, min_goals, best_targets)
-     return best_targets, best_target_diffs
+ def update_best_so_far(last_pivots, best_targets):
+     last_goals = last_pivots[:, 1:]
+     # compute extreme of each axis
+     axis_min = np.min(last_goals, axis=0, keepdims=True)
+     axis_max = np.max(last_goals, axis=0, keepdims=True)
+     target_goals = np.array([g[0] for g in context.goals])
+     is_zeros = np.abs(target_goals) < 1e-4
+     is_max = target_goals > 1e-4
+     is_min = target_goals < -1e-4
+     best_targets = np.where(is_zeros, 0, best_targets)
+     best_targets = np.where(is_max, axis_max, best_targets)
+     best_targets = np.where(is_min, axis_min, best_targets)
+     return best_targets
def prepare_data_tuples(states, actions, rewards, num_layers, skip_steps):
@@ -273,7 +268,6 @@ def prepare_data_tuples(states, actions, rewards, num_layers, skip_steps):
epsilon = 0.8 - 0.7 * (course + 1) / num_courses
next_best_targets = np.zeros((len(context.goals), len(context.goals[0][0])), dtype=np.float32)
- next_best_target_diffs = np.ones((len(context.goals), 1), dtype=np.float32) * 1e4
for i in range(num_trials):
if i % print_steps == 0 and i > 0:
@@ -293,16 +287,11 @@ def prepare_data_tuples(states, actions, rewards, num_layers, skip_steps):
actions = []
rewards = []
for _ in range(400):
- if random.random() <= epsilon or course == 0:
-     selected_action = env.action_space.sample()
-     # quantize
-     selected_action = np.round(selected_action)
- else:
-     a = model.react(alg.State(observation.data), stable_state)
-     selected_action = a.data
-     # random in range -0.5 to 0.5
-     selected_action += (np.random.rand(3) - 0.5) * epsilon
-     selected_action = np.clip(selected_action, -1, 1)
+ a = model.react(alg.State(observation.data), stable_state)
+ selected_action = a.data
+ # random in range -0.5 to 0.5
+ selected_action += (np.random.rand(3) - 0.5) * epsilon
+ selected_action = np.clip(selected_action, -1, 1)
next_observation, reward, terminated, truncated, info = env.step(selected_action)
@@ -324,7 +313,7 @@ def prepare_data_tuples(states, actions, rewards, num_layers, skip_steps):
path_layer_tuples, last_pivots = prepare_data_tuples(states, actions, rewards, num_layers, context.skip_steps)
trainers = model.observe(path_layer_tuples)
- next_best_targets, next_best_target_diffs = update_best_so_far(last_pivots, next_best_targets, next_best_target_diffs)
+ next_best_targets = update_best_so_far(last_pivots, next_best_targets)
logging.log(logging.INFO, f"Average steps: {total_steps/num_trials}")
env.close()