update docstrings, etc.
aiueola committed Jan 13, 2022
1 parent 9c1e83e commit 99a2f17
Showing 7 changed files with 2,068 additions and 1,552 deletions.
22 changes: 12 additions & 10 deletions _gym/env/simulator/rtb_synthetic.py
@@ -194,13 +194,13 @@ def __post_init__(self):
         if self.ad_sampling_rate.ndim == 1:
             self.ad_sampling_rate = np.tile(
                 self.ad_sampling_rate / self.ad_sampling_rate.sum(),
-                (self.step_per_episode, 1)
+                (self.step_per_episode, 1),
             )
         else:
-            self.ad_sampling_rate = self.ad_sampling_rate / np.tile(
-                np.sum(self.ad_sampling_rate, axis=1),
-                (self.n_ads, 1)
-            ).T
+            self.ad_sampling_rate = (
+                self.ad_sampling_rate
+                / np.tile(np.sum(self.ad_sampling_rate, axis=1), (self.n_ads, 1)).T
+            )
         check_array(
             self.ad_sampling_rate,
             name="ad_sampling_rate",
@@ -225,13 +225,15 @@ def __post_init__(self):
         if self.user_sampling_rate.ndim == 1:
             self.user_sampling_rate = np.tile(
                 self.user_sampling_rate / self.user_sampling_rate.sum(),
-                (self.step_per_episode, 1)
+                (self.step_per_episode, 1),
             )
         else:
-            self.user_sampling_rate = self.user_sampling_rate / np.tile(
-                np.sum(self.user_sampling_rate, axis=1),
-                (self.n_users, 1)
-            ).T
+            self.user_sampling_rate = (
+                self.user_sampling_rate
+                / np.tile(
+                    np.sum(self.user_sampling_rate, axis=1), (self.n_users, 1)
+                ).T
+            )
         check_array(
             self.user_sampling_rate,
             name="user_sampling_rate",
50 changes: 45 additions & 5 deletions _gym/policy/head.py
@@ -205,7 +205,17 @@ def calc_pscore_given_action(self, x: np.ndarray, action: np.ndarray):
 
 @dataclass
 class DiscreteEpsilonGreedyHead(BaseHead):
-    """Class to convert greedy policy into e-greedy.
+    """Class to convert a deterministic policy into an epsilon-greedy policy.
+
+    Note
+    -------
+    Epsilon-greedy policy stochastically chooses actions (i.e., :math:`a \\in \\mathcal{A}`) given state :math:`s` as follows.
+
+    .. math::
+        \\pi(a \\mid s) := (1 - \\epsilon) * \\mathbb{I}(a = a^*) + \\epsilon / |\\mathcal{A}|
+
+    where :math:`\\epsilon` is the probability of taking random actions and :math:`a^*` is the greedy action.
+    :math:`\\mathbb{I}(\\cdot)` denotes the indicator function.
+
     Parameters
     -------
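A minimal standalone sketch of the epsilon-greedy rule in the Note above; `greedy_actions` is a hypothetical stand-in for the base policy's deterministic choices, not the actual BaseHead API.

import numpy as np

def epsilon_greedy_pscore(greedy_actions: np.ndarray, n_actions: int, epsilon: float) -> np.ndarray:
    """Return pi(a|s) for every action: (1 - eps) * I(a = a*) + eps / |A|."""
    n_samples = len(greedy_actions)
    pscore = np.full((n_samples, n_actions), epsilon / n_actions)
    pscore[np.arange(n_samples), greedy_actions] += 1 - epsilon
    return pscore  # each row sums to 1

# e.g., greedy action 2 out of 4 actions with epsilon = 0.1
print(epsilon_greedy_pscore(np.array([2]), n_actions=4, epsilon=0.1))
# [[0.025 0.025 0.925 0.025]]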
@@ -340,7 +350,17 @@ def sample_action(self, x: np.ndarray):
 
 @dataclass
 class DiscreteSoftmaxHead(BaseHead):
-    """Class to convert policy values into softmax policy.
+    """Class to convert a Q-learning based policy into a softmax policy.
+
+    Note
+    -------
+    Softmax policy stochastically chooses actions (i.e., :math:`a \\in \\mathcal{A}`) given state :math:`s` as follows.
+
+    .. math::
+        \\pi(a \\mid s) := \\frac{\\exp(Q(s, a) / \\tau)}{\\sum_{a' \\in \\mathcal{A}} \\exp(Q(s, a') / \\tau)}
+
+    where :math:`\\tau` is the temperature parameter of the softmax function.
+    :math:`Q(s, a)` is the predicted value for the given :math:`(s, a)` pair.
+
     Parameters
     -------
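A minimal standalone sketch of the softmax rule in the Note above, using hypothetical Q-values rather than the base policy's actual predictions.

import numpy as np

def softmax_pscore(q_values: np.ndarray, tau: float = 1.0) -> np.ndarray:
    """pi(a|s) = exp(Q(s,a)/tau) / sum_a' exp(Q(s,a')/tau), computed row-wise."""
    logits = q_values / tau
    logits -= logits.max(axis=1, keepdims=True)  # subtract the max for numerical stability
    exp = np.exp(logits)
    return exp / exp.sum(axis=1, keepdims=True)

q = np.array([[1.0, 2.0, 0.5]])
print(softmax_pscore(q, tau=1.0))   # peaked on the second action
print(softmax_pscore(q, tau=10.0))  # closer to uniform as tau grows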
@@ -406,8 +426,8 @@ def _gumble_max_trick(self, x: np.ndarray):
         gumble_variable = -np.log(-np.log(self.random_.rand(len(x), self.n_actions)))
         return np.argmax(x / self.tau + gumble_variable, axis=1)
 
-    def _predict_counterfactual_state_action_value(self, x: np.ndarray):
-        """Predict counterfactual state action value.
+    def _predict_value(self, x: np.ndarray):
+        """Predict state action value for all possible actions.
 
         Parameters
         -------
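The `_gumble_max_trick` helper above relies on the Gumbel-max trick: adding independent Gumbel(0, 1) noise to the temperature-scaled values and taking the argmax is equivalent to sampling from the softmax distribution. A quick standalone check with hypothetical Q-values:

import numpy as np

rng = np.random.default_rng(0)
q, tau, n = np.array([1.0, 2.0, 0.5]), 1.0, 100_000

# exact softmax probabilities
p = np.exp(q / tau) / np.exp(q / tau).sum()

# Gumbel-max sampling: argmax of q/tau + Gumbel(0, 1) noise
gumbel = -np.log(-np.log(rng.random((n, len(q)))))
samples = np.argmax(q / tau + gumbel, axis=1)
freq = np.bincount(samples, minlength=len(q)) / n

print(np.round(p, 3), np.round(freq, 3))  # the two should roughly match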
@@ -425,7 +445,9 @@ def _predict_counterfactual_state_action_value(self, x: np.ndarray):
             x_.append(np.tile(x[i], (self.n_actions, 1)))
         x_ = np.array(x_).reshape((-1, x.shape[1]))
         a_ = np.tile(np.arange(self.n_actions), x.shape[0])
-        return self.base_policy.predict_value(x_, a_)  # (n_samples, n_actions)
+        return self.base_policy.predict_value(x_, a_).reshape(
+            (-1, self.n_actions)
+        )  # (n_samples, n_actions)
 
     def stochastic_action_with_pscore(self, x: np.ndarray):
         """Sample stochastic action with its pscore.
@@ -522,6 +544,14 @@ class ContinuousGaussianHead(BaseHead):
     This class should be used when action_space is not clipped.
     Otherwise, please use ContinuousTruncatedGaussianHead instead.
 
+    Given a deterministic policy, the Gaussian policy samples an action :math:`a \\in \\mathcal{A}` given state :math:`s` as follows.
+
+    .. math::
+        a \\sim Normal(\\pi(s), \\sigma)
+
+    where :math:`\\sigma` is the standard deviation of the normal distribution.
+    :math:`\\pi(s)` is the action chosen by the deterministic policy.
+
     Parameters
     -------
     base_policy: AlgoBase
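A minimal sketch of the Gaussian sampling described above, assuming a hypothetical deterministic action in place of the base policy's output.

import numpy as np

rng = np.random.default_rng(0)
greedy_action = np.array([1.5, -0.3])  # hypothetical pi(s) for two states
sigma = 0.1

# a ~ Normal(pi(s), sigma), drawn independently per state
sampled_action = rng.normal(loc=greedy_action, scale=sigma)
print(sampled_action)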
@@ -648,6 +678,16 @@ def sample_action(self, x: np.ndarray):
 class ContinuousTruncatedGaussianHead(BaseHead):
     """Class to sample action from Truncated Gaussian distribution.
+
+    Note
+    -------
+    Given a deterministic policy, the truncated Gaussian policy samples an action :math:`a \\in \\mathcal{A}` given state :math:`s` as follows.
+
+    .. math::
+        a \\sim TruncNorm(\\pi(s), \\sigma)
+
+    where :math:`\\sigma` is the standard deviation of the truncated normal distribution.
+    :math:`\\pi(s)` is the action chosen by the deterministic policy.
 
     Parameters
     -------
     base_policy: AlgoBase
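Likewise for the truncated case, a minimal sketch using `scipy.stats.truncnorm`, assuming hypothetical action bounds (the class's actual bound parameters may be named differently).

import numpy as np
from scipy.stats import truncnorm

greedy_action, sigma = 1.5, 0.1
vmin, vmax = 0.0, 2.0  # hypothetical clipping range of the action space

# truncnorm takes bounds in standard-deviation units relative to loc
a, b = (vmin - greedy_action) / sigma, (vmax - greedy_action) / sigma
sampled_action = truncnorm.rvs(a, b, loc=greedy_action, scale=sigma, random_state=0)
print(sampled_action)  # always falls inside [vmin, vmax]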
9 changes: 9 additions & 0 deletions examples/quickstart/rtb_synthetic_continuous.ipynb
@@ -163,6 +163,8 @@
 "- `reward`: Total number of clicks or conversions obtained during the timestep.\n",
 "- `constraints`: The pre-determined episodic budget should not be exceeded.\n",
 "\n",
+"For more about the environmental configuration, please refer to [examples/quickstart/rtb_synthetic_customize_env.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_customize_env.ipynb).\n",
+"\n",
 "Let's see how it works!"
 ]
},
@@ -2529,6 +2531,13 @@
 "df.describe()"
 ]
},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "For more about the data collection and visualization, please refer to [examples/quickstart/rtb_synthetic_data_collection.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_data_collection.ipynb)."
+ ]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
9 changes: 3 additions & 6 deletions examples/quickstart/rtb_synthetic_customize_env.ipynb
@@ -12,8 +12,7 @@
 "2. Customize Environmental Configuration\n",
 "3. Customize Bidding Setup in RTB Env\n",
 "\n",
-"\\* This library uses [d3rlpy](https://github.com/takuseno/d3rlpy)'s algorithm implementations of online/offline rl policies and model-based evaluation. \n",
-"\\* Also, our implementations of OPE are highly inspired by [Open Bandit Pipeline](https://github.com/st-tech/zr-obp)."
+"\\* This library uses [d3rlpy](https://github.com/takuseno/d3rlpy)'s algorithm implementations. "
 ]
},
{
@@ -1154,7 +1153,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"For the more visualization, please refer to [examples/quickstart/rtb_synthetic_visualize_logged_data.ipynb]()."
+"For the data collection and visualization, please refer to [examples/quickstart/rtb_synthetic_data_collection.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_data_collection.ipynb). \\\n",
+"For offline RL and OPE procedures, please refer to [examples/quickstart/rtb_synthetic_discrete.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/master/examples/quickstart/rtb_synthetic_discrete.ipynb) and [examples/quickstart/rtb_synthetic_continuous.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/master/examples/quickstart/rtb_synthetic_continuous.ipynb)."
 ]
},
{
@@ -1163,9 +1163,6 @@
 "source": [
 "## Reference\n",
 "\n",
-"- Yuta Saito, Shunsuke Aihara, Megumi Matsutani, and Yusuke Narita. \\\n",
-"\"Open Bandit Dataset and Pipeline: Towards Realistic and Reproducible Off-Policy Evaluation.\", 2021.\n",
-"\n",
 "- Takuma Seno and Michita Imai. \\\n",
 "\"d3rlpy: An Offline Deep Reinforcement Library.\", 2021.\n",
