update docstrings, etc.
aiueola committed Jan 13, 2022
1 parent 9c1e83e commit 99a2f17
Showing 7 changed files with 2,068 additions and 1,552 deletions.
22 changes: 12 additions & 10 deletions _gym/env/simulator/rtb_synthetic.py
@@ -194,13 +194,13 @@ def __post_init__(self):
         if self.ad_sampling_rate.ndim == 1:
             self.ad_sampling_rate = np.tile(
                 self.ad_sampling_rate / self.ad_sampling_rate.sum(),
-                (self.step_per_episode, 1)
+                (self.step_per_episode, 1),
             )
         else:
-            self.ad_sampling_rate = self.ad_sampling_rate / np.tile(
-                np.sum(self.ad_sampling_rate, axis=1),
-                (self.n_ads, 1)
-            ).T
+            self.ad_sampling_rate = (
+                self.ad_sampling_rate
+                / np.tile(np.sum(self.ad_sampling_rate, axis=1), (self.n_ads, 1)).T
+            )
         check_array(
             self.ad_sampling_rate,
             name="ad_sampling_rate",
@@ -225,13 +225,15 @@ def __post_init__(self):
         if self.user_sampling_rate.ndim == 1:
             self.user_sampling_rate = np.tile(
                 self.user_sampling_rate / self.user_sampling_rate.sum(),
-                (self.step_per_episode, 1)
+                (self.step_per_episode, 1),
             )
         else:
-            self.user_sampling_rate = self.user_sampling_rate / np.tile(
-                np.sum(self.user_sampling_rate, axis=1),
-                (self.n_users, 1)
-            ).T
+            self.user_sampling_rate = (
+                self.user_sampling_rate
+                / np.tile(
+                    np.sum(self.user_sampling_rate, axis=1), (self.n_users, 1)
+                ).T
+            )
         check_array(
             self.user_sampling_rate,
             name="user_sampling_rate",
50 changes: 45 additions & 5 deletions _gym/policy/head.py
@@ -205,7 +205,17 @@ def calc_pscore_given_action(self, x: np.ndarray, action: np.ndarray):
 
 @dataclass
 class DiscreteEpsilonGreedyHead(BaseHead):
-    """Class to convert greedy policy into e-greedy.
+    """Class to convert a deterministic policy into an epsilon-greedy policy.
+
+    Note
+    -------
+    Epsilon-greedy policy stochastically chooses actions (i.e., :math:`a \\in \\mathcal{A}`) given state :math:`s` as follows.
+
+    .. math::
+        \\pi(a \\mid s) := (1 - \\epsilon) * \\mathbb{I}(a = a^*) + \\epsilon / |\\mathcal{A}|
+
+    where :math:`\\epsilon` is the probability of taking random actions and :math:`a^*` is the greedy action.
+    :math:`\\mathbb{I}(\\cdot)` denotes the indicator function.
+
     Parameters
     -------
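A minimal standalone sketch of the epsilon-greedy rule in the Note above; `greedy_actions` is a hypothetical stand-in for the base policy's deterministic choices, not the actual BaseHead API.

import numpy as np

def epsilon_greedy_pscore(greedy_actions: np.ndarray, n_actions: int, epsilon: float) -> np.ndarray:
    """Return pi(a|s) for every action: (1 - eps) * I(a = a*) + eps / |A|."""
    n_samples = len(greedy_actions)
    pscore = np.full((n_samples, n_actions), epsilon / n_actions)
    pscore[np.arange(n_samples), greedy_actions] += 1 - epsilon
    return pscore  # each row sums to 1

# e.g., greedy action 2 out of 4 actions with epsilon = 0.1
print(epsilon_greedy_pscore(np.array([2]), n_actions=4, epsilon=0.1))
# [[0.025 0.025 0.925 0.025]]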
@@ -340,7 +350,17 @@ def sample_action(self, x: np.ndarray):
 
 @dataclass
 class DiscreteSoftmaxHead(BaseHead):
-    """Class to convert policy values into softmax policy.
+    """Class to convert a Q-learning based policy into a softmax policy.
+
+    Note
+    -------
+    Softmax policy stochastically chooses actions (i.e., :math:`a \\in \\mathcal{A}`) given state :math:`s` as follows.
+
+    .. math::
+        \\pi(a \\mid s) := \\frac{\\exp(Q(s, a) / \\tau)}{\\sum_{a' \\in \\mathcal{A}} \\exp(Q(s, a') / \\tau)}
+
+    where :math:`\\tau` is the temperature parameter of the softmax function.
+    :math:`Q(s, a)` is the predicted value for the given :math:`(s, a)` pair.
+
     Parameters
     -------
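A minimal standalone sketch of the softmax rule in the Note above, using hypothetical Q-values rather than the base policy's actual predictions.

import numpy as np

def softmax_pscore(q_values: np.ndarray, tau: float = 1.0) -> np.ndarray:
    """pi(a|s) = exp(Q(s,a)/tau) / sum_a' exp(Q(s,a')/tau), computed row-wise."""
    logits = q_values / tau
    logits -= logits.max(axis=1, keepdims=True)  # subtract the max for numerical stability
    exp = np.exp(logits)
    return exp / exp.sum(axis=1, keepdims=True)

q = np.array([[1.0, 2.0, 0.5]])
print(softmax_pscore(q, tau=1.0))   # peaked on the second action
print(softmax_pscore(q, tau=10.0))  # closer to uniform as tau grows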
@@ -406,8 +426,8 @@ def _gumble_max_trick(self, x: np.ndarray):
         gumble_variable = -np.log(-np.log(self.random_.rand(len(x), self.n_actions)))
         return np.argmax(x / self.tau + gumble_variable, axis=1)
 
-    def _predict_counterfactual_state_action_value(self, x: np.ndarray):
-        """Predict counterfactual state action value.
+    def _predict_value(self, x: np.ndarray):
+        """Predict state action value for all possible actions.
 
         Parameters
         -------
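The `_gumble_max_trick` helper above relies on the Gumbel-max trick: adding independent Gumbel(0, 1) noise to the temperature-scaled values and taking the argmax is equivalent to sampling from the softmax distribution. A quick standalone check with hypothetical Q-values:

import numpy as np

rng = np.random.default_rng(0)
q, tau, n = np.array([1.0, 2.0, 0.5]), 1.0, 100_000

# exact softmax probabilities
p = np.exp(q / tau) / np.exp(q / tau).sum()

# Gumbel-max sampling: argmax of q/tau + Gumbel(0, 1) noise
gumbel = -np.log(-np.log(rng.random((n, len(q)))))
samples = np.argmax(q / tau + gumbel, axis=1)
freq = np.bincount(samples, minlength=len(q)) / n

print(np.round(p, 3), np.round(freq, 3))  # the two should roughly match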
@@ -425,7 +445,9 @@ def _predict_counterfactual_state_action_value(self, x: np.ndarray):
             x_.append(np.tile(x[i], (self.n_actions, 1)))
         x_ = np.array(x_).reshape((-1, x.shape[1]))
         a_ = np.tile(np.arange(self.n_actions), x.shape[0])
-        return self.base_policy.predict_value(x_, a_)  # (n_samples, n_actions)
+        return self.base_policy.predict_value(x_, a_).reshape(
+            (-1, self.n_actions)
+        )  # (n_samples, n_actions)
 
     def stochastic_action_with_pscore(self, x: np.ndarray):
         """Sample stochastic action with its pscore.
@@ -522,6 +544,14 @@ class ContinuousGaussianHead(BaseHead):
     This class should be used when action_space is not clipped.
     Otherwise, please use ContinuousTruncatedGaussianHead instead.
 
+    Given a deterministic policy, the Gaussian policy samples an action :math:`a \\in \\mathcal{A}` given state :math:`s` as follows.
+
+    .. math::
+        a \\sim Normal(\\pi(s), \\sigma)
+
+    where :math:`\\sigma` is the standard deviation of the normal distribution.
+    :math:`\\pi(s)` is the action chosen by the deterministic policy.
+
     Parameters
     -------
     base_policy: AlgoBase
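A minimal sketch of the Gaussian sampling described above, assuming a hypothetical deterministic action in place of the base policy's output.

import numpy as np

rng = np.random.default_rng(0)
greedy_action = np.array([1.5, -0.3])  # hypothetical pi(s) for two states
sigma = 0.1

# a ~ Normal(pi(s), sigma), drawn independently per state
sampled_action = rng.normal(loc=greedy_action, scale=sigma)
print(sampled_action)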
@@ -648,6 +678,16 @@ def sample_action(self, x: np.ndarray):
 class ContinuousTruncatedGaussianHead(BaseHead):
     """Class to sample action from Truncated Gaussian distribution.
+
+    Note
+    -------
+    Given a deterministic policy, the truncated Gaussian policy samples an action :math:`a \\in \\mathcal{A}` given state :math:`s` as follows.
+
+    .. math::
+        a \\sim TruncNorm(\\pi(s), \\sigma)
+
+    where :math:`\\sigma` is the standard deviation of the truncated normal distribution.
+    :math:`\\pi(s)` is the action chosen by the deterministic policy.
 
     Parameters
     -------
     base_policy: AlgoBase
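Likewise for the truncated case, a minimal sketch using `scipy.stats.truncnorm`, assuming hypothetical action bounds (the class's actual bound parameters may be named differently).

import numpy as np
from scipy.stats import truncnorm

greedy_action, sigma = 1.5, 0.1
vmin, vmax = 0.0, 2.0  # hypothetical clipping range of the action space

# truncnorm takes bounds in standard-deviation units relative to loc
a, b = (vmin - greedy_action) / sigma, (vmax - greedy_action) / sigma
sampled_action = truncnorm.rvs(a, b, loc=greedy_action, scale=sigma, random_state=0)
print(sampled_action)  # always falls inside [vmin, vmax]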
9 changes: 9 additions & 0 deletions examples/quickstart/rtb_synthetic_continuous.ipynb
@@ -163,6 +163,8 @@
 "- `reward`: Total number of clicks or conversions obtained during the timestep.\n",
 "- `constraints`: The pre-determined episodic budget should not be exceeded.\n",
 "\n",
+"For more about the environmental configuration, please refer to [examples/quickstart/rtb_synthetic_customize_env.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_customize_env.ipynb).\n",
+"\n",
 "Let's see how it works!"
 ]
},
@@ -2529,6 +2531,13 @@
 "df.describe()"
 ]
},
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+  "For more about the data collection and visualization, please refer to [examples/quickstart/rtb_synthetic_data_collection.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_data_collection.ipynb)."
+ ]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
9 changes: 3 additions & 6 deletions examples/quickstart/rtb_synthetic_customize_env.ipynb
@@ -12,8 +12,7 @@
 "2. Customize Environmental Configuration\n",
 "3. Customize Bidding Setup in RTB Env\n",
 "\n",
-"\\* This library uses [d3rlpy](https://github.com/takuseno/d3rlpy)'s algorithm implementations of online/offline rl policies and model-based evaluation. \n",
-"\\* Also, our implementations of OPE are highly inspired by [Open Bandit Pipeline](https://github.com/st-tech/zr-obp)."
+"\\* This library uses [d3rlpy](https://github.com/takuseno/d3rlpy)'s algorithm implementations. "
 ]
},
{
@@ -1154,7 +1153,8 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"For the more visualization, please refer to [examples/quickstart/rtb_synthetic_visualize_logged_data.ipynb]()."
+"For the data collection and visualization, please refer to [examples/quickstart/rtb_synthetic_data_collection.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/ope/examples/quickstart/rtb_synthetic_data_collection.ipynb). \\\n",
+"For offline RL and OPE procedures, please refer to [examples/quickstart/rtb_synthetic_discrete.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/master/examples/quickstart/rtb_synthetic_discrete.ipynb) and [examples/quickstart/rtb_synthetic_continuous.ipynb](https://github.com/negocia-inc/rtb_reinforcement_learing/blob/master/examples/quickstart/rtb_synthetic_continuous.ipynb)."
 ]
},
{
@@ -1163,9 +1163,6 @@
 "source": [
 "## Reference\n",
 "\n",
-"- Yuta Saito, Shunsuke Aihara, Megumi Matsutani, and Yusuke Narita. \\\n",
-"\"Open Bandit Dataset and Pipeline: Towards Realistic and Reproducible Off-Policy Evaluation.\", 2021.\n",
-"\n",
 "- Takuma Seno and Michita Imai. \\\n",
 "\"d3rlpy: An Offline Deep Reinforcement Library.\", 2021.\n",
