diff --git a/.gitignore b/.gitignore
index 9cbec5f..c03eb82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@
 __pycache__/
 *.py[cod]
 *$py.class
+*.h
diff --git a/blackjack.py b/blackjack.py
index 170aafe..2c49955 100644
--- a/blackjack.py
+++ b/blackjack.py
@@ -1,19 +1,18 @@
+import logging
 from collections import defaultdict
+
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
 import numpy as np
-import logging

-logging.setLogRecordFactory(logging.LogRecord)
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)-15s - %(levelname)-5s - %(message)s')
+from envs.BlackjackEnv import Blackjack, ACE_VALUE
+from log import make_logger


 NO_ACE_LAYER = 0
 ACE_LAYER = 1
 N_USABLE_ACE_LAYERS = 2

-DEALER_SICK_SUM = 17
 DEALER_MIN = 1  # ACE is 1 or 10
 DEALER_MAX = 10  # Max in one card
 N_DEALER_CARD_SUM_POSSIBILITIES = DEALER_MAX - DEALER_MIN + 1
@@ -23,43 +22,19 @@
 PLAYER_MAX = 21  # Blackjack :)
 N_PLAYER_CARDS_SUM_POSSIBILITIES = PLAYER_MAX - PLAYER_MIN + 1

-# ACE, 2, 3, 4, 5, 6, 7, 8, 9, 10, Jack, Queen, King
-CARDS = np.arange(1, 13 + 1)
-ACE_CARD = 1
-JACK_CARD = 11
-QUEEN_CARD = 12
-KING_CARD = 13
-
-ACTION_STICK = 0
-ACTION_HIT = 1
-ACTIONS = [ACTION_STICK, ACTION_HIT]
-
-REWARD_WIN = 1
-REWARD_DRAW = 0
-REWARD_LOSS = -1
-
-BLACKJACK = 21
-
-GAME_STATE_IN_PROGRESS = 0
-GAME_STATE_WIN = 1
-GAME_STATE_LOSE = 2
-GAME_STATE_DRAW = 3
-

 class State:
-    def __init__(self, dealer, player):
-        self.dealer = list(dealer)
-        self.player = list(player)
-        self.dealer_sum = calculate_hand_sum(self.dealer)
-        self.player_sum = calculate_hand_sum(self.player)
-        self.player_has_usable_ace = has_ace_usable(self.player)
+    def __init__(self, dealer_sum, player_sum, player_has_usable_ace):
+        self.dealer_sum = dealer_sum
+        self.player_sum = player_sum
+        self.player_has_usable_ace = player_has_usable_ace

     def get_policy_player_sum(self):
         return self.player_sum - PLAYER_MIN

     def get_policy_dealer_sum(self):
-        if self.dealer[0] == ACE_CARD:
-            return DEALER_MIN - 1
+        if self.dealer_sum == ACE_VALUE:
+            return 0
         else:
             return self.dealer_sum - DEALER_MIN

@@ -67,196 +42,43 @@ def get_policy_has_usable_ace(self):
         return ACE_LAYER if self.player_has_usable_ace else NO_ACE_LAYER

     def __str__(self):
-        return 'State(dealer_sum={:2} dealer_cards={} player_sum({})={:2}) player_cards={}'.format(
-            self.dealer_sum, self.dealer,
+        return 'State(dealer_sum={:2} player_sum({})={:2})'.format(
+            self.dealer_sum,
             'has ace' if self.player_has_usable_ace else 'no ace',
-            self.player_sum, self.player)
+            self.player_sum)

     def __repr__(self):
         return self.__str__()

-    def to_key(self):
+    def to_policy_key(self):
         ace_layer = ACE_LAYER if self.player_has_usable_ace else NO_ACE_LAYER
         return self.get_policy_dealer_sum(), self.get_policy_player_sum(), ace_layer


-def card_value(card):
-    if card == ACE_CARD:
-        return 1, 11
-    elif card >= JACK_CARD:
-        return 10
-    else:
-        return card
-
-
-def decide_ace_value(hand):
-    ace_values = card_value(ACE_CARD)
-    if hand + max(ace_values) <= BLACKJACK:
-        value = max(ace_values)
-    else:
-        value = min(ace_values)
-    return value
-
-
-def calculate_hand_sum(cards):
-    hand = 0
-    aces = 0
-    for card in cards:
-        if card == ACE_CARD:
-            aces += 1
-        else:
-            hand += card_value(card)
-    while aces > 0:
-        hand += decide_ace_value(hand)
-        aces -= 1
-    return hand
-
-
-def draw_card(n=1):
-    if n == 1:
-        return np.random.choice(CARDS)
-    else:
-        return np.random.choice(CARDS, n)
-
-
-def has_ace_usable(cards):
-    hand = 0
-    aces = 0
-    for card in cards:
-        if card == ACE_CARD:
-            aces += 1
-        else:
-            hand += card_value(card)
-    if aces > 0:
-        ace_values = card_value(ACE_CARD)
-        return decide_ace_value(hand) == max(ace_values)
-    else:
-        return False
-
-
-def has_blackjack(card_sum):
-    return card_sum == BLACKJACK
-
-
-def has_natural(cards, card_sum):
-    return len(cards) == 2 and has_blackjack(card_sum)
-
-
-def determine_game_state(state):
-    dealer_sum = state.dealer_sum
-    player_sum = state.player_sum
-    if player_sum > BLACKJACK:
-        return GAME_STATE_LOSE
-    elif dealer_sum > BLACKJACK:
-        return GAME_STATE_WIN
-    elif dealer_sum == player_sum:
-        if has_natural(state.player, player_sum) and not has_natural(state.dealer, dealer_sum):
-            return GAME_STATE_WIN
-        else:
-            return GAME_STATE_DRAW
-    elif player_sum > dealer_sum:
-        return GAME_STATE_WIN
-    else:
-        return GAME_STATE_LOSE
-
-
-def get_reward(game_state):
-    if game_state == GAME_STATE_WIN:
-        return 1
-    elif game_state == GAME_STATE_LOSE:
-        return -1
-    elif game_state == GAME_STATE_DRAW:
-        return 0
-    else:
-        raise Exception('Invalid game state {}'.format(game_state))
-
-
-def is_player_busted(state):
-    if state.player_sum > BLACKJACK:
-        return True
-    else:
-        return False
-
-
-def should_remember(state):
-    return state.player_sum >= PLAYER_MIN
-
-
-def log_card(who, card):
-    value = card_value(card)
-    if card == ACE_CARD:
-        logging.debug('{} drawn: {:2} of value {}'.format(who, card, value))
-    else:
-        logging.debug('{} drawn: {:2} of value {:2}'.format(who, card, value))
-
-
-def generate_episode(player_policy, dealer_policy):
-    logging.debug('Generating episodes')
+def generate_episode(env: Blackjack, player_policy, ep_no):
     history = []
-    dealer_hidden = draw_card()
-    dealer = [draw_card()]
-    player = list(draw_card(2))
-    state = State(dealer, player)
-    if should_remember(state):
+    done = False
+    observation = env.reset()
+    while not done:
+        state = State(*observation)
         history.append(state)
-    logging.debug('Initial state: {}'.format(state))
-    if calculate_hand_sum(player) >= BLACKJACK:
-        logging.debug('Player has blackjack from initial hand: {}'.format(state))
-
-    # Player plays seeing only one dealers card
-    logging.debug('Player let\'s play')
-    action = ACTION_HIT
-    while state.player_sum < BLACKJACK and action == ACTION_HIT:
-        # Below PLAYER_MIN its boring above start using policy
-        action = ACTION_HIT if state.player_sum < PLAYER_MIN else player_policy[state.to_key()]
-        if action == ACTION_HIT:
-            card = draw_card()
-            log_card('Player', card)
-            player.append(card)
-            state = State(dealer, player)
-            # If things got interesting start remembering states
-            if should_remember(state):
-                history.append(State(dealer, player))
-
-    # Remove bust state
-    busted = is_player_busted(history[-1])
-    if busted: logging.debug('Player busted: {}'.format(history[-1]))
-    if busted and len(history) > 1:
-        logging.debug('Remove bust state: {}'.format(history[-1]))
-        history = history[:-1]
-
-    # Dealer shows a card and plays, it doest append history, but is needed to determine win or loss
-    dealer.append(dealer_hidden)
-    state = State(dealer, player)
-    logging.debug('Dealer showed card: {}'.format(state))
-    logging.debug('Dealer let\'s play')
-    action = ACTION_HIT
-    while state.dealer_sum < BLACKJACK and action == ACTION_HIT:
-        action = dealer_policy[state.dealer_sum]
-        if action == ACTION_HIT:
-            card = draw_card()
-            log_card('Dealer', card)
-            dealer.append(card)
-            state = State(dealer, player)
-
-    game_state = determine_game_state(state)
-    reward = get_reward(game_state)
-    logging.debug('Game reward is {} for final state {}'.format(reward, state))
+        log.debug('Episode no {}: {}'.format(ep_no, state))
+        observation, reward, done, auxiliary = env.step(player_policy[state.to_policy_key()])
     return history, reward


 if __name__ == '__main__':
+    log = make_logger(__name__, logging.DEBUG)
+    env = Blackjack()
     state_value = np.zeros((N_DEALER_CARD_SUM_POSSIBILITIES, N_PLAYER_CARDS_SUM_POSSIBILITIES, N_USABLE_ACE_LAYERS))
-    player_policy = np.ones(state_value.shape)
+    player_policy = np.ones(state_value.shape, dtype=np.int32)
     player_policy[:, (PLAYER_INIT_STICK_SUM - PLAYER_MIN):, :] = 0
-    dealer_policy = np.ones((BLACKJACK + 1,))  # Quick solution assume dealer can have sums 0 up to 21 so 22 states
-    dealer_policy[DEALER_SICK_SUM:] = 0  # Stick at DEALER_SICK_SUM or more
     returns = defaultdict(list)
-    for i in range(500000):
-        episode, reward = generate_episode(player_policy, dealer_policy)
-        logging.info('Episode no {} rewarded {:2}: {}'.format(i, reward, episode))
+    for i in range(100000):
+        episode, reward = generate_episode(env, player_policy, i)
+        log.info('Episode no {} rewarded {:2}: {}'.format(i, reward, episode))
         for state in episode:
-            key = state.to_key()
+            key = state.to_policy_key()
             returns[key].append(reward)
             state_value[key] = np.mean(returns[key])
diff --git a/blackjack_action_value.py b/blackjack_action_value.py
index 2e019c8..4f44b1b 100644
--- a/blackjack_action_value.py
+++ b/blackjack_action_value.py
@@ -1,309 +1,91 @@
+import logging
 from collections import defaultdict
+
 import matplotlib.pyplot as plt
 from mpl_toolkits.mplot3d import Axes3D
 import numpy as np
-import logging
-
-logging.setLogRecordFactory(logging.LogRecord)
-logging.basicConfig(level=logging.INFO,
-                    format='%(asctime)-15s - %(levelname)-5s - %(message)s')
-
-NO_ACE_LAYER = 0
-ACE_LAYER = 1
-N_USABLE_ACE_LAYERS = 2
-
-DEALER_SICK_SUM = 17
-DEALER_MIN = 1  # ACE is 1 or 10
-DEALER_MAX = 10  # Max in one card
-N_DEALER_CARD_SUM_POSSIBILITIES = DEALER_MAX - DEALER_MIN + 1
-
-PLAYER_INIT_STICK_SUM = 20
-PLAYER_MIN = 12  # Below 12 always hit
-PLAYER_MAX = 21  # Blackjack :)
-N_PLAYER_CARDS_SUM_POSSIBILITIES = PLAYER_MAX - PLAYER_MIN + 1
-
-# ACE, 2, 3, 4, 5, 6, 7, 8, 9, 10, Jack, Queen, King
-CARDS = np.arange(1, 13 + 1)
-ACE_CARD = 1
-JACK_CARD = 11
-QUEEN_CARD = 12
-KING_CARD = 13
-
-ACTION_STICK = 0
-ACTION_HIT = 1
-ACTIONS = [ACTION_STICK, ACTION_HIT]
-N_ACTIONS = len(ACTIONS)
-
-REWARD_WIN = 1
-REWARD_DRAW = 0
-REWARD_LOSS = -1
-
-BLACKJACK = 21
-
-GAME_STATE_IN_PROGRESS = 0
-GAME_STATE_WIN = 1
-GAME_STATE_LOSE = 2
-GAME_STATE_DRAW = 3
-
-class State:
-    def __init__(self, dealer, player, player_action=None):
-        self.dealer = list(dealer)
-        self.player = list(player)
-        self.player_action = int(player_action) if player_action is not None else None
-        self.dealer_sum = calculate_hand_sum(self.dealer)
-        self.player_sum = calculate_hand_sum(self.player)
-        self.player_has_usable_ace = has_ace_usable(self.player)
+from blackjack import State, NO_ACE_LAYER, ACE_LAYER, N_DEALER_CARD_SUM_POSSIBILITIES, \
+    N_PLAYER_CARDS_SUM_POSSIBILITIES, \
+    DEALER_MIN, PLAYER_MIN, PLAYER_INIT_STICK_SUM, N_USABLE_ACE_LAYERS
+from envs.BlackjackEnv import Blackjack, ACE_VALUE, ACTIONS, BLACKJACK, N_ACTIONS
+from log import make_logger

-    def get_policy_player_sum(self):
-        return self.player_sum - PLAYER_MIN

-    def get_policy_dealer_sum(self):
-        if self.dealer[0] == ACE_CARD:
-            return DEALER_MIN - 1
-        else:
-            return self.dealer_sum - DEALER_MIN
-
-    def get_policy_has_usable_ace(self):
-        return ACE_LAYER if self.player_has_usable_ace else NO_ACE_LAYER
+class ActionState(State):
+    def __init__(self, dealer, player, has_ace, player_action=None):
+        super().__init__(dealer, player, has_ace)
+        self.player_action = player_action

     def __str__(self):
-        return 'State(dealer_sum={:2} dealer_cards={} player_sum({})={:2}) player_action={} player_cards={}'.format(
-            self.dealer_sum, self.dealer,
+        return 'ActionState(dealer_sum={:2} player_sum({})={:2} action={})'.format(
+            self.dealer_sum,
             'has ace' if self.player_has_usable_ace else 'no ace',
-            self.player_sum, self.player_action, self.player)
-
-    def __repr__(self):
-        return self.__str__()
-
-    def to_policy_key(self):
-        ace_layer = ACE_LAYER if self.player_has_usable_ace else NO_ACE_LAYER
-        return self.get_policy_dealer_sum(), self.get_policy_player_sum(), ace_layer
+            self.player_sum, self.player_action)

     def to_state_action_key(self):
         return (*self.to_policy_key(), self.player_action)


-def card_value(card):
-    if card == ACE_CARD:
-        return 1, 11
-    elif card >= JACK_CARD:
-        return 10
-    else:
-        return card
-
-
-def decide_ace_value(hand):
-    ace_values = card_value(ACE_CARD)
-    if hand + max(ace_values) <= BLACKJACK:
-        value = max(ace_values)
-    else:
-        value = min(ace_values)
-    return value
-
-
-def calculate_hand_sum(cards):
-    hand = 0
-    aces = 0
-    for card in cards:
-        if card == ACE_CARD:
-            aces += 1
-        else:
-            hand += card_value(card)
-    while aces > 0:
-        hand += decide_ace_value(hand)
-        aces -= 1
-    return hand
-
-
-def draw_card(n=1):
-    if n == 1:
-        return np.random.choice(CARDS)
-    else:
-        return np.random.choice(CARDS, n)
-
-
-def has_ace_usable(cards):
-    hand = 0
-    aces = 0
-    for card in cards:
-        if card == ACE_CARD:
-            aces += 1
-        else:
-            hand += card_value(card)
-    if aces > 0:
-        ace_values = card_value(ACE_CARD)
-        return decide_ace_value(hand) == max(ace_values)
-    else:
-        return False
-
-
-def has_blackjack(card_sum):
-    return card_sum == BLACKJACK
-
-
-def has_natural(cards, card_sum):
-    return len(cards) == 2 and has_blackjack(card_sum)
-
-
-def determine_game_state(state):
-    dealer_sum = state.dealer_sum
-    player_sum = state.player_sum
-    if player_sum > BLACKJACK:
-        return GAME_STATE_LOSE
-    elif dealer_sum > BLACKJACK:
-        return GAME_STATE_WIN
-    elif dealer_sum == player_sum:
-        if has_natural(state.player, player_sum) and not has_natural(state.dealer, dealer_sum):
-            return GAME_STATE_WIN
-        else:
-            return GAME_STATE_DRAW
-    elif player_sum > dealer_sum:
-        return GAME_STATE_WIN
-    else:
-        return GAME_STATE_LOSE
-
-
-def get_reward(game_state):
-    if game_state == GAME_STATE_WIN:
-        return 1
-    elif game_state == GAME_STATE_LOSE:
-        return -1
-    elif game_state == GAME_STATE_DRAW:
-        return 0
-    else:
-        raise Exception('Invalid game state {}'.format(game_state))
-
-
-def is_player_busted(state):
-    if state.player_sum > BLACKJACK:
-        return True
-    else:
-        return False
-
-
-def should_remember(state):
-    return state.player_sum >= PLAYER_MIN
-
-
-def log_card(who, card):
-    value = card_value(card)
-    if card == ACE_CARD:
-        logging.debug('{} drawn: {:2} of value {}'.format(who, card, value))
-    else:
-        logging.debug('{} drawn: {:2} of value {:2}'.format(who, card, value))
-
-
-def generate_episode(player_policy, dealer_policy):
-    logging.debug('Generating episodes')
+def generate_episode(env: Blackjack, player_policy, init_action, ep_no):
     history = []
-
-    # Exploring starts
-    action = np.random.choice(ACTIONS)
-    dealer_hidden = draw_card()
-    dealer = [draw_card()]
-    player = list(draw_card(2))
-    while calculate_hand_sum(player) < PLAYER_MIN or calculate_hand_sum(player) > PLAYER_MAX:
-        player = list(draw_card(2))
-    state = State(dealer, player, action)
-    history.append(state)
-    logging.debug('Initial state: {}'.format(state))
-    if calculate_hand_sum(player) >= BLACKJACK:
-        logging.debug('Player has blackjack from initial hand: {}'.format(state))
-    if action == ACTION_HIT:
-        state = player_hit(action, dealer, history, player)
-        # Corner case when we had ace and we get 10 value card so old ace counts as 1
-        # and we still have blackjack, but action should be now populated from policy
-        if not is_player_busted(state):
-            action = player_policy[state.to_policy_key()]
-            state.player_action = action
-
-    # Player plays seeing only one dealers card
-    logging.debug('Player let\'s play')
-    while state.player_sum < BLACKJACK and action == ACTION_HIT:
-        if action == ACTION_HIT:
-            state = player_hit(action, dealer, history, player)
-            # Sutton: the expected return when starting in state s, taking action a, and thereafter following policy Pi
-            # So action should be populated given current state
-            if not is_player_busted(state):
-                action = player_policy[state.to_policy_key()]
-                state.player_action = action
-
-    # Remove bust state
-    busted = is_player_busted(history[-1])
-    if busted: logging.debug('Player busted: {}'.format(history[-1]))
-    if busted and len(history) > 1:
-        logging.debug('Remove bust state: {}'.format(history[-1]))
-        history = history[:-1]
-
-    # Dealer shows a card and plays, it doest append history, but is needed to determine win or loss
-    dealer.append(dealer_hidden)
-    state = State(dealer, player)
-    logging.debug('Dealer showed card: {}'.format(state))
-    logging.debug('Dealer let\'s play')
-    action = ACTION_HIT
-    while state.dealer_sum < BLACKJACK and action == ACTION_HIT:
-        action = dealer_policy[state.dealer_sum]
-        if action == ACTION_HIT:
-            card = draw_card()
-            log_card('Dealer', card)
-            dealer.append(card)
-            state = State(dealer, player)
-
-    game_state = determine_game_state(state)
-    reward = get_reward(game_state)
-    logging.debug('Game reward is {} for final state {}'.format(reward, state))
-    return history, reward
-
-
-def player_hit(action, dealer, history, player):
-    card = draw_card()
-    log_card('Player', card)
-    player.append(card)
-    state = State(dealer, player, action)
-    # If things got interesting start remembering states
-    if should_remember(state):
+    done = False
+    observation = env.reset()
+    while not done:
+        state = State(*observation)
+        # Exploring starts
+        action = init_action if len(history) == 0 else player_policy[state.to_policy_key()]
+        state = ActionState(*observation, action)
         history.append(state)
-    return state
+        log.debug('Episode no {}: {}'.format(ep_no, state))
+        observation, reward, done, auxiliary = env.step(action)
+    return history, reward


 def policy_improvement(episodes, player_policy, action_values):
     new_policy = player_policy.copy()
     for state in episodes:
-        i = np.argmax([action_values[(*state.to_policy_key(), action)] for action in ACTIONS]).flatten()[0]
-        new_policy[state.to_policy_key()] = ACTIONS[i]
+        new_policy[state.to_policy_key()] = action_values[state.to_policy_key()].argmax()
     return new_policy


-def to_state_value(action_values, player_policy):
-    values = np.zeros(player_policy.shape)
-    for index, value in np.ndenumerate(player_policy):
-        values[index] = action_values[(*index, value)]
+def to_state_value(action_values):
+    values = np.zeros(action_values.shape[:-1])
+    for index, value in np.ndenumerate(action_values):
+        values[index[:-1]] = action_values[index[:-1]].max()
     return values


+def to_policy(action_values):
+    policy = np.zeros(action_values.shape[:-1])
+    for index, value in np.ndenumerate(action_values):
+        policy[index[:-1]] = action_values[index[:-1]].argmax()
+    return policy
+
+
 if __name__ == '__main__':
+    log = make_logger(__name__, logging.DEBUG)
+    env = Blackjack()
     action_values = np.zeros(
         (N_DEALER_CARD_SUM_POSSIBILITIES, N_PLAYER_CARDS_SUM_POSSIBILITIES, N_USABLE_ACE_LAYERS, N_ACTIONS))
     player_policy = np.ones(action_values.shape[:-1], dtype=np.int32)
     player_policy[:, (PLAYER_INIT_STICK_SUM - PLAYER_MIN):, :] = 0
-    dealer_policy = np.ones((BLACKJACK + 1,))  # Quick solution assume dealer can have sums 0 up to 21 so 22 states
-    dealer_policy[DEALER_SICK_SUM:] = 0  # Stick at DEALER_SICK_SUM or more
     returns = defaultdict(list)
-    for i in range(1000000):
-        episode, reward = generate_episode(player_policy, dealer_policy)
-        logging.info('Episode no {} rewarded {:2}: {}'.format(i, reward, episode))
+    for i in range(500000):
+        episode, reward = generate_episode(env, player_policy, np.random.choice(ACTIONS), i)
+        log.info('Episode no {} rewarded {:2}: {}'.format(i, reward, episode))
         for state in episode:
             key = state.to_state_action_key()
             returns[key].append(reward)
             action_values[key] = np.mean(returns[key])
         new_policy = policy_improvement(episode, player_policy, action_values)
-        logging.info('Changes made to policy: {}'.format((new_policy != player_policy).sum()))
+        log.info('Changes made to policy: {}'.format((new_policy != player_policy).sum()))
         player_policy = new_policy

-    state_values = to_state_value(action_values, player_policy)
+    state_values = to_state_value(action_values)
+    player_policy = to_policy(action_values)

     X, Y = np.meshgrid(np.arange(0, state_values.shape[0]) + DEALER_MIN,
                        np.arange(0, state_values.shape[1]) + PLAYER_MIN)
     fig = plt.figure()
diff --git a/envs/BlackjackEnv.py b/envs/BlackjackEnv.py
new file mode 100644
index 0000000..287ed64
--- /dev/null
+++ b/envs/BlackjackEnv.py
@@ -0,0 +1,129 @@
+from gym import Env
+import numpy as np
+from gym.spaces.discrete import Discrete
+
+ACE_CARD = 1
+ACE_VALUE = 11
+BLACKJACK = 21
+
+PLAYER_MIN = 12
+
+ACTION_STICK = 0
+ACTION_HIT = 1
+ACTIONS = [ACTION_STICK, ACTION_HIT]
+N_ACTIONS = len(ACTIONS)
+
+DEALER_SICK_SUM = 17
+
+
+class Blackjack(Env):
+    metadata = {'render.modes': ['human']}
+
+    def __init__(self):
+        self.seed_num = None
+        self.dealer = []
+        self.player = []
+        # ACE, 2, 3, 4, 5, 6, 7, 8, 9, 10, Jack, Queen, King
+        self.deck = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10])
+        self.action_space = Discrete(N_ACTIONS)
+        self.reward_range = (-1, 1)
+        self.dealer_stop = DEALER_SICK_SUM
+
+    def _render(self, mode='human', close=False):
+        print('Dealer: sum={:2} cards={:4}'.format(self.calculate_hand_sum(self.dealer), str(self.dealer)), end=' ')
+        print('Player: sum={:2} cards={}'.format(self.calculate_hand_sum(self.player), str(self.player)))
+
+    def _step(self, action):
+        assert self.action_space.contains(action)
+        done = False
+        if action == ACTION_HIT:
+            self.player += self.draw_card()
+            if self.is_busted(self.player):
+                done = True
+        else:
+            done = True
+            while self.calculate_hand_sum(self.dealer) < self.dealer_stop:
+                self.dealer += self.draw_card()
+
+        if done:
+            reward = self.calculate_reward()
+        else:
+            reward = 0
+        return self._observation(), reward, done, self._auxiliary()
+
+    def _reset(self):
+        self.player = list(self.draw_card(2))
+        while self.calculate_hand_sum(self.player) < PLAYER_MIN:
+            self.player += self.draw_card(1)
+        self.dealer = self.draw_card()
+        return self._observation()
+
+    def _seed(self, seed=None):
+        self.seed_num = seed
+        return [self.seed_num]
+
+    def draw_card(self, n=1):
+        return list(np.random.choice(self.deck, n))
+
+    def calculate_hand_sum(self, cards):
+        if self.has_usable_ace(cards):
+            return sum(cards) + 10
+        else:
+            return sum(cards)
+
+    def has_usable_ace(self, player):
+        return ACE_CARD in player and sum(player) + 10 <= BLACKJACK
+
+    def is_busted(self, player):
+        return self.calculate_hand_sum(player) > BLACKJACK
+
+    def calculate_reward(self):
+        if self.is_busted(self.player):
+            return -1
+        elif self.is_busted(self.dealer):
+            return 1
+        elif self.is_natural(self.player):
+            return 0 if self.is_natural(self.dealer) else 1
+        elif self.calculate_hand_sum(self.player) == self.calculate_hand_sum(self.dealer):
+            return 0
+        else:
+            return 1 if self.calculate_hand_sum(self.player) > self.calculate_hand_sum(self.dealer) else -1
+
+    def is_natural(self, player):
+        return self.calculate_hand_sum(player) == BLACKJACK and len(player) == 2
+
+    def _observation(self):
+        return self.calculate_hand_sum(self.dealer), \
+               self.calculate_hand_sum(self.player), \
+               self.has_usable_ace(self.player)
+
+    def _auxiliary(self):
+        return BlackjackAuxiliary(self.dealer, self.player)
+
+
+class BlackjackAuxiliary:
+    def __init__(self, dealer, player):
+        self.player_cards = player
+        self.dealer_cards = dealer
+
+
+def policy(observation):
+    if observation[1] < 20:
+        return ACTION_HIT
+    else:
+        return ACTION_STICK
+
+
+if __name__ == '__main__':
+    env = Blackjack()
+
+    for episode in range(10):
+        print('Episode no: {}'.format(episode))
+        done = False
+        observation = env.reset()
+        while not done:
+            env.render()
+            observation, reward, done, auxiliary = env.step(policy(observation))
+            if done:
+                env.render()
+                print('Reward: {}'.format(reward))
diff --git a/envs/__init__.py b/envs/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/log/__init__.py b/log/__init__.py
new file mode 100644
index 0000000..081323e
--- /dev/null
+++ b/log/__init__.py
@@ -0,0 +1,25 @@
+import logging
+
+import sys
+
+formatter = logging.Formatter('%(asctime)-15s - %(levelname)-5s - %(message)s')
+
+
+def make_logger(name, level=logging.DEBUG):
+    console = logging.StreamHandler(sys.stdout)
+    console.setFormatter(formatter)
+    log = logging.getLogger(name)
+    log.setLevel(level)
+    log.addHandler(console)
+    log.propagate = False
+    return log
+
+
+def make_file_logger(name, filename, level=logging.DEBUG):
+    file = logging.FileHandler(filename, mode='w')
+    file.setFormatter(formatter)
+    log = logging.getLogger(name)
+    log.setLevel(level)
+    log.addHandler(file)
+    log.propagate = False
+    return log