Coveralls coverage report: ishikota / kyoka, build 128

24 Dec 2016 - 8:19 · coverage: 98.031% (+0.03%) from 98.0%

Pull Request #33: release v0.3.0 (built by travis-ci)
Commit: "Merge pull request #35 from ishikota/mkdocs" (Introduce mkdocs)

77 of 80 new or added lines in 9 files covered. (96.25%)

1 existing line in 1 file now uncovered.

747 of 762 relevant lines covered (98.03%)

0.98 hits per line

Source File: /kyoka/algorithm/montecarlo.py (98.04% covered)

import os

from kyoka.utils import pickle_data, unpickle_data, value_function_check
from kyoka.value_function import BaseTabularActionValueFunction, BaseApproxActionValueFunction
from kyoka.algorithm.rl_algorithm import BaseRLAlgorithm, generate_episode


class MonteCarlo(BaseRLAlgorithm):
    """Every-visit Monte Carlo method with support for reward discounting.

    "Every-visit" means that every state appearing in an episode is used for
    the update, even if the same state appears more than once in the episode.

    The algorithm is implemented based on the book "Reinforcement Learning: An Introduction"
    (reference: https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf)

    - Algorithm -
    Initialize:
        T  <- your RL task
        PI <- policy used to generate episodes
        Q  <- action value function
    Repeat until the computational budget runs out:
        generate an episode of T by following policy PI
        for each state-action pair (S, A) that appeared in the episode:
            G <- sum of rewards gained after state S (discounted if gamma < 1)
            Q(S, A) <- average of all G sampled for (S, A) so far
    """

    def __init__(self, gamma=1):
        """
        If you want to discount future reward, set gamma < 1.

        For example, given an episode like this
        episode :
            (state0, action0) -> reward0 ->
            (state1, action1) -> reward1 ->
            (state2, action2) -> reward2 -> finish

        reward discounting is done like this:
            reward_sum_from_state0 = reward0 + gamma * reward1 + gamma**2 * reward2

        Args:
            gamma : discount factor of reward. default=1. 0 < gamma <= 1.
        """
        self.gamma = gamma

    def setup(self, task, policy, value_function):
        validate_value_function(value_function)
        super(MonteCarlo, self).setup(task, policy, value_function)

    def run_gpi_for_an_episode(self, task, policy, value_function):
        episode = generate_episode(task, policy, value_function)
        for idx, turn_info in enumerate(episode):
            state, action, _next_state, _reward = turn_info
            following_reward = self._calculate_following_state_reward(idx, episode)
            value_function.backup(state, action, following_reward, alpha="dummy")

    def _calculate_following_state_reward(self, current_turn, episode):
        following_turn_info = episode[current_turn:]
        following_reward = [reward for _, _, _, reward in following_turn_info]
        return sum([self.__discount(step, reward) for step, reward in enumerate(following_reward)])

    def __discount(self, step, reward):
        return self.gamma ** step * reward
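# Worked example of the discounting above (illustrative numbers, not from the report):
# with gamma = 0.9 and an episode whose rewards are [1, 2, 3],
# _calculate_following_state_reward gives
#     turn 0: 1 + 0.9 * 2 + 0.9**2 * 3 = 5.23
#     turn 1: 2 + 0.9 * 3              = 4.7
#     turn 2: 3                        = 3.0
# and each of these discounted returns G is passed to value_function.backup as the
# backup target for the corresponding (state, action) pair.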

class MonteCarloTabularActionValueFunction(BaseTabularActionValueFunction):
    """Tabular action value function for the MonteCarlo method.

    The backup target passed from MonteCarlo is G (the sum of rewards gained
    after state S), so backup just averages all G sampled for (S, A) so far.

    The average is calculated in a memory-efficient way
    ("_calc_average_in_incremental_way" is the method that calculates it).
    """

    SAVE_FILE_NAME = "montecarlo_update_counter.pickle"

    def setup(self):
        super(MonteCarloTabularActionValueFunction, self).setup()
        self.update_counter = self.generate_initial_table()

    def define_save_file_prefix(self):
        return "montecarlo"

    def save(self, save_dir_path):
        super(MonteCarloTabularActionValueFunction, self).save(save_dir_path)
        pickle_data(self._gen_update_counter_file_path(save_dir_path), self.update_counter)

    def load(self, load_dir_path):
        super(MonteCarloTabularActionValueFunction, self).load(load_dir_path)
        if not os.path.exists(self._gen_update_counter_file_path(load_dir_path)):
            raise IOError(  # marked "!" (not covered) in this build
                    'The saved data of "MonteCarlo" algorithm is not found in [ %s ]' %
                    load_dir_path)
        self.update_counter = unpickle_data(self._gen_update_counter_file_path(load_dir_path))

    def backup(self, state, action, backup_target, alpha):
        update_count = self.fetch_value_from_table(self.update_counter, state, action)
        Q_value = self.fetch_value_from_table(self.table, state, action)
        new_value = self._calc_average_in_incremental_way(update_count, backup_target, Q_value)
        self.insert_value_into_table(self.table, state, action, new_value)
        self.insert_value_into_table(self.update_counter, state, action, update_count+1)

    def _calc_average_in_incremental_way(self, k, r, Q):
        """Memory-efficient implementation to calculate the average"""
        return Q + 1.0 / (k + 1) * (r - Q)

    def _gen_update_counter_file_path(self, dir_path):
        return os.path.join(dir_path, self.SAVE_FILE_NAME)
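# Why the incremental update above is an average: after k samples the running mean
# is Q_k, and folding in a new sample r gives
#     Q_{k+1} = (k * Q_k + r) / (k + 1) = Q_k + 1 / (k + 1) * (r - Q_k)
# which is exactly _calc_average_in_incremental_way, with k read from update_counter.
# Illustrative numbers: with backup targets 4, 8, 6 the stored value evolves
# 4 -> 6 -> 6 (the first sample overwrites the initial table entry), matching the
# plain mean at every step while only the current mean and a counter are kept.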

class MonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
    """Approximate action value function for the MonteCarlo method.
    No additional methods beyond the base class are needed to use the MonteCarlo method.

    The backup target passed from MonteCarlo is G (the sum of rewards gained after state S),
    so backup should approximate the average of all G sampled for (S, A) so far.
    """
    pass

def validate_value_function(value_function):
    value_function_check("MonteCarlo",
            [MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction],
            value_function)
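For orientation, a minimal usage sketch of this module follows. Only MonteCarlo, MonteCarloTabularActionValueFunction, setup and run_gpi_for_an_episode come from the file above; MyTask, MyPolicy and the table-handling overrides are hypothetical placeholders for the task, policy and storage scheme a user would supply.

from kyoka.algorithm.montecarlo import MonteCarlo, MonteCarloTabularActionValueFunction

class MyValueFunction(MonteCarloTabularActionValueFunction):
    # Hypothetical table plumbing: backup above relies on generate_initial_table,
    # fetch_value_from_table and insert_value_into_table being provided.
    def generate_initial_table(self):
        return {}  # assumption: a dict keyed by (state, action)

    def fetch_value_from_table(self, table, state, action):
        return table.get((state, action), 0)

    def insert_value_into_table(self, table, state, action, value):
        table[(state, action)] = value

task = MyTask()      # hypothetical kyoka task describing the environment
policy = MyPolicy()  # hypothetical policy used to generate episodes

algorithm = MonteCarlo(gamma=0.99)
value_function = MyValueFunction()
algorithm.setup(task, policy, value_function)
for _ in range(1000):  # "repeat until the computational budget runs out"
    algorithm.run_gpi_for_an_episode(task, policy, value_function)

Each call to run_gpi_for_an_episode generates one episode with generate_episode and backs up the discounted return for every (state, action) pair in it, so repeated calls drive the tabular Q values toward the average sampled return.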
