Coveralls coverage report: ishikota / kyoka, build 128

24 Dec 2016 - 8:19 · coverage: 98.031% (+0.03%) from 98.0%

Pull Request #33: release v0.3.0 (built by travis-ci)
Commit: "Merge pull request #35 from ishikota/mkdocs" (Introduce mkdocs)

77 of 80 new or added lines in 9 files covered. (96.25%)

1 existing line in 1 file now uncovered.

747 of 762 relevant lines covered (98.03%)

0.98 hits per line

Source File: /kyoka/algorithm/montecarlo.py (98.04% covered)

import os

from kyoka.utils import pickle_data, unpickle_data, value_function_check
from kyoka.value_function import BaseTabularActionValueFunction, BaseApproxActionValueFunction
from kyoka.algorithm.rl_algorithm import BaseRLAlgorithm, generate_episode


class MonteCarlo(BaseRLAlgorithm):
    """Every-visit Monte Carlo method with support for reward discounting.

    "Every-visit" means that every state appearing in an episode is used for
    the update, even if the same state appears more than once in the episode.

    The algorithm is implemented based on the book "Reinforcement Learning: An Introduction"
    (reference: https://webdocs.cs.ualberta.ca/~sutton/book/bookdraft2016sep.pdf)

    - Algorithm -
    Initialize:
        T  <- your RL task
        PI <- policy used to generate episodes
        Q  <- action value function
    Repeat until the computational budget runs out:
        generate an episode of T by following policy PI
        for each state-action pair (S, A) that appeared in the episode:
            G <- sum of rewards gained after state S (discounted if gamma < 1)
            Q(S, A) <- average of all G sampled for (S, A) so far
    """

    def __init__(self, gamma=1):
        """
        If you want to discount future reward, set gamma < 1.

        For example, given an episode like this
        episode :
            (state0, action0) -> reward0 ->
            (state1, action1) -> reward1 ->
            (state2, action2) -> reward2 -> finish

        reward discounting is done like this:
            reward_sum_from_state0 = reward0 + gamma * reward1 + gamma**2 * reward2

        Args:
            gamma : discount factor of reward. default=1. 0 < gamma <= 1.
        """
        self.gamma = gamma

    def setup(self, task, policy, value_function):
        validate_value_function(value_function)
        super(MonteCarlo, self).setup(task, policy, value_function)

    def run_gpi_for_an_episode(self, task, policy, value_function):
        episode = generate_episode(task, policy, value_function)
        for idx, turn_info in enumerate(episode):
            state, action, _next_state, _reward = turn_info
            following_reward = self._calculate_following_state_reward(idx, episode)
            value_function.backup(state, action, following_reward, alpha="dummy")

    def _calculate_following_state_reward(self, current_turn, episode):
        following_turn_info = episode[current_turn:]
        following_reward = [reward for _, _, _, reward in following_turn_info]
        return sum([self.__discount(step, reward) for step, reward in enumerate(following_reward)])

    def __discount(self, step, reward):
        return self.gamma ** step * reward
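# Worked example of the discounting above (illustrative numbers, not from the report):
# with gamma = 0.9 and an episode whose rewards are [1, 2, 3],
# _calculate_following_state_reward gives
#     turn 0: 1 + 0.9 * 2 + 0.9**2 * 3 = 5.23
#     turn 1: 2 + 0.9 * 3              = 4.7
#     turn 2: 3                        = 3.0
# and each of these discounted returns G is passed to value_function.backup as the
# backup target for the corresponding (state, action) pair.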

class MonteCarloTabularActionValueFunction(BaseTabularActionValueFunction):
    """Tabular action value function for the MonteCarlo method.

    The backup target passed from MonteCarlo is G (the sum of rewards gained
    after state S), so backup just averages all G sampled for (S, A) so far.

    The average is calculated in a memory-efficient way
    ("_calc_average_in_incremental_way" is the method that calculates it).
    """

    SAVE_FILE_NAME = "montecarlo_update_counter.pickle"

    def setup(self):
        super(MonteCarloTabularActionValueFunction, self).setup()
        self.update_counter = self.generate_initial_table()

    def define_save_file_prefix(self):
        return "montecarlo"

    def save(self, save_dir_path):
        super(MonteCarloTabularActionValueFunction, self).save(save_dir_path)
        pickle_data(self._gen_update_counter_file_path(save_dir_path), self.update_counter)

    def load(self, load_dir_path):
        super(MonteCarloTabularActionValueFunction, self).load(load_dir_path)
        if not os.path.exists(self._gen_update_counter_file_path(load_dir_path)):
            raise IOError(  # marked "!" (not covered) in this build
                    'The saved data of "MonteCarlo" algorithm is not found in [ %s ]' %
                    load_dir_path)
        self.update_counter = unpickle_data(self._gen_update_counter_file_path(load_dir_path))

    def backup(self, state, action, backup_target, alpha):
        update_count = self.fetch_value_from_table(self.update_counter, state, action)
        Q_value = self.fetch_value_from_table(self.table, state, action)
        new_value = self._calc_average_in_incremental_way(update_count, backup_target, Q_value)
        self.insert_value_into_table(self.table, state, action, new_value)
        self.insert_value_into_table(self.update_counter, state, action, update_count+1)

    def _calc_average_in_incremental_way(self, k, r, Q):
        """Memory-efficient implementation to calculate the average"""
        return Q + 1.0 / (k + 1) * (r - Q)

    def _gen_update_counter_file_path(self, dir_path):
        return os.path.join(dir_path, self.SAVE_FILE_NAME)
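# Why the incremental update above is an average: after k samples the running mean
# is Q_k, and folding in a new sample r gives
#     Q_{k+1} = (k * Q_k + r) / (k + 1) = Q_k + 1 / (k + 1) * (r - Q_k)
# which is exactly _calc_average_in_incremental_way, with k read from update_counter.
# Illustrative numbers: with backup targets 4, 8, 6 the stored value evolves
# 4 -> 6 -> 6 (the first sample overwrites the initial table entry), matching the
# plain mean at every step while only the current mean and a counter are kept.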

class MonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
    """Approximate action value function for the MonteCarlo method.
    No additional methods beyond the base class are needed to use the MonteCarlo method.

    The backup target passed from MonteCarlo is G (the sum of rewards gained after state S),
    so backup should approximate the average of all G sampled for (S, A) so far.
    """
    pass

def validate_value_function(value_function):
    value_function_check("MonteCarlo",
            [MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction],
            value_function)
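For orientation, a minimal usage sketch of this module follows. Only MonteCarlo, MonteCarloTabularActionValueFunction, setup and run_gpi_for_an_episode come from the file above; MyTask, MyPolicy and the table-handling overrides are hypothetical placeholders for the task, policy and storage scheme a user would supply.

from kyoka.algorithm.montecarlo import MonteCarlo, MonteCarloTabularActionValueFunction

class MyValueFunction(MonteCarloTabularActionValueFunction):
    # Hypothetical table plumbing: backup above relies on generate_initial_table,
    # fetch_value_from_table and insert_value_into_table being provided.
    def generate_initial_table(self):
        return {}  # assumption: a dict keyed by (state, action)

    def fetch_value_from_table(self, table, state, action):
        return table.get((state, action), 0)

    def insert_value_into_table(self, table, state, action, value):
        table[(state, action)] = value

task = MyTask()      # hypothetical kyoka task describing the environment
policy = MyPolicy()  # hypothetical policy used to generate episodes

algorithm = MonteCarlo(gamma=0.99)
value_function = MyValueFunction()
algorithm.setup(task, policy, value_function)
for _ in range(1000):  # "repeat until the computational budget runs out"
    algorithm.run_gpi_for_an_episode(task, policy, value_function)

Each call to run_gpi_for_an_episode generates one episode with generate_episode and backs up the discounted return for every (state, action) pair in it, so repeated calls drive the tabular Q values toward the average sampled return.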
