[Just Code] Multi-Armed Bandits

import numpy as np
from tqdm import tqdm


class GreedyEpsilon(object):
    def __init__(self, n_arms, epsilon, average_sample, initial_value, gt_initial, step_size=0):
        self.n_arms = n_arms                  # number of arms
        self.epsilon = epsilon                # exploration probability
        self.average_sample = average_sample  # True: sample-average updates, False: constant step size
        self.initial_value = initial_value    # initial action-value estimates
        self.gt_initial = gt_initial          # initial true action values q*(a)
        self.step_size = step_size            # constant step size (used when average_sample is False)
        self.times = 0                        # total number of steps taken

    def reset(self):
        self.q_star = np.zeros(self.n_arms) + self.gt_initial        # true action values
        self.estimator = np.zeros(self.n_arms) + self.initial_value  # action-value estimates
        self.Act_num = np.zeros(self.n_arms)                         # selection count per arm
        self.optimal_action = np.argmax(self.q_star)                 # optimal arm at reset time
        self.times = 0                                               # restart the step counter for this run


    def act(self):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        if np.random.random() < self.epsilon:
            return np.random.choice(self.n_arms)
        # break ties among the greedy arms at random
        max_val = np.max(self.estimator)
        return np.random.choice([i for i, k in enumerate(self.estimator) if k == max_val])

    def step(self, action):
        self.times += 1
        # reward is drawn from a unit-variance Gaussian centered on the true action value
        reward = self.q_star[action] + np.random.randn()

        self.Act_num[action] += 1
        self.q_star += np.random.randn(self.n_arms) * 0.01  # non-stationary: true values drift

        if self.average_sample:
            # incremental sample-average update: Q += (1/N) * (R - Q)
            self.estimator[action] += (1. / self.Act_num[action]) * (reward - self.estimator[action])
        else:
            # constant step size: exponential recency-weighted average
            self.estimator[action] += self.step_size * (reward - self.estimator[action])

        return reward




The GreedyEpsilon bandit class above implements action selection and value updates for epsilon-greedy.
__init__ : initializes the variables so the other methods of the class can use them.
reset : when one iteration ends, resets the true-reward array, the reward-estimate array, the per-arm selection counts, and the optimal action for that run.
act : decides which action to take, based on the current reward-estimate array and epsilon.
step : returns the reward for the action chosen by act, and updates the reward-estimate array that drives act, using either a sample average or an exponential recency-weighted average. A short usage sketch follows below.
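For concreteness, here is a minimal sketch of driving a single run by hand. The hyperparameter values (10 arms, epsilon = 0.1, 1000 steps) are illustrative assumptions, not values taken from the code above.

# Minimal usage sketch; the hyperparameters below are assumptions for illustration.
bandit = GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=True,
                       initial_value=0.0, gt_initial=0.0)
bandit.reset()

total_reward = 0.0
for t in range(1000):
    action = bandit.act()                 # epsilon-greedy choice
    total_reward += bandit.step(action)   # observe the reward, update the estimate

print("average reward:", total_reward / 1000)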


def simulate(bandits, iteration, plays):
    # optimal_act[i, iter, t] = 1 if bandit i chose its optimal arm at step t of run iter
    optimal_act = np.zeros([len(bandits), iteration, plays])
    rewards_list = np.zeros([len(bandits), iteration, plays])
    for i, bandit in enumerate(bandits):

        for iter in tqdm(range(iteration)):
            bandit.reset()
            for play in range(plays):
                action = bandit.act()
                reward = bandit.step(action)

                rewards_list[i, iter, play] = reward

                if action == bandit.optimal_action:
                    optimal_act[i, iter, play] = 1

    # average over the iteration axis -> shape (len(bandits), plays)
    rewards = np.mean(rewards_list, axis=1)
    optimal_act = np.mean(optimal_act, axis=1)
    return rewards, optimal_act




simulate : averages the per-step optimal_act and rewards_list over the iteration axis and returns them with shape (len(bandits), plays).
It takes several bandits and, for each one, produces the mean reward obtained at each time step; a short usage sketch follows.
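A minimal sketch of calling simulate with two epsilon-greedy agents; the hyperparameter values are assumptions for illustration.

# Usage sketch for simulate; hyperparameter values are assumptions for illustration.
bandits = [
    GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=True,
                  initial_value=0.0, gt_initial=0.0),
    GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=False,
                  initial_value=0.0, gt_initial=0.0, step_size=0.1),
]
rewards, optimal_act = simulate(bandits, iteration=200, plays=1000)
print(rewards.shape, optimal_act.shape)   # (2, 1000) and (2, 1000)
# rewards[i, t]     : mean reward of bandit i at time step t, averaged over iterations
# optimal_act[i, t] : fraction of runs in which bandit i picked the optimal arm at step t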


Below are the classes for UCB and GradientBandit; a short comparison sketch follows them.


class UCB(object):
    def __init__(self, n_arms, average_sample, initial_value, gt_initial, step_size=0, c=2.0):
        self.n_arms = n_arms
        self.average_sample = average_sample
        self.initial_value = initial_value
        self.gt_initial = gt_initial
        self.step_size = step_size
        self.times = 0
        self.c = c   # exploration coefficient of the UCB bonus

    def reset(self):
        self.q_star = np.zeros(self.n_arms) + self.gt_initial
        self.estimator = np.zeros(self.n_arms) + self.initial_value
        self.Act_num = np.zeros(self.n_arms)
        self.optimal_action = np.argmax(self.q_star)
        self.times = 0   # restart the step counter for this run


    def act(self):
        # UCB score: Q(a) + c * sqrt(ln(t) / N(a));
        # the 1e-10 avoids division by zero for arms that have never been tried
        temp = [act_est + self.c * np.sqrt(np.log(self.times + 1) / (act_num + 1e-10))
                for act_est, act_num in zip(self.estimator, self.Act_num)]
        max_val = np.max(temp)
        actions = [i for i, v in enumerate(temp) if v == max_val]
        return np.random.choice(actions)   # break ties at random

    def step(self, action):
        self.times += 1
        reward = self.q_star[action] + np.random.randn()

        self.Act_num[action] += 1
        self.q_star += np.random.randn(self.n_arms) * 0.01  # non-stationary: true values drift

        if self.average_sample:
            # incremental sample-average update
            self.estimator[action] += (1. / self.Act_num[action]) * (reward - self.estimator[action])
        else:
            # constant step size: exponential recency-weighted average
            self.estimator[action] += self.step_size * (reward - self.estimator[action])

        return reward

class GradientBandit(object):
    def __init__(self, n_arms, alpha, gt_initial, is_baseline=True):
        self.n_arms = n_arms
        self.alpha = alpha                # step size for the preference update
        self.gt_initial = gt_initial
        self.is_baseline = is_baseline    # whether to subtract the average-reward baseline
        self.times = 0

    def reset(self):
        self.preference = np.zeros(self.n_arms)                 # action preferences H(a)
        self.q_star = np.zeros(self.n_arms) + self.gt_initial   # true action values
        self.average_reward = 0
        self.optimal_action = np.argmax(self.q_star)
        self.times = 0                                          # restart the step counter for this run

    def act(self):
        # softmax over preferences gives the action-selection probabilities
        softmax_unnormalized = np.exp(self.preference)
        self.softmax = softmax_unnormalized / np.sum(softmax_unnormalized)
        return np.random.choice(range(self.n_arms), p=self.softmax)

    def step(self, action):
        reward = self.q_star[action] + np.random.randn()
        self.times += 1

        self.q_star += np.random.randn(self.n_arms) * 0.01   # non-stationary: true values drift
        # incremental average of all rewards so far, used as the baseline
        self.average_reward += 1.0 / self.times * (reward - self.average_reward)

        if self.is_baseline:
            mean_reward = self.average_reward
        else:
            mean_reward = 0

        one_hot = np.zeros(self.n_arms)
        one_hot[action] = 1

        # gradient-bandit update: H(a) += alpha * (R - baseline) * (1{a chosen} - pi(a))
        self.preference += self.alpha * (reward - mean_reward) * (one_hot - self.softmax)
        return reward
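

All three agents expose the same reset / act / step interface, so they can be passed to simulate together. Below is a minimal comparison sketch; the hyperparameter values and the matplotlib plotting are assumptions for illustration, not part of the original code.

# Comparison sketch; hyperparameter values are assumptions for illustration.
import matplotlib.pyplot as plt

bandits = [
    GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=True,
                  initial_value=0.0, gt_initial=0.0),
    UCB(n_arms=10, average_sample=True, initial_value=0.0, gt_initial=0.0, c=2.0),
    GradientBandit(n_arms=10, alpha=0.1, gt_initial=0.0, is_baseline=True),
]
rewards, optimal_act = simulate(bandits, iteration=200, plays=1000)

for label, curve in zip(["eps-greedy 0.1", "UCB c=2", "gradient alpha=0.1"], optimal_act):
    plt.plot(curve, label=label)
plt.xlabel("steps")
plt.ylabel("fraction of optimal actions")
plt.legend()
plt.show()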