import numpy as np
from tqdm import tqdm

class GreedyEpsilon(object):
    def __init__(self, n_arms, epsilon, average_sample, initial_value, gt_initial, step_size=0):
        self.n_arms = n_arms
        self.epsilon = epsilon
        self.average_sample = average_sample
        self.initial_value = initial_value
        self.gt_initial = gt_initial
        self.step_size = step_size
        self.times = 0

    def reset(self):
        self.q_star = np.zeros(self.n_arms) + self.gt_initial        # true action values
        self.estimator = np.zeros(self.n_arms) + self.initial_value  # reward estimates
        self.Act_num = np.zeros(self.n_arms)                         # pull count per arm
        self.optimal_action = np.argmax(self.q_star)

    def act(self):  # choose an action
        ran = np.random.random()
        if ran < self.epsilon:
            return np.random.choice(self.n_arms)   # explore
        else:
            max_val = np.max(self.estimator)       # exploit, breaking ties at random
            return np.random.choice([i for i, k in enumerate(self.estimator) if k == max_val])

    def step(self, action):
        self.times += 1
        reward = self.q_star[action] + np.random.randn()
        self.Act_num[action] += 1
        self.q_star += np.random.randn(self.n_arms) * 0.01  # non-stationary random walk
        if self.average_sample:
            self.estimator[action] += (1. / self.Act_num[action]) * (reward - self.estimator[action])
        else:
            self.estimator[action] += self.step_size * (reward - self.estimator[action])
        return reward
The GreedyEpsilon bandit above implements the action selection and value updates for epsilon-greedy.
__init__ : initializes the variables so that the other methods of the class can use them.
reset : at the end of each iteration, re-initializes the true reward array, the reward-estimates array, the per-action pull counts, and the optimal action for that setup.
act : decides how to act based on the current reward-estimates array and epsilon.
step : returns the reward for the action produced by the act function, and updates the reward-estimates array that drives act, using either the sample average or the exponential recency-weighted average (constant step size); a small numeric sketch of both update rules follows below.
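For reference, here is a minimal numeric sketch of the two update rules used in step (the reward values are made up purely for illustration):

import numpy as np

rewards = [1.0, 0.0, 2.0, 1.5]

# Sample average: Q_{n+1} = Q_n + (1/n) * (R_n - Q_n), which equals the plain mean
q = 0.0
for n, r in enumerate(rewards, start=1):
    q += (1.0 / n) * (r - q)
print(q, np.mean(rewards))   # both print 1.125

# Constant step size (exponential recency-weighted average), e.g. step_size = 0.1
q = 0.0
for r in rewards:
    q += 0.1 * (r - q)
print(q)                     # recent rewards carry more weight than old ones

With the constant step size, the weight on an old reward decays geometrically, which is why it can track the non-stationary q_star better than the sample average.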
def simulate(bandits, iteration, plays):
    optimal_act = np.zeros([len(bandits), iteration, plays])
    rewards_list = np.zeros([len(bandits), iteration, plays])
    for i, bandit in enumerate(bandits):
        for it in tqdm(range(iteration)):
            bandit.reset()
            for play in range(plays):
                action = bandit.act()
                reward = bandit.step(action)
                rewards_list[i, it, play] = reward
                if action == bandit.optimal_action:
                    optimal_act[i, it, play] = 1
    rewards = np.mean(rewards_list, axis=1)        # average over iterations
    optimal_act = np.mean(optimal_act, axis=1)
    return rewards, optimal_act
simulate : averages the optimal_act and rewards_list recorded at each time step over the iterations, and returns them with shape (len(bandits), plays).
It takes several bandits and gives, per bandit, the average reward obtained at each time step; a usage sketch follows.
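For example, a run could look like the following (the parameter values here are illustrative, not taken from the original experiment):

import matplotlib.pyplot as plt

# illustrative settings: 10 arms, 2000 independent runs, 1000 plays per run
bandits = [
    GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=True, initial_value=0.0, gt_initial=0.0),
    GreedyEpsilon(n_arms=10, epsilon=0.1, average_sample=False, initial_value=0.0, gt_initial=0.0, step_size=0.1),
]
rewards, optimal_act = simulate(bandits, iteration=2000, plays=1000)
print(rewards.shape, optimal_act.shape)   # (2, 1000) each

for r, label in zip(rewards, ['sample average', 'step size 0.1']):
    plt.plot(r, label=label)
plt.xlabel('plays')
plt.ylabel('average reward')
plt.legend()
plt.show()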
Below are the classes for UCB and GradientBandit.
class UCB(object):
    def __init__(self, n_arms, average_sample, initial_value, gt_initial, step_size=0, c=2.0):
        self.n_arms = n_arms
        self.average_sample = average_sample
        self.initial_value = initial_value
        self.gt_initial = gt_initial
        self.step_size = step_size
        self.times = 0
        self.c = c

    def reset(self):
        self.q_star = np.zeros(self.n_arms) + self.gt_initial
        self.estimator = np.zeros(self.n_arms) + self.initial_value
        self.Act_num = np.zeros(self.n_arms)
        self.optimal_action = np.argmax(self.q_star)

    def act(self):  # choose the arm with the highest upper confidence bound
        temp = [act_est + self.c * (np.sqrt(np.log(self.times + 1) / (act_num + 1e-10)))
                for act_est, act_num in zip(self.estimator, self.Act_num)]
        max_val = np.max(temp)
        actions = [i for i, v in enumerate(temp) if v == max_val]
        return np.random.choice(actions)

    def step(self, action):
        self.times += 1
        reward = self.q_star[action] + np.random.randn()
        self.Act_num[action] += 1
        self.q_star += np.random.randn(self.n_arms) * 0.01  # non-stationary random walk
        if self.average_sample:
            self.estimator[action] += (1. / self.Act_num[action]) * (reward - self.estimator[action])
        else:
            self.estimator[action] += self.step_size * (reward - self.estimator[action])
        return reward
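The score that act maximizes is the usual upper confidence bound Q(a) + c*sqrt(ln t / N(a)); a minimal sketch with made-up numbers (the 1e-10 term only guards against division by zero for arms that were never pulled):

estimator = np.array([0.5, 0.2, 0.0])   # current reward estimates Q(a)
act_num = np.array([10., 3., 0.])       # pull counts N(a)
times, c = 13, 2.0

ucb = estimator + c * np.sqrt(np.log(times + 1) / (act_num + 1e-10))
print(ucb)   # the never-pulled arm gets a huge bonus, so it is tried first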
class GradientBandit(object):
    def __init__(self, n_arms, alpha, gt_initial, is_baseline=True):
        self.n_arms = n_arms
        self.alpha = alpha
        self.gt_initial = gt_initial
        self.is_baseline = is_baseline
        self.times = 0

    def reset(self):
        self.preference = np.zeros(self.n_arms)
        self.q_star = np.zeros(self.n_arms) + self.gt_initial  # start the true values at gt_initial
        self.average_reward = 0
        self.optimal_action = np.argmax(self.q_star)

    def act(self):
        softmax_unnormalized = np.exp(self.preference)
        self.softmax = softmax_unnormalized / sum(softmax_unnormalized)
        return np.random.choice(range(self.n_arms), p=self.softmax)

    def step(self, action):
        reward = self.q_star[action] + np.random.randn()
        self.times += 1
        self.q_star += np.random.randn(self.n_arms) * 0.01     # non-stationary random walk
        self.average_reward += 1.0 / self.times * (reward - self.average_reward)
        if self.is_baseline:
            mean_reward = self.average_reward
        else:
            mean_reward = 0
        one_hot = np.zeros(self.n_arms)
        one_hot[action] = 1
        self.preference += self.alpha * (reward - mean_reward) * (one_hot - self.softmax)
        return reward
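As a rough sketch of what act and step do in GradientBandit (the numbers are illustrative): the preferences are turned into action probabilities with a softmax, and after each play the chosen arm's preference moves up or down depending on whether the reward beat the baseline:

preference = np.array([0.2, 0.0, -0.1])
softmax = np.exp(preference) / np.exp(preference).sum()
print(softmax)               # action probabilities pi(a)

action, reward, baseline, alpha = 0, 1.3, 0.5, 0.1
one_hot = np.zeros(3)
one_hot[action] = 1
preference += alpha * (reward - baseline) * (one_hot - softmax)
print(preference)            # the chosen arm's preference rises when reward > baseline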