In [87]:
import numpy as np
import matplotlib.pyplot as plt
In [114]:
para_set = [1 / 128, 1 / 64, 1 / 32, 1 / 16, 1 / 8, 1 / 4, 1 / 2, 1, 2, 4]
para_char = ['1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4']
In [89]:
class Arm:
    def __init__(self, loc=0, scale=1):
        self.loc = loc
        self.scale = scale
    def play(self):
        return np.random.normal(self.loc, self.scale)
    def change(self):
        self.loc += np.random.normal(0, 0.01)
 
class Env:
    def __init__(self, arm_num=10, loc=0, scale=1):
        arm_list = []
        for i in range(arm_num):
            arm_list.append(Arm(loc, scale))
        self.arm_list = arm_list
    def act(self, action):
        self.arm_list[action].change()
        return self.arm_list[action].play()
In [90]:
class UCBAct:
    def __init__(self, c, value=np.int(0), time=np.float(0)):
        self.time = time
        self.value = value
        self.c = c
    def __gt__(self, o):
        if self.time == 0:
            return True
        elif o.time == 0:
            return False
        else:
            return self.value + (self.c * np.sqrt(np.log(t) / self.time)) > o.value + (o.c * np.sqrt(np.log(t) / o.time))

class UCB:
    def __init__(self, c, act_num=10, alpha=0.1):
        self.c = c
        self.act_num = act_num
        self.alpha = alpha
        self.act_map = []
        for _ in range(act_num):
            self.act_map.append(UCBAct(self.c))
    def act(self):
        action = np.argmax(self.act_map)
        self.act_map[action].time += 1
        return action
    def update(self, action, r):
        self.act_map[action].value = self.act_map[action].value + ((r -self.act_map[action].value) * self.alpha)
In [91]:
ubc_res = []
for para in para_set:
    env = Env()
    ucb = UCB(para)
    res = 0
    for t in range(200000):
        action = ucb.act()
        r = env.act(action)
        ucb.update(action, r)
        if t > 99999:
           res += r
    ubc_res.append(res / 100000)
In [92]:
ubc_res
Out[92]:
[1.6089612432492224,
 2.225713053680838,
 4.1962251489886055,
 0.4987496557598075,
 7.985545079424276,
 -0.26916405792383846,
 3.414704463516615,
 1.5540001446685383,
 3.409614714211533,
 2.39245232654006]
In [93]:
class Eps:
    def __init__(self, epsilon=0.1, act_num=10, alpha=0.1):
        self.epsilon = epsilon
        self.alpha = alpha
        self.act_map = np.rec.fromarrays([np.zeros(act_num), np.zeros(act_num).astype('int')])
        self.act_num = act_num
    def act(self):
        ep_con = np.random.choice([0, 1], p = [1-self.epsilon, self.epsilon])
        if ep_con:
            action = np.random.choice(range(self.act_num))
        else:
            action = np.argmax([value[0] for value in self.act_map])
        self.act_map[action][1] += 1
        return action
    def update(self, action, r):
        self.act_map[action][0] = self.act_map[action][0] + ((r - self.act_map[action][0]) * self.alpha)
In [94]:
eps_res = []
for para in para_set:
    if para > 1:
        break
    env = Env()
    eps = Eps(para)
    res = 0
    for t in range(200000):
        action = eps.act()
        r = env.act(action)
        eps.update(action, r)
        if t > 99999:
           res += r
    eps_res.append(res / 100000)
In [95]:
eps_res
Out[95]:
[1.6500003705972925,
 0.9370704212094757,
 7.68823523801286,
 7.384458206785488,
 2.932129495160003,
 2.614733206131557,
 3.819731184006643,
 -0.20554565908214728]
In [96]:
class Opt:
    def __init__(self, epsilon=0.1, act_num=10, alpha=0.1):
        self.alpha = alpha
        self.epsilon = epsilon
        self.act_num = act_num
        self.act_map = np.rec.fromarrays([np.array([1] * self.act_num), np.zeros(act_num).astype('int')])
    def act(self):
        ep_con = np.random.choice([0, 1], p = [1-self.epsilon, self.epsilon])
        if ep_con:
            action = np.random.choice(range(self.act_num))
        else:
            action = np.argmax([value[0] for value in self.act_map])
        self.act_map[action][1] += 1
        return action
    def update(self, action, r):
        self.act_map[action][0] = self.act_map[action][0] + ((r - self.act_map[action][0]) * self.alpha)
In [97]:
opt_res = []
for para in para_set:
    if para > 1:
        break
    env = Env()
    opt = Opt(para)
    res = 0
    for t in range(200000):
        action = opt.act()
        r = env.act(action)
        opt.update(action, r)
        if t > 99999:
           res += r
    opt_res.append(res / 100000)
In [98]:
opt_res
Out[98]:
[-0.21441917159324742,
 5.175622761012167,
 -1.1627402882653755,
 -2.358736042957691,
 0.5300468257517075,
 2.3799079145427333,
 3.2224792104312865,
 -0.5870005118677026]
In [99]:
class Gra:
    def __init__(self, alpha=0.1, act_num=10):
        self.H_map = np.zeros(act_num)
        self.act_num = act_num
        self.alpha = alpha
        self.d = 0
        self.r_avg = 0
    def act(self):
        self.d = np.sum(np.exp(self.H_map))
        return np.random.choice([i for i in range(self.act_num)], p=np.exp(self.H_map) / self.d)
    def update(self, action, r):
        self.r_avg += (r - self.r_avg) / (t + 1)
        temp_mu = self.alpha * (r - self.r_avg)
        self.H_map = self.H_map - (temp_mu * (np.exp(self.H_map) / self.d))
        self.H_map[action] += temp_mu
In [103]:
gra_res = []
for para in para_set:
    env = Env()
    gra = Gra(para)
    res = 0
    for t in range(200000):
        action = gra.act()
        r = env.act(action)
        gra.update(action, r)
        if t > 99999:
           res += r
    gra_res.append(res / 100000)
In [104]:
gra_res
Out[104]:
[3.2549606270301785,
 2.9964135485365126,
 1.3510483781657525,
 1.8509598949480481,
 1.8587992347402094,
 0.49540040021011034,
 1.6187459223627656,
 2.8282339693907956,
 1.5018602546589312,
 10.757532421830215]
In [123]:
plt.plot(range(len(para_set)), ubc_res, Label='UBC')
plt.plot(range(len(para_set)), eps_res + [0]*2, Label='Eps')
plt.plot(range(len(para_set)), opt_res + [0]*2, Label='Opt')
plt.plot(range(len(para_set)), gra_res, Label='Gra')
plt.legend(loc=0,ncol=2)
_ = plt.xticks(range(len(para_set)),para_char)