Source code for Policies.RAWUCB

# -*- coding: utf-8 -*-
"""
author: Julien Seznec

Rotting Adaptive Window Upper Confidence Bounds for rotting bandits.

Reference : [Seznec et al.,  2019b]
A single algorithm for both rested and restless rotting bandits (WIP)
Julien Seznec, Pierre Ménard, Alessandro Lazaric, Michal Valko
"""


from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Julien Seznec"
__version__ = "0.1"

import numpy as np
import time
np.seterr(divide='ignore')  # XXX dangerous in general, controlled here!

try:
    from .BasePolicy import BasePolicy
    from .IndexPolicy import IndexPolicy
    from .FEWA import FEWA, EFF_FEWA
    from .kullback import klucbBern
except ImportError:
    from BasePolicy import BasePolicy
    from IndexPolicy import IndexPolicy
    from FEWA import FEWA, EFF_FEWA
    from kullback import klucbBern



[docs]class EFF_RAWUCB(EFF_FEWA):
    """
    Efficient Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al.,  2020]
    Efficient trick described in [Seznec et al.,  2019a, https://arxiv.org/abs/1811.11043] (m=2)
    and [Seznec et al.,  2020] (m<=2)
    We use the confidence level :math:`\delta_t = \frac{1}{t^\alpha}`.
    """

[docs]    def choice(self):
        not_selected = np.where(self.pulls == 0)[0]
        if len(not_selected):
            return not_selected[0]
        self.ucb = self._compute_ucb()
        return np.nanmin(self.ucb, axis=1).argmax()

[docs]    def _compute_ucb(self):
        return (self.statistics[0, :, :] / self.windows + self.outlogconst * np.sqrt(np.log(self._inlog())))

[docs]    def _append_thresholds(self, w):
        # FEWA use two confidence bounds. Hence, the outlogconst is twice smaller for RAWUCB
        return np.sqrt(2 * self.alpha * self.subgaussian ** 2 / w)

[docs]    def __str__(self):
        return r"EFF_RAW-UCB($\alpha={:.3g}, \, m={:.3g}$)".format(self.alpha, self.grid)


[docs]class EFF_RAWklUCB(EFF_RAWUCB):
    """
    Use KL-confidence bound instead of close formula approximation.
    Experimental work : Much slower (!!) because we compute many UCB at each round per arm)
    """
[docs]    def __init__(self, nbArms, subgaussian=1, alpha=1, klucb=klucbBern, tol=1e-4, m=2):
        super(EFF_RAWklUCB, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=alpha, m=m)
        self.c = alpha
        self.klucb_vec = np.vectorize(klucb, excluded=['precision'])
        self.tolerance = tol

[docs]    def choice(self):
        not_selected = np.where(self.pulls == 0)[0]
        if len(not_selected):
            return not_selected[0]
        self.ucb = self.klucb_vec(self.statistics[0, :, :] / self.windows, self.c * np.log(self.t + 1) / self.windows,
                                  precision=self.tolerance)
        return np.argmax(np.nanmin(self.ucb, axis=1))

[docs]    def __str__(self):
        return r"EFF_RAW-klUCB($c={:.3g}, \, m={:.3g}$)".format(self.alpha, self.grid)


[docs]class RAWUCB(EFF_RAWUCB):
    """
    Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al.,  2020]
    We use the confidence level :math:`\delta_t = \frac{1}{t^\alpha}`.
    """
[docs]    def __init__(self, nbArms, subgaussian=1, alpha=1):
        super(RAWUCB, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=alpha, m=1 + 1e-15)

[docs]    def __str__(self):
        return r"RAW-UCB($\alpha={:.3g}$)".format(self.alpha)


[docs]class EFF_RAWUCB_pp(EFF_RAWUCB):
    """
    Efficient Rotting Adaptive Window Upper Confidence Bound ++ (RAW-UCB++) [Seznec et al.,  2020, Thesis]
    We use the confidence level :math:`\delta_{t,h} = \frac{Kh}{t(1+log(t/Kh)^\Beta)}`.
    """

[docs]    def __init__(self, nbArms, subgaussian=1, alpha=1, beta=0, m =2):
        super(EFF_RAWUCB_pp, self).__init__(nbArms=nbArms, subgaussian=subgaussian, alpha=alpha, m=m)
        self.beta = beta

[docs]    def __str__(self):
        if self.alpha == 1:
            return r"EFF-RAW-UCB++($\beta={:.3g}, \, m={:.3g}$)".format(self.beta, self.grid)
        elif self.beta == 0:
            return r"EFF-RAW-UCB++($\alpha={:.3g}, \, m={:.3g}$)".format(self.alpha, self.grid)
        else:
            return r"EFF-RAW-UCB++($\alpha={:.3g}, \, $\beta={:.3g}, \, m={:.3g}$)".format(self.alpha, self.beta, self.grid)



[docs]    def _compute_ucb(self):
        return (self.statistics[0, :, :] / self.windows + self.outlogconst * np.sqrt(np.log(self._inlog(self.windows))))

[docs]    def _inlog(self, w):
        moss_confidence = self.t/(w * self.nbArms)
        moss_confidence[moss_confidence < 1] = 1
        inlog = moss_confidence * (1 + np.log(moss_confidence)) ** (self.beta/self.alpha)
        return inlog

[docs]class RAWUCB_pp(EFF_RAWUCB_pp):
    """
    Rotting Adaptive Window Upper Confidence Bound (RAW-UCB) [Seznec et al.,  2019b, WIP]
    We use the confidence level :math:`\delta_t = \frac{Kh}{t^\alpha}`.
    """
[docs]    def __init__(self, nbArms, subgaussian=1, beta=2):
        super(EFF_RAWUCB_pp, self).__init__(nbArms=nbArms, subgaussian=subgaussian, beta=beta, m=1 + 1e-15)

[docs]    def __str__(self):
        return r"RAW-UCB++($\beta={:.3g}$)".format(self.beta)

if __name__ == "__main__":
    # Code for debugging purposes.
    start = time.time()
    HORIZON = 10**4
    sigma = 1
    reward = {0: 0, 1: 0.2, 2: 0.4, 3: 0.6, 4: 0.8}
    policy = EFF_RAWUCB_pp(5, subgaussian=sigma, alpha=1.4)
    for t in range(HORIZON):
        choice = policy.choice()
        policy.getReward(choice, reward[choice])
    print(time.time() - start)
    print(policy.windows[:10])
    print(policy.outlogconst[:10])
    print(policy.pulls)