Source code for Policies.AdBandits

# -*- coding: utf-8 -*-
""" The AdBandits bandit algorithm, mixing Thompson Sampling and BayesUCB.

- Reference: [AdBandit: A New Algorithm For Multi-Armed Bandits, F.S.Truzzi, V.F.da Silva, A.H.R.Costa, F.G.Cozman](http://sites.poli.usp.br/p/fabio.cozman/Publications/Article/truzzi-silva-costa-cozman-eniac2013.pdf)
- Code inspired from: https://github.com/flaviotruzzi/AdBandits/

.. warning:: This policy is not well known, but it usually works very well on stochastic bandits! It is not an anytime algorithm, though.
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Flavio Truzzi and Lilian Besson"
__version__ = "0.9"

from random import random, choice
import numpy as np

try:
    from .Posterior import Beta
    from .BasePolicy import BasePolicy
    from .with_proba import with_proba
except ImportError:
    from Posterior import Beta
    from BasePolicy import BasePolicy
    from with_proba import with_proba


# --- Data

#: Default value for the parameter :math:`\alpha` for the :class:`AdBandits` class.
ALPHA = 1


# --- Class

class AdBandits(BasePolicy):
    """ The AdBandits bandit algorithm, mixing Thompson Sampling and BayesUCB.

    - Reference: [AdBandit: A New Algorithm For Multi-Armed Bandits, F.S.Truzzi, V.F.da Silva, A.H.R.Costa, F.G.Cozman](http://sites.poli.usp.br/p/fabio.cozman/Publications/Article/truzzi-silva-costa-cozman-eniac2013.pdf)
    - Code inspired from: https://github.com/flaviotruzzi/AdBandits/

    .. warning:: This policy is not well known, but it usually works very well on stochastic bandits! It is not an anytime algorithm, though.
    """
    def __init__(self, nbArms, horizon=1000, alpha=ALPHA, posterior=Beta, lower=0., amplitude=1.):
        """ New policy."""
        super(AdBandits, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        self.alpha = alpha  #: Parameter alpha
        self.horizon = int(horizon)  #: Parameter :math:`T` = known horizon of the experiment. Default value is 1000.
        self.posterior = [None] * self.nbArms  #: Posterior for each arm. List instead of dict, quicker access
        for arm in range(self.nbArms):
            self.posterior[arm] = posterior()
    def __str__(self):
        # OK, they all have knowledge of T, but it's good to display it, to remember it
        return r"AdBandits($T={}$, $\alpha={:.3g}$)".format(self.horizon, self.alpha)
    def startGame(self):
        """ Reset each posterior."""
        super(AdBandits, self).startGame()
        for arm in range(self.nbArms):
            self.posterior[arm].reset()
    def getReward(self, arm, reward):
        """ Store the reward, and update the posterior for that arm."""
        super(AdBandits, self).getReward(arm, reward)
        reward = (reward - self.lower) / self.amplitude
        self.posterior[arm].update(reward)
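    # Note (illustrative remark, not in the original source): the rescaling above maps a raw
    # reward from [lower, lower + amplitude] to [0, 1] before it is fed to the (by default Beta)
    # posterior. For instance, with lower=-1 and amplitude=2, a raw reward of 0.5 becomes
    # (0.5 - (-1)) / 2 = 0.75.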
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):
        r""" Time-varying parameter :math:`\varepsilon(t) = \min(1, t / (\alpha T))`."""
        # Crop it to [0, 1]
        return max(0, min(1, float(self.t / (self.horizon * self.alpha))))
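    # Illustration (not from the original source): with the defaults alpha=1 and horizon=1000,
    # epsilon(t) = t / 1000 grows linearly from 0 to 1 over the run, so early steps are almost
    # always Thompson Sampling draws and late steps are almost always UCB-Bayes steps.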
    def choice(self):
        r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm."""
        # Thompson Exploration
        if with_proba(1 - self.epsilon):  # with proba 1 - epsilon
            upperbounds = [self.posterior[i].sample() for i in range(self.nbArms)]
            maxIndex = max(upperbounds)
            bestArms = [arm for (arm, index) in enumerate(upperbounds) if index == maxIndex]
            arm = choice(bestArms)
        # UCB-Bayes
        else:
            expectations = (1.0 + self.rewards) / (2.0 + self.pulls)
            upperbounds = [self.posterior[arm].quantile(1. - 1. / self.t) for arm in range(self.nbArms)]
            regret = np.max(upperbounds) - expectations
            admissible = np.nonzero(regret == np.min(regret))[0]
            arm = choice(admissible)
        return arm
    def choiceWithRank(self, rank=1):
        r""" With probability :math:`1 - \varepsilon(t)`, use a Thompson Sampling step, otherwise use a UCB-Bayes step, to choose one arm of a certain rank."""
        if rank == 1:
            return self.choice()
        else:
            assert rank >= 1, "Error: for AdBandits = {}, in choiceWithRank(rank={}) rank has to be >= 1.".format(self, rank)
            # Thompson Exploration
            if with_proba(1 - self.epsilon):  # with proba 1 - epsilon
                indexes = [self.posterior[i].sample() for i in range(self.nbArms)]
            # UCB-Bayes
            else:
                expectations = (1.0 + self.rewards) / (2.0 + self.pulls)
                upperbounds = [self.posterior[arm].quantile(1. - 1. / self.t) for arm in range(self.nbArms)]
                indexes = expectations - np.max(upperbounds)
            # We computed the indexes, OK let's use them
            sortedRewards = np.sort(indexes)
            # XXX What happens here if two arms have the same index, being the max?
            chosenIndex = sortedRewards[-rank]
            # Uniform choice among the rank-th best arms
            return choice(np.nonzero(indexes == chosenIndex)[0])
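

# --- Usage sketch (illustrative addition, not part of the original module): a minimal
# simulation loop on made-up Bernoulli arms, using only the policy interface defined above
# (startGame / choice / getReward). The arm means, horizon and seed are arbitrary example values.

if __name__ == '__main__':
    means = [0.1, 0.5, 0.9]           # hypothetical Bernoulli arm means
    horizon = 1000
    policy = AdBandits(nbArms=len(means), horizon=horizon, alpha=ALPHA)
    policy.startGame()
    rng = np.random.RandomState(42)   # fixed seed, for reproducibility
    total_reward = 0.0
    for _ in range(horizon):
        arm = policy.choice()                      # Thompson Sampling or UCB-Bayes step
        reward = float(rng.rand() < means[arm])    # simulated Bernoulli reward in {0, 1}
        policy.getReward(arm, reward)              # update that arm's posterior
        total_reward += reward
    print("AdBandits collected a total reward of {:.0f} over T={} steps.".format(total_reward, horizon))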