Source code for Policies.Exp3S

# -*- coding: utf-8 -*-
r""" The historical Exp3.S algorithm for non-stationary bandits.

- Reference: [["The nonstochastic multiarmed bandit problem", P. Auer, N. Cesa-Bianchi, Y. Freund, R.E. Schapire, SIAM journal on computing, 2002]](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.8735&rep=rep1&type=pdf)
- It is a simple extension of the :class:`Exp3` policy:

    >>> policy = Exp3S(nbArms, horizon=horizon)
    >>> # use policy as usual, with policy.startGame(), r = policy.choice(), policy.getReward(arm, r)

- It requires only a :math:`\mathcal{O}(K)` memory to store the current weights of the :math:`K` arms (no additional memory compared to :class:`Exp3`).
"""
from __future__ import division, print_function  # Python 2 compatibility

__author__ = "Lilian Besson"
__version__ = "0.9"

import numpy as np
import numpy.random as rn

try:
    from .Exp3 import Exp3
except ImportError:
    from Exp3 import Exp3


CONSTANT_e = np.exp(1)


# --- Exp3S

class Exp3S(Exp3):
    r""" The historical Exp3.S algorithm for non-stationary bandits.

    - Reference: [["The nonstochastic multiarmed bandit problem", P. Auer, N. Cesa-Bianchi, Y. Freund, R.E. Schapire, SIAM journal on computing, 2002]](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.21.8735&rep=rep1&type=pdf)
    """
    def __init__(self, nbArms, gamma=None, alpha=None, gamma0=1.0, alpha0=1.0, horizon=None, max_nb_random_events=None, *args, **kwargs):
        super(Exp3S, self).__init__(nbArms, *args, **kwargs)
        # Set default values of gamma and alpha
        if gamma is None:  # Use a default value for the gamma parameter
            if horizon is None:
                gamma = np.sqrt(np.log(nbArms) / nbArms)
                print("For Exp3S: by using the formula with unknown T and unknown Upsilon_T, gamma = {}...".format(gamma))  # DEBUG
            elif max_nb_random_events is None:
                gamma = min(1.0, np.sqrt(nbArms * np.log(nbArms * horizon) / horizon))
                print("For Exp3S: by using the formula with known T but unknown Upsilon_T, gamma = {}...".format(gamma))  # DEBUG
            else:  # Corollary 8.3 of [Auer et al., 2002]
                gamma = min(1.0, np.sqrt(nbArms * (max_nb_random_events * np.log(nbArms * horizon) + CONSTANT_e) / ((CONSTANT_e - 1.0) * horizon)))
                print("For Exp3S: by using the formula with known T and Upsilon_T, gamma = {}...".format(gamma))  # DEBUG
        if gamma0 is not None and gamma0 >= 0:
            gamma *= gamma0
        if gamma > 1:  # XXX clip gamma so it stays in [0, 1]
            gamma = 1.0
        assert 0 <= gamma <= 1, "Error: the parameter 'gamma' for the Exp3S class has to be in [0, 1]!"  # DEBUG
        self._gamma = gamma
        if alpha is None:  # Use a default value for the alpha parameter
            if horizon is None:
                alpha = 0.001  # XXX arbitrary, bad choice!
            else:
                alpha = 1.0 / horizon
                print("For Exp3S: by using the formula with known T, alpha = {}...".format(alpha))  # DEBUG
        if alpha0 is not None and alpha0 >= 0:
            alpha *= alpha0
        assert alpha >= 0, "Error: the parameter 'alpha' for the Exp3S class has to be >= 0!"  # DEBUG
        self._alpha = alpha
        # Just store horizon and max_nb_random_events, for pretty printing
        self.horizon = horizon if horizon is not None else "?"
        self.max_nb_random_events = max_nb_random_events if max_nb_random_events is not None else "?"
        # Internal memory
        self.weights = np.full(nbArms, 1. / nbArms)  #: Weights on the arms
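
    # Worked illustration of the Corollary 8.3 tuning above (hypothetical
    # numbers, not from the paper): with K = 3 arms, horizon T = 10000 and
    # Upsilon_T = 5 known breakpoints,
    #   gamma = min(1, sqrt(3 * (5 * ln(3 * 10000) + e) / ((e - 1) * 10000)))
    #        ~= 0.0973,
    # and alpha = 1 / T = 1e-4.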
    def __str__(self):
        # return r"Exp3.S($T={}$, $\Upsilon_T={}$, $\alpha={:.6g}$, $\gamma={:.6g}$)".format(self.horizon, self.max_nb_random_events, self._alpha, self._gamma)
        return "Exp3.S"
    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def gamma(self):
        r"""Constant :math:`\gamma_t = \gamma`."""
        return self._gamma

    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def alpha(self):
        r"""Constant :math:`\alpha_t = \alpha`."""
        return self._alpha
    def startGame(self):
        """Start with uniform weights."""
        super(Exp3S, self).startGame()
        self.weights.fill(1. / self.nbArms)
    @property
    def trusts(self):
        r"""Update the trusts probabilities according to the Exp3 formula, and the parameter :math:`\gamma_t`.

        .. math::

           \mathrm{trusts}'_k(t+1) &= (1 - \gamma_t) w_k(t) + \gamma_t \frac{1}{K}, \\
           \mathrm{trusts}(t+1) &= \mathrm{trusts}'(t+1) / \sum_{k=1}^{K} \mathrm{trusts}'_k(t+1),

        where :math:`w_k(t)` is the current (normalized) weight of arm :math:`k`.
        """
        # Mixture between the (normalized) weights and the uniform distribution
        trusts = ((1 - self.gamma) * self.weights / np.sum(self.weights)) + (self.gamma / self.nbArms)
        # XXX Handle weird cases: this slows everything down, but it is safer!
        if not np.all(np.isfinite(trusts)):
            trusts[~np.isfinite(trusts)] = 0  # set bad values to 0
        # Bad case, where the sum is so small that it's only rounding errors,
        # or where all values were bad and forced to 0: restart with trusts = [1/K, ..., 1/K]
        if np.isclose(np.sum(trusts), 0):
            trusts[:] = 1.0 / self.nbArms
        # Normalize it and return it
        return trusts / np.sum(trusts)
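
    # Worked illustration of the mixture above (hypothetical numbers): with
    # K = 4, normalized weights w = [0.4, 0.3, 0.2, 0.1] and gamma = 0.1,
    #   trusts = 0.9 * w + 0.1 / 4 = [0.385, 0.295, 0.205, 0.115],
    # which already sums to 1, so the final normalization is a no-op here.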
    def getReward(self, arm, reward):
        r"""Give a reward: accumulate rewards on that arm :math:`k`, then update the weight :math:`w_k(t)`.

        - With an unbiased estimator, divide by the trust on that arm :math:`k`, i.e., the probability of observing arm :math:`k`: :math:`\tilde{r}_k(t) = \frac{r_k(t)}{\mathrm{trusts}_k(t)}`.
        - But with a biased estimator, :math:`\tilde{r}_k(t) = r_k(t)`.

        The weight of the pulled arm :math:`k` gets a multiplicative boost, and every arm receives a uniform share of the total weight (controlled by :math:`\alpha_t`):

        .. math::

           w_k(t+1) &= w_k(t) \exp\left( \frac{\gamma_t \tilde{r}_k(t)}{K} \right) + \frac{e \alpha_t}{K} \sum_{j=1}^{K} w_j(t), \\
           w_j(t+1) &= w_j(t) + \frac{e \alpha_t}{K} \sum_{i=1}^{K} w_i(t), \quad \forall j \neq k.

        The weights are deliberately *not* renormalized here; the normalization happens in :attr:`trusts`.
        """
        # XXX DON'T call getReward of Exp3 (mother class), it already updates the weights!
        # super(Exp3S, self).getReward(arm, reward)  # XXX Call to BasePolicy
        # XXX Manually copied from BasePolicy.getReward
        self.t += 1
        self.pulls[arm] += 1
        reward = (reward - self.lower) / self.amplitude
        self.rewards[arm] += reward
        # Update weight of THIS arm, with this biased or unbiased reward
        if self.unbiased:
            reward = reward / self.trusts[arm]
        # Multiplicative weights + uniform share of previous weights (alpha is used for this)
        old_weights = np.copy(self.weights)  # real copy: self.weights[:] would only be a view
        sum_of_weights = np.sum(old_weights)
        for otherArm in range(self.nbArms):
            if otherArm != arm:
                self.weights[otherArm] = old_weights[otherArm] + CONSTANT_e * (self.alpha / self.nbArms) * sum_of_weights
        self.weights[arm] = old_weights[arm] * np.exp(reward * (self.gamma / self.nbArms)) + CONSTANT_e * (self.alpha / self.nbArms) * sum_of_weights
        # WARNING DON'T renormalize the weights at each step!
        # self.weights /= np.sum(self.weights)
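
    # Worked illustration of the update above (hypothetical numbers): with
    # K = 2, weights w = [0.5, 0.5], gamma = 0.1, alpha = 0.01, and a biased
    # normalized reward r = 1 on arm 0 (so the sum of weights is 1):
    #   w_0 <- 0.5 * exp(0.1 * 1 / 2) + e * 0.01 / 2 ~= 0.5256 + 0.0136 = 0.5392,
    #   w_1 <- 0.5 + e * 0.01 / 2 ~= 0.5136.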
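

# --- Debugging

if __name__ == "__main__":
    # Minimal smoke test, a sketch that is not part of the original module:
    # it assumes the sibling Exp3 module is importable, and uses arbitrary
    # illustration values for the horizon and the Bernoulli arm means.
    nbArms = 3
    horizon = 1000
    policy = Exp3S(nbArms, horizon=horizon, max_nb_random_events=2)
    policy.startGame()
    means = [0.1, 0.5, 0.9]  # hypothetical Bernoulli means
    for t in range(horizon):
        arm = policy.choice()  # sample an arm from the trusts distribution
        reward = float(rn.random() < means[arm])  # simulated Bernoulli reward
        policy.getReward(arm, reward)
    print("Final weights =", policy.weights)
    print("Final trusts  =", policy.trusts, "(sum = {:.3f})".format(np.sum(policy.trusts)))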