agent.py
    """
    This file contains an agent class, where agents are using Q-learning to evolve
    strategy to play in a collective-risk game. For multi-arm bandit, epsilon-
    greedy is implemented. Agents don't recognise their opponent, nor have memory
    of previous rounds of a game. Their actions are based solely on their own
    Q-Table, where states are consisted of round numbers and available actions.
    
    Author: Liyao Zhu  liyao@student.unimelb.edu.au
    Date:   Apr. 2019
    """
    
    
    import numpy as np
    
    
    class Agent:
    
        def __init__(self, rounds, initialWealth, availableActions, alpha=0.1,
                     gamma=0.9, epsilon=0.1, multiArm='greedy'):
    
            self.R = rounds
            self.initialWealth = initialWealth
            self.wealth = initialWealth
            self.availableActions = availableActions
            self.iteration = 0
    
            """initialise Q table to small random numbers"""
            self.qTable = np.random.rand(self.R, len(self.availableActions)) * 0.01
    
            "Q-Learning Parameters"
            self.learnRate = alpha
            self.discount  = gamma
            self.epsilon   = epsilon
            self.multiArm  = multiArm
    
        def updateReward(self, round, action, loss):
            """
            Update wealth and the Q-table entry for (round, action) after one
            round of play.

            :param round: current round number, 0-indexed (0 <= round < self.R)
            :param action: fraction of wealth contributed this round; must be
                           one of self.availableActions
            :param loss: fraction of the remaining wealth lost this round
            """
            newWealth = self.wealth * (1-action) * (1-loss)
            reward = newWealth - self.wealth
            self.wealth = newWealth
    
            index = self.availableActions.index(action)
            if round == self.R - 1:
                # Terminal round: no future value
                maxNextQ = 0
            elif round < self.R - 1:
                # Not at the terminal round: bootstrap from the best value of
                # the next round
                maxNextQ = max(self.qTable[round + 1])
            else:
                raise ValueError("Illegal round number: " + str(round))
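            # Q-learning update:
            # Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))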
            self.qTable[round][index] += self.learnRate * (
                    reward + self.discount * maxNextQ - self.qTable[round][index])
            # print("QTABLE:", self.qTable)
    
            if round == self.R - 1:
                self.iteration += 1
                # print("Player iteration +1 =", self.iteration)
    
    
        def chooseAction(self, roundNumber):
    
            """Method: Q-learning"""
    
            randomAct = False
            if self.multiArm == 'decrease':
                # Epsilon-decrease: the exploration probability decays as
                # epsilon ** iteration, e.g. with epsilon = 0.1 it is 1.0 in
                # game 0, 0.1 in game 1, 0.01 in game 2, and so on.
                if np.random.uniform(0, 1) <= self.epsilon ** self.iteration:
                    randomAct = True

            elif self.multiArm == 'greedy':
                # Epsilon-greedy: a fixed exploration probability
                if np.random.uniform(0, 1) <= self.epsilon:
                    randomAct = True
    
            if randomAct:
                return np.random.choice(self.availableActions)
            else:
                index = np.argmax(self.qTable[roundNumber])
                return self.availableActions[index]
    
        def getStrategy(self):
            """
            Get the current strategy without randomness, for analytical use
            :return: a dictionary of actions in all rounds
            """
            strategy = {}
            for r in range(self.R):
                index = np.argmax(self.qTable[r])
                strategy[r] = self.availableActions[index]
            return strategy
    
    
        def getWealth(self):
            return self.wealth
    
        def resetWealth(self):
            self.wealth = self.initialWealth
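

    if __name__ == "__main__":
        # Minimal usage sketch, not part of the original file: the episode
        # loop, the parameter values, and the zero-loss placeholder below are
        # illustrative assumptions. A real collective-risk game would compute
        # `loss` from the group's total contribution against a threshold.
        actions = [0, 0.2, 0.4, 0.6, 0.8]
        agent = Agent(rounds=10, initialWealth=100, availableActions=actions)

        for episode in range(1000):
            agent.resetWealth()
            for r in range(10):
                a = agent.chooseAction(r)
                agent.updateReward(r, a, loss=0)

        # Inspect the learned greedy strategy and the wealth after the last game
        print("Greedy strategy per round:", agent.getStrategy())
        print("Final wealth:", agent.getWealth())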