diff --git a/AZUL Learn Opponent/MI_player.py b/AZUL Learn Opponent/MI_player.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec68f33ea986593b400bb6be8f5dc7f393810fdf
--- /dev/null
+++ b/AZUL Learn Opponent/MI_player.py
@@ -0,0 +1,635 @@
+import math
+import time
+# random and copy are used directly in this module; import them explicitly
+# rather than relying on model's star import.
+import copy
+import random
+
+# from graphTree import TreeGraph
+import numpy as np
+
+from model import *
+from naive_player import NaivePlayer
+# Note: MI_PlayerHis evaluates 'RewardHis' via eval(), so that class must
+# also be resolvable in this module's globals.
+from reward import RewardPro
+from learn_opponent import Net_model
+from testPlayers import Test_Player
+
+FIRST_SEARCH = 5
+FOE_SEARCH = 10
+SEARCH_TIME = 0.2
+GAMMA = 0.9
+MAX = 10000
+USE_LEARNING = False
+USE_NAIVE = False
+SIMU_LEARNING = False
+CORRECT = []
+MIN_TIME = 5
+
+
+def randomMax(dict):
+    maxValue = dict[max(dict, key=dict.get)]
+    maxGroup = [k for k, v in dict.items() if v == maxValue]
+    return random.choice(maxGroup)
+
+
+def get_max_difference(value_list, act_id):
+    id_value = value_list[act_id]
+    max_other = max([v for i, v in enumerate(value_list) if i != act_id])
+    return id_value - max_other
+
+
+class RewardBasedPlayer(Player):
+    def __init__(self, _id):
+        super().__init__(_id)
+        self.using_reward = 'RewardPro'
+
+    def SelectMove(self, moves, game_state):
+        player_order = self.get_player_order(game_state)
+        moves = self.filtering_moves(game_state.players[self.id], moves)
+        return random.choice(moves)
+
+    def filtering_moves(self, player_state, moves):
+        remove_list = []
+        for index, move in enumerate(moves):
+            tile_type = move[2].tile_type
+            pattern_line_dest = move[2].pattern_line_dest
+            if pattern_line_dest > 0 and player_state.lines_tile[pattern_line_dest] == tile_type and \
+                    player_state.lines_number[pattern_line_dest] == pattern_line_dest + 1:
+                remove_list.append(index)
+        moves = [moves[i] for i in range(len(moves)) if i not in remove_list]
+        return moves
+
+    def get_player_order(self, game_state):
+        player_order = []
+        for i in range(self.id + 1, len(game_state.players)):
+            player_order.append(i)
+        for i in range(0, self.id + 1):
+            player_order.append(i)
+        return player_order
+
+    def get_place_reward(self, game_state, move, act_id, player_order):
+        reward, score_list = eval(self.using_reward)(game_state, act_id, player_order).estimate(move)
+        return reward, score_list
+
+
+class MI_Player(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+        self.search_agent = Mcts_search(_id, False, self)
+
+    def SelectMove(self, moves, game_state):
+        player_order = self.get_player_order(game_state)
+        moves = self.filtering_moves(game_state.players[self.id], moves)
+        move = self.search_agent.search(moves, game_state, player_order)
+        return move
+
+
+class MI_PlayerHis(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+        self.search_agent = Mcts_search(_id, False, self)
+        self.using_reward = 'RewardHis'
+
+    def SelectMove(self, moves, game_state):
+        player_order = self.get_player_order(game_state)
+        moves = self.filtering_moves(game_state.players[self.id], moves)
+        move = self.search_agent.search(moves, game_state, player_order)
+        return move
+
+
+class MI_PlayerPro(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+        self.search_agent = Mcts_search(_id, False, self)
+        self.using_reward = 'RewardPro'
+
+    def SelectMove(self, moves, game_state):
+        player_order = self.get_player_order(game_state)
+        moves = self.filtering_moves(game_state.players[self.id], moves)
+        move = self.search_agent.search(moves, game_state, player_order)
+        return move
+
+
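+# Monte-Carlo tree search used by the MI_* players. Each call to search()
+# rebuilds the tree from the current game state and loops
+# select -> expand -> choose -> simulate -> backup until the time budget
+# max(len(moves) * SEARCH_TIME, MIN_TIME) runs out. Node values are vectors
+# of per-player scores estimated by reward-guided rollouts, and the move
+# finally played is the child with the best score margin over the opponents.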
+class Mcts_search:
+    def __init__(self, _id, log, agent):
+        self.id = _id
+        self.log = log
+        self.agent = agent
+
+    def search(self, moves, game_state, player_order):
+        self.tree = []
+        self.init_game_state = game_state
+        self.init_moves = moves
+        self.player_order = player_order
+
+        state = self.init_game_state
+        parent = None
+        f_move = None
+        act_id = self.id
+        moves_dict = self.get_pre_prob(state, self.init_moves, self.id, self.player_order)
+        i_r = Instant_reward()
+        root_node = Node(state, parent, f_move, moves_dict, act_id, i_r, self.tree)
+        self.root_node = root_node
+        start = time.time()
+        n = 0
+        # while n <= 4:
+        # while True:
+        while time.time() - start < max(len(moves) * SEARCH_TIME, MIN_TIME):
+            # a = input('input')
+            n += 1
+            self.one_search(root_node)
+        print('searched times', n)
+        print('nodes:', len(self.tree))
+
+        for m, (c, p) in root_node.moves.items():
+            print(m[1], m[2], p, (c.value, get_max_difference(c.value, self.id)) if c is not None else ())
+
+        dict = {}
+        for m, (c, p) in root_node.moves.items():
+            Q = get_max_difference(c.value, self.id) if c is not None else -1000
+            dict[m] = Q
+            # print(Q)
+
+        move = randomMax(dict)
+        track = self.get_predict_track(root_node, move)
+        print('track:')
+        for t in track:
+            print(t)
+        return move
+
+    def get_predict_track(self, root_node, move):
+        track = [(move[1], move[2])]
+        node = root_node.moves[move][0]
+        while True:
+            id = node.act_id
+            children = [c for m, (c, p) in node.moves.items() if c is not None]
+            if len(children) == 0:
+                break
+            node = max(children, key=lambda x: get_max_difference(x.value, id))
+            track.append((node.from_move[1], str(node.from_move[2]), str(id), node, node.value))
+
+        return track
+
+    def one_search(self, root_node):
+        select_node, move = self.select(root_node)
+        if self.log:
+            print('select')
+            print(select_node, move)
+
+        node_dict = self.expand(select_node, move)
+        if self.log:
+            print('expand')
+            print(node_dict)
+
+        choose_node = self.choose(node_dict)
+        if self.log:
+            print('choose')
+            print(choose_node.state, choose_node.act_id)
+
+        result = self.simulate(choose_node)
+        if self.log:
+            print(result)
+
+        self.backup(choose_node, result)
+
+    def select(self, root_node):
+        c_node = root_node
+        while True:
+            if c_node.is_end():
+                return c_node, None
+            if not c_node.is_fully_expanded():
+                return c_node, c_node.get_unexpanded_move()
+            node = self.jump(c_node)
+
+            if node.act_id != self.id:
+                return node, None
+            else:
+                c_node = node
+
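+    # Tree policy: jump() ranks children with a UCT-style score Q + p + N,
+    # where Q is this agent's score margin over the best opponent
+    # (normalised by the child's best value entry), p is the move's prior
+    # from get_pre_prob, and N is the exploration term
+    # sqrt(2 * ln(parent visits) / child visits) (MAX for unvisited
+    # children). The best child is picked, then the same scoring is applied
+    # one ply deeper, so each jump descends two moves unless the chosen
+    # child has no successors.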
+    def jump(self, node):
+        if self.log:
+            print('jump')
+            print(node)
+        node_v_para = 2 * math.log(node.visited)
+        uct_dict = {}
+        for m, (c, p) in node.moves.items():
+            Q = get_max_difference(c.value, self.id) / max(c.value) if max(c.value) != 0 else 0
+            N = ((node_v_para / c.visited) ** (1 / 2)) if c.visited != 0 else MAX
+            uct_value = Q + p + N
+            # uct_value = p + N
+            uct_dict[c] = uct_value
+        uc_node = randomMax(uct_dict)
+        uc_node_v_para = 2 * math.log(uc_node.visited) if uc_node.visited != 0 else 1
+
+        uct_dict = {}
+        for m, (c, p) in uc_node.moves.items():
+            Q = get_max_difference(c.value, self.id) / max(c.value) if max(c.value) != 0 else 0
+            N = ((uc_node_v_para / c.visited)) ** (1 / 2) if c.visited != 0 else MAX
+            uct_value = Q + p + N
+            uct_dict[c] = uct_value
+
+        if len(uct_dict) == 0:
+            if self.log:
+                print('reach the end, jump to the uc_node')
+                print(uc_node)
+            return uc_node
+        jump_node = randomMax(uct_dict)
+        if self.log:
+            print('normal jump to the node')
+            print(jump_node)
+        return jump_node
+
+    def generate_node(self, p_node, move):
+        state = copy.deepcopy(p_node.state)
+        state.ExecuteMove(p_node.act_id, move)
+        parent = p_node
+        f_move = move
+        act_id = p_node.act_id + 1 if p_node.act_id < len(self.player_order) - 1 else 0
+        moves = self.get_pre_prob(state, state.players[act_id].GetAvailableMoves(state), act_id, self.player_order)
+        i_r = Instant_reward()
+        return Node(state, parent, f_move, moves, act_id, i_r, self.tree)
+
+    def expand(self, node, move):
+        default = {}
+        default[node] = (node, 1)
+        if move is None:
+            return default
+        uc_node = self.generate_node(node, move)
+        moves = uc_node.moves
+        if self.log:
+            print('expanding')
+            print('uc_node')
+            print(uc_node.state)
+        node_dict = {}
+        for m, (c, p) in moves.items():
+            c_node = self.generate_node(uc_node, m)
+            node_dict[c_node] = (c, p)
+            if self.log:
+                print('c node')
+                print(c_node.state)
+
+        if len(node_dict) == 0:
+            return default
+        return node_dict
+
+    def choose(self, nodes_prob_dict):
+        nodes_list = [(k, v) for k, v in nodes_prob_dict.items()]
+        p = np.array([v[1] for k, v in nodes_list])
+        index = np.random.choice([i for i in range(len(p))], p=p.ravel())
+        node, _ = nodes_list[index]
+        return node
+
+    def simulate(self, node):
+        state = copy.deepcopy(node.state)
+        player_count = len(self.player_order)
+
+        if SIMU_LEARNING:
+            players = [Simu_Player(0), Naive_Simu_Player(1)]
+        else:
+            players = [Simu_Player(i) for i in range(player_count)]
+
+        act_id = node.act_id
+        while state.TilesRemaining():
+            if self.log:
+                print('id', act_id)
+                print('before')
+                print(state.detail_str())
+            move = players[act_id].SelectMove(None, state)
+            state.ExecuteMove(act_id, move)
+            act_id = act_id + 1 if act_id + 1 < player_count else 0
+
+        if self.log:
+            print('simulate over')
+        state.ExecuteEndOfRound()
+        reward = [0] * player_count
+        for i, plr in enumerate(state.players):
+            reward[i] = state.players[i].score
+        game_continuing = True
+        for i in range(player_count):
+            plr_state = state.players[i]
+            completed_rows = plr_state.GetCompletedRows()
+            if completed_rows > 0:
+                game_continuing = False
+                break
+        if not game_continuing:
+            for i in range(player_count):
+                state.players[i].EndOfGameScore()
+                reward[i] = state.players[i].score
+        else:
+            for i, plr in enumerate(state.players):
+                expectation_score = eval(self.agent.using_reward)(state, i, self.player_order).get_round_expection()
+                reward[i] = state.players[i].score + expectation_score
+        return reward
+
+    def backup(self, node, result):
+        update_node = node
+        update_node.update(self.id, result)
+
+        while True:
+            update_node = update_node.parent
+            if update_node is None:
+                break
+            update_node.update(self.id)
+
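+    # Move priors: every candidate move gets a prior probability that steers
+    # both expansion and the UCT term above. By default the reward estimator
+    # scores the filtered moves and the top FIRST_SEARCH (own turn) or
+    # FOE_SEARCH (opponent turn) moves share a softmax over their reward
+    # estimates. With USE_LEARNING the Net_model ranks the opponent's moves
+    # and the best-ranked ones receive fixed reference priors; with USE_NAIVE
+    # the opponent is assumed, 70% of the time, to play the naive
+    # most-tiles-to-pattern-line move with prior 1.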
+    def get_pre_prob(self, game_state, moves, act_id, player_order):
+        threshold_most = FOE_SEARCH if act_id != self.id else FIRST_SEARCH
+        # threshold_impo = 4
+
+        ft_moves = self.agent.filtering_moves(game_state.players[act_id], moves)
+        move_dict = {}
+        move_prob_dict = {}
+
+        if USE_LEARNING and act_id != self.id and len(moves) > 0:
+            f_list = []
+            r = eval(self.agent.using_reward)(game_state, act_id, player_order)
+            for move in moves:
+                r.estimate(move)
+                f_list.append(r.get_features())
+            for i in range(len(f_list), 150):
+                f_list.append([0, 0, 0, 0, 0, 0])
+            results_prob = Net_model().perdict([f_list])[0][:len(moves)]
+
+            prob_list = sorted(enumerate(results_prob), key=lambda x: x[1], reverse=True)[:min(len(moves), threshold_most)]
+            # prob = sum([prob_list[i][1] for i in range(len(prob_list))])
+            prob_reference = [0.7, 0.1, 0.1, 0.5]
+            prob = sum(prob_reference[:min(len(moves), threshold_most)])
+
+            for i in range(min(len(moves), threshold_most)):
+                # move_prob_dict[moves[prob_list[i][0]]] = None, prob_list[i][1] / prob
+                move_prob_dict[moves[prob_list[i][0]]] = None, prob_reference[i] / prob
+
+            # most_to_line = -1
+            # corr_to_floor = 0
+            # best_move = None
+            #
+            # for mid, fid, tgrab in moves:
+            #     if most_to_line == -1:
+            #         best_move = (mid, fid, tgrab)
+            #         most_to_line = tgrab.num_to_pattern_line
+            #         corr_to_floor = tgrab.num_to_floor_line
+            #         continue
+            #
+            #     if tgrab.num_to_pattern_line > most_to_line:
+            #         best_move = (mid, fid, tgrab)
+            #         most_to_line = tgrab.num_to_pattern_line
+            #         corr_to_floor = tgrab.num_to_floor_line
+            #     elif tgrab.num_to_pattern_line == most_to_line and \
+            #             tgrab.num_to_pattern_line < corr_to_floor:
+            #         best_move = (mid, fid, tgrab)
+            #         most_to_line = tgrab.num_to_pattern_line
+            #         corr_to_floor = tgrab.num_to_floor_line
+            # if moves[prob_list[0][0]] == best_move:
+            #     CORRECT.append(1)
+            #     print('************************', len(CORRECT))
+            # else: print('&&&&&&&&&&&&&&&&&&&&&&')
+        elif USE_NAIVE and act_id != self.id and len(moves) > 0 and random.random() <= 0.7:
+            most_to_line = -1
+            corr_to_floor = 0
+            best_move = None
+
+            for mid, fid, tgrab in moves:
+                if most_to_line == -1:
+                    best_move = (mid, fid, tgrab)
+                    most_to_line = tgrab.num_to_pattern_line
+                    corr_to_floor = tgrab.num_to_floor_line
+                    continue
+
+                if tgrab.num_to_pattern_line > most_to_line:
+                    best_move = (mid, fid, tgrab)
+                    most_to_line = tgrab.num_to_pattern_line
+                    corr_to_floor = tgrab.num_to_floor_line
+                elif tgrab.num_to_pattern_line == most_to_line and \
+                        tgrab.num_to_floor_line < corr_to_floor:
+                    best_move = (mid, fid, tgrab)
+                    most_to_line = tgrab.num_to_pattern_line
+                    corr_to_floor = tgrab.num_to_floor_line
+
+            move_prob_dict[best_move] = None, 1
+
+        else:
+            for move in ft_moves:
+                reward, score_list = self.agent.get_place_reward(game_state, move, act_id, player_order)
+                move_dict[move] = reward, score_list
+            move_tuple = sorted(move_dict.items(), key=lambda x: x[1][0], reverse=True)[:threshold_most] if len(
+                move_dict) > threshold_most else move_dict.items()
+
+            sum_reward = sum([math.e ** m[1][0] for m in move_tuple])
+            for i, m in enumerate(move_tuple):
+                move_prob_dict[m[0]] = None, math.e ** m[1][0] / sum_reward
+
+        return move_prob_dict
+
+
+class Instant_reward:
+    def __init__(self, reward=0, info=None):
+        if info is None:
+            info = {}
+        self.reward = reward
+        self.info = info
+
+    def to_tuple(self):
+        return self.reward, self.info
+
+
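+# A node of the search tree. `moves` maps each candidate move to a
+# (child, prior) pair, child being None until expanded. `value` holds one
+# averaged score per player: leaves average their simulation results, while
+# interior nodes adopt the visited child's vector with the best margin for
+# the player to act there.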
+class Node:
+    def __init__(self, game_state, parent, from_move, moves, act_id, instant_reward, tree):
+        self.state = game_state
+        self.parent = parent
+        self.from_move = from_move
+        if self.parent is not None:
+            # print(self.parent.moves[from_move])
+            self.parent.moves[from_move] = (self, self.parent.moves[from_move][1])
+            peers = [c for m, (c, p) in self.parent.moves.items()]
+            assert self in peers
+        self.act_id = act_id
+        self.value = [0] * len(game_state.players)
+        self.instant_reward = instant_reward
+        tree.append(self)
+        self.moves = moves
+        self.visited = 0
+        self.name = 'n' + str(len(tree))
+
+    def is_fully_expanded(self):
+        for m, (c, p) in self.moves.items():
+            if c is None:
+                return False
+        return True
+
+    def get_unexpanded_move(self):
+        unexp_dict = {}
+        for m, (c, p) in self.moves.items():
+            if c is None:
+                unexp_dict[m] = p
+        unexp_prob = sum(unexp_dict.values())
+        assert len(unexp_dict) > 0
+        # print(unexp_dict.values())
+        for m, p in unexp_dict.items():
+            unexp_dict[m] = p / unexp_prob
+        # print(sum(unexp_dict.values()))
+        unexp_m_list = [(k, v) for k, v in unexp_dict.items()]
+        p = np.array([v for k, v in unexp_m_list])
+        # print(p)
+        index = np.random.choice([i for i in range(len(p))], p=p.ravel())
+        m, _ = unexp_m_list[index]
+        return m
+
+    def is_end(self):
+        return not self.state.TilesRemaining()
+
+    def update(self, agent_id, result=None):
+        self.visited += 1
+        if result is not None:
+            for i in range(len(self.value)):
+                self.value[i] = (self.value[i] * (self.visited - 1) + result[i]) / self.visited
+            return
+        # if self.act_id == agent_id:
+        value_list = []
+        for m, (c, p) in self.moves.items():
+            if c is None or c.visited == 0:
+                continue
+            value = c.value.copy()
+            value_list.append(value)
+        value_list = sorted(value_list, key=lambda x: get_max_difference(x, self.act_id), reverse=True)
+        # value_list = sorted(value_list, key=lambda x: (get_max_difference(x, self.act_id) + x[self.act_id]), reverse=True)
+
+        self.value = value_list[0]
+        # else:
+        #     value = [0] * len(self.value)
+        #     for m, (c, p) in self.moves.items():
+        #         for i in range(len(self.value)):
+        #             value[i] += c.value[i] * p
+        #     self.value = value
+
+    def setGuiNode(self, node):
+        self.node = node
+
+    def getGuiNode(self):
+        return self.node
+
+
+class Rand_Player(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+
+    def SelectMove(self, moves, game_state):
+        i_moves = game_state.players[self.id].GetAvailableMoves(game_state)
+        ft_moves = self.filtering_moves(game_state.players[self.id], i_moves)
+        move = random.choice(ft_moves)
+        return move
+
+
+class Simu_Player(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+
+    def SelectMove(self, moves, game_state):
+        player_order = []
+        for i in range(self.id + 1, len(game_state.players)):
+            player_order.append(i)
+        for i in range(0, self.id + 1):
+            player_order.append(i)
+
+        i_moves = game_state.players[self.id].GetAvailableMoves(game_state)
+
+        ft_moves = self.filtering_moves(game_state.players[self.id], i_moves)
+        move_dict = {}
+        for m in ft_moves:
+            r = self.get_place_reward(game_state, m, self.id, player_order)
+            move_dict[m] = r
+            # print(m, r[0])
+        move = max(move_dict.items(), key=lambda x: x[1][0])[0]
+        # print(move)
+        return move
+
+
+class NN_Predict_Player(RewardBasedPlayer):
+    def __init__(self, _id):
+        super().__init__(_id)
+
+    def SelectMove(self, moves, game_state):
+        player_order = []
+        for i in range(self.id + 1, len(game_state.players)):
+            player_order.append(i)
+        for i in range(0, self.id + 1):
+            player_order.append(i)
+
+        i_moves = game_state.players[self.id].GetAvailableMoves(game_state)
+        ft_moves = self.filtering_moves(game_state.players[self.id], i_moves)
+
+        f_list = []
+        r = eval(self.using_reward)(game_state, self.id, player_order)
+        for move in ft_moves:
+            r.estimate(move)
+            f_list.append(r.get_features())
+        for i in range(len(f_list), 150):
+            f_list.append([0, 0, 0, 0, 0, 0])
+
+        results_prob = Net_model().perdict([f_list])[0][:min(len(ft_moves), 1)]
+
+        prob_list = sorted(enumerate(results_prob), key=lambda x: x[1], reverse=True)
+
+        return ft_moves[prob_list[0][0]]
+
+
+class Naive_Simu_Player(Player):
+    def __init__(self, _id):
+        super().__init__(_id)
+
+    def SelectMove(self, moves, game_state):
+        # Select move that involves placing the most number of tiles
+        # in a pattern line. Tie break on number placed in floor line.
+        if moves is None:
+            moves = game_state.players[self.id].GetAvailableMoves(game_state)
+        most_to_line = -1
+        corr_to_floor = 0
+
+        best_move = None
+
+        # print(game_state.bag)
+        # print(game_state.bag_used)
+        # print(game_state.factories)
+        # print(game_state.centre_pool)
+        #
+        # print(moves)
+
+        for mid, fid, tgrab in moves:
+            if most_to_line == -1:
+                best_move = (mid, fid, tgrab)
+                most_to_line = tgrab.num_to_pattern_line
+                corr_to_floor = tgrab.num_to_floor_line
+                continue
+
+            if tgrab.num_to_pattern_line > most_to_line:
+                best_move = (mid, fid, tgrab)
+                most_to_line = tgrab.num_to_pattern_line
+                corr_to_floor = tgrab.num_to_floor_line
+            elif tgrab.num_to_pattern_line == most_to_line and \
+                    tgrab.num_to_floor_line < corr_to_floor:
+                best_move = (mid, fid, tgrab)
+                most_to_line = tgrab.num_to_pattern_line
+                corr_to_floor = tgrab.num_to_floor_line
+
+        return best_move