from __future__ import print_function

import datetime
import random
import time

import numpy as np
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Dense
from keras.models import Sequential

# 7 x 7 maze layout.
# 1.0 marks a free cell, 0.0 marks an occupied (wall) cell.
maze_map = np.array([
    [1., 0., 1., 1., 1., 1., 1.],
    [1., 1., 1., 0., 0., 1., 0.],
    [0., 0., 0., 1., 1., 1., 0.],
    [1., 1., 1., 1., 0., 0., 1.],
    [1., 0., 0., 0., 1., 1., 1.],
    [1., 0., 1., 1., 1., 1., 1.],
    [1., 1., 1., 0., 1., 1., 1.]
])

# Cell values used when rendering the maze: visited_mark flags cells the rat
# has already been on, rat_mark flags the rat's current position.
visited_mark = 0.8
rat_mark = 0.5
# Action codes; the greedy policy uses the argmax of the model output as the
# action, so these also index the model's output vector.
LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3

# Human-readable name for every action code
actions_dict = {
    LEFT: 'left',
    UP: 'up',
    RIGHT: 'right',
    DOWN: 'down',
}

num_actions = len(actions_dict)

# Exploration factor: probability that the agent picks a random valid action
# instead of the model's best guess (0.1 = on average one move in ten).
epsilon = 0.1


# The rat starts at (0, 0) by default.
# The cheese sits in the bottom-right cell.
# The rat may start from any free cell except the cheese cell and may only travel over free cells.
class Game(object):
    """Maze environment in which a rat walks over free cells to reach the cheese.

    The cheese always sits in the bottom-right cell. Cells equal to 0.0 are
    walls; any other value is walkable.

    Args:
        maze_map: 2-D array-like, 1.0 for free cells and 0.0 for walls.
        rat: (row, col) start cell; must be a free cell other than the cheese.
        verbose: when true (the default) every move and every observation is
            printed to stdout.

    Raises:
        ValueError: if the cheese cell is a wall or ``rat`` is not a free cell.
    """

    def __init__(self, maze_map, rat=(0, 0), verbose=True):
        # Pristine float copy of the layout. reset() rebuilds the working map
        # from it so marks of earlier games never accumulate, and the float
        # dtype keeps rat_mark (0.5) from being truncated to a wall (0).
        self._maze = np.array(maze_map, dtype=float)
        self.maze_map = np.copy(self._maze)
        self.verbose = verbose
        nrows, ncols = self._maze.shape
        self.cheese = (nrows - 1, ncols - 1)  # target cell where the "cheese" is
        # Validate the target before it is excluded from the free cells,
        # otherwise a blocked target surfaces as an unrelated lookup error.
        if self._maze[self.cheese] == 0.0:
            raise ValueError("Invalid maze: target cell cannot be blocked!")
        self.free_cells = [
            (r, c)
            for r in range(nrows)
            for c in range(ncols)
            if self._maze[r, c] == 1.0 and (r, c) != self.cheese
        ]
        if rat not in self.free_cells:
            raise ValueError("Invalid Rat Location: must sit on a free cell")
        self.reset(rat)

    def _log(self, *args):
        """Print ``args`` when the game is verbose."""
        if self.verbose:
            print(*args)

    def reset(self, rat):
        """Start a new game with the rat on cell ``rat`` (row, col)."""
        self.rat = rat
        self.maze_map = np.copy(self._maze)
        row, col = rat
        self.maze_map[row, col] = rat_mark
        self.state = (row, col, 'start')  # (row, col, mode)
        # Negative threshold on the accumulated reward to avoid endless games
        self.min_reward = -0.5 * self.maze_map.size
        self.total_reward = 0
        self.visited = set()

    def update_state(self, action):
        """Apply ``action`` and store the resulting (row, col, mode) state.

        The mode becomes 'blocked' when no move is possible at all, 'valid'
        when the move was carried out and 'invalid' when the move would leave
        the grid or hit a wall (the rat then stays where it is).
        """
        row, col, mode = self.state

        if self.maze_map[row, col] > 0.0:
            self.visited.add((row, col))  # mark the cell the rat is leaving

        # action -> (label printed for the move, row offset, column offset)
        moves = {
            LEFT: ("LEFT", 0, -1),
            UP: ("UP", -1, 0),
            RIGHT: ("RIGHT", 0, 1),
            DOWN: ("DOWN", 1, 0),
        }
        valid_actions = self.valid_actions()

        if not valid_actions:
            mode = 'blocked'
            self._log(mode)
        elif action in valid_actions:
            mode = 'valid'
            label, d_row, d_col = moves[action]
            self._log(label)
            row += d_row
            col += d_col
        else:  # invalid action, no change in rat position
            mode = 'invalid'
            self._log(mode)

        self.state = (row, col, mode)

    def get_reward(self):
        """Return the reward (or penalty) earned by the current state.

        1.0 for reaching the cheese, ``min_reward - 1`` when blocked, -0.75
        for an invalid move, -0.25 for stepping onto an already visited cell
        and -0.04 for any other move.
        """
        row, col, mode = self.state
        if (row, col) == self.cheese:
            return 1.0
        if mode == 'blocked':
            return self.min_reward - 1
        # Must be tested before the visited check: after an invalid move the
        # rat still stands on a cell that was just marked as visited.
        if mode == 'invalid':
            return -0.75
        if (row, col) in self.visited:
            return -0.25
        return -0.04

    def act(self, action):
        """Execute ``action`` and return ``(envstate, reward, status)``."""
        self._log(actions_dict)
        self._log(action)
        self.update_state(action)
        reward = self.get_reward()
        self.total_reward += reward
        status = self.game_status()
        envstate = self.observe()
        return envstate, reward, status

    def observe(self):
        """Return the maze as a (1, rows * cols) array.

        Visited cells carry ``visited_mark`` and the rat's cell carries
        ``rat_mark``; the same picture is printed when the game is verbose.
        """
        env = self.draw_env()
        for visited_cell in self.visited:
            env[visited_cell] = visited_mark
        row, col, _ = self.state
        env[row, col] = rat_mark  # the rat wins over a visited mark
        self._log(env)
        self._log()
        return env.reshape((1, -1))

    def draw_env(self):
        """Return a copy of the maze showing only walls, free cells and the rat."""
        env = np.copy(self.maze_map)
        env[env > 0.0] = 1.0  # clear all visual marks
        row, col, _ = self.state
        env[row, col] = rat_mark
        return env

    def game_status(self):
        """Return 'lose', 'win' or 'not_over'.

        The game is lost once the accumulated reward drops below
        ``min_reward`` and won when the rat stands on the cheese cell.
        """
        if self.total_reward < self.min_reward:
            return 'lose'
        row, col, _ = self.state
        if (row, col) == self.cheese:
            return 'win'
        return 'not_over'

    def valid_actions(self, cell=None):
        """Return the actions (ascending codes) that lead to a walkable cell.

        Args:
            cell: (row, col) to inspect; defaults to the rat's current cell.
        """
        if cell is None:
            row, col, _ = self.state
        else:
            row, col = cell
        nrows, ncols = self.maze_map.shape
        # Listed in ascending action-code order so the result stays sorted
        moves = ((LEFT, 0, -1), (UP, -1, 0), (RIGHT, 0, 1), (DOWN, 1, 0))
        actions = []
        for action, d_row, d_col in moves:
            r, c = row + d_row, col + d_col
            inside = 0 <= r < nrows and 0 <= c < ncols
            if inside and self.maze_map[r, c] != 0.0:
                actions.append(action)
        return actions


# represent the Experience of game states
class Experience(object):
    """Replay memory that turns stored episodes into Q-learning training data.

    Args:
        model: network exposing ``predict`` and ``output_shape``; the last
            entry of ``output_shape`` is taken as the number of actions.
        max_memory: maximum number of episodes kept; the oldest is dropped first.
        discount: factor applied to the best Q-value of the next state.
    """

    def __init__(self, model, max_memory=100, discount=0.95):
        self.model = model
        self.max_memory = max_memory
        self.discount = discount
        self.memory = list()
        self.num_actions = model.output_shape[-1]

    def remember(self, episode):
        """Store ``episode`` and evict the oldest one when the memory is full.

        ``episode`` is ``[envstate, action, reward, envstate_next, game_over]``.
        """
        self.memory.append(episode)
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def predict(self, envstate):
        """Return the model output for a single (1, env_size) state."""
        return self.model.predict(envstate)[0]

    def get_data(self, data_size=10):
        """Sample episodes and build ``(inputs, targets)`` for ``model.fit``.

        Up to ``data_size`` distinct episodes are drawn at random. The target
        row of an episode is the model's current prediction for its state,
        with the entry of the action taken replaced by the reward (terminal
        step) or by ``reward + discount * max_a' Q(s', a')``.

        Raises:
            IndexError: if nothing has been remembered yet.
        """
        env_size = self.memory[0][0].shape[1]
        mem_size = len(self.memory)
        data_size = min(mem_size, data_size)
        inputs = np.zeros((data_size, env_size))
        next_states = np.zeros((data_size, env_size))
        if data_size == 0:
            return inputs, np.zeros((data_size, self.num_actions))

        picks = np.random.choice(mem_size, data_size, replace=False)
        for i, j in enumerate(picks):
            inputs[i] = self.memory[j][0]
            next_states[i] = self.memory[j][3]

        # Two batched predictions instead of two model calls per sample.
        # Actions that were not taken keep the predicted value as target, so
        # they contribute no error.
        targets = np.array(self.model.predict(inputs), dtype=float)
        best_next = np.max(self.model.predict(next_states), axis=1)

        for i, j in enumerate(picks):
            _, action, reward, _, game_over = self.memory[j]
            if game_over:
                targets[i, action] = reward
            else:
                # reward + gamma * max_a' Q(s', a')
                targets[i, action] = reward + self.discount * best_next[i]
        return inputs, targets


# neural network
# optimizer = adam
# loss function = mse (Mean Squared Error)
# input layer has the same size as the maze
# two hidden layers, each of size equals to the maze size
# output layer size is the same as the number of actions
def build_model(maze_map, lr=0.001):
    """Build and compile the Q-network for ``maze_map``.

    The network takes the flattened maze as input, has two PReLU-activated
    dense layers as wide as the maze has cells, and outputs one value per
    action. It is compiled with the Adam optimizer and mean squared error.

    Args:
        maze_map: array whose ``size`` fixes the input and hidden layer width.
        lr: learning rate applied to the optimizer.

    Returns:
        The compiled Sequential model.
    """
    # Imported here so the file's top-level import block stays untouched
    from keras import backend as K

    n_cells = maze_map.size
    model = Sequential()
    model.add(Dense(n_cells, input_shape=(n_cells,)))
    model.add(PReLU())
    model.add(Dense(n_cells))
    model.add(PReLU())
    model.add(Dense(num_actions))
    model.compile(optimizer='adam', loss='mse')
    # The 'adam' alias builds the optimizer with its default rate, so the
    # requested rate is written afterwards; previously ``lr`` was ignored.
    # NOTE(review): relies on the optimizer exposing ``lr`` - confirm against
    # the installed Keras version.
    K.set_value(model.optimizer.lr, lr)
    return model


# training method for a neural network
def train(model, maze_map, **opt):
    """Train ``model`` by Q-learning on games played from cell (0, 0).

    After every move the episode is stored in the replay memory and the model
    is fitted on a random sample of it. Training stops after ``n_epoch`` games
    or as soon as the last ``hsize`` games (half the number of maze cells)
    were all won. The module-level ``epsilon`` is lowered to 0.05 once the win
    rate exceeds 0.9.

    Keyword options:
        n_epoch: maximum number of games (default 15000); ``epochs`` is
            accepted as an alias.
        max_memory: capacity of the replay memory (default 1000).
        data_size: number of episodes sampled per training step (default 50).

    Returns:
        Elapsed training time in seconds.
    """
    global epsilon
    # 'epochs' is honoured too: callers using that spelling used to fall back
    # silently to the 15000 default.
    n_epoch = opt.get('n_epoch', opt.get('epochs', 15000))
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    start_time = datetime.datetime.now()

    # Construct environment/game
    game = Game(maze_map)

    # Initialize experience
    experience = Experience(model, max_memory=max_memory)

    win_history = []  # 1 for every won game, 0 for every lost game
    hsize = game.maze_map.size // 2  # history window size
    win_rate = 0.0
    epoch = 0  # keeps the final report valid when n_epoch is 0

    for epoch in range(n_epoch):
        loss = 0.0
        rat_cell = (0, 0)
        game.reset(rat_cell)
        game_over = False

        envstate = game.observe()

        n_episodes = 0
        while not game_over:
            valid_actions = game.valid_actions()
            if not valid_actions:
                break
            prev_envstate = envstate

            # Explore with probability epsilon, otherwise follow the model
            if np.random.rand() < epsilon:
                action = random.choice(valid_actions)
            else:
                action = np.argmax(experience.predict(prev_envstate))

            # Apply action, get reward and new envstate
            envstate, reward, game_status = game.act(action)
            if game_status == 'win':
                win_history.append(1)
                game_over = True
            elif game_status == 'lose':
                win_history.append(0)
                game_over = True
            else:
                game_over = False

            # Store episode (experience)
            episode = [prev_envstate, action, reward, envstate, game_over]
            experience.remember(episode)
            n_episodes += 1

            # Train neural network model
            inputs, targets = experience.get_data(data_size=data_size)
            model.fit(
                inputs,
                targets,
                epochs=8,
                batch_size=16,
                verbose=0,
            )
            loss = model.evaluate(inputs, targets, verbose=0)

        # Wins within the last hsize games; the slice is the whole history
        # while fewer than hsize games have been played.
        recent_wins = sum(win_history[-hsize:])
        win_rate = float(recent_wins) / hsize

        dt = datetime.datetime.now() - start_time
        t = convert(dt.total_seconds())
        r_t = time.ctime()
        template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
        print(r_t, template.format(epoch, n_epoch - 1, loss, n_episodes, sum(win_history), win_rate, t))

        # win_rate is good enough to generate less random steps
        if win_rate > 0.9:
            epsilon = 0.05
        # winning the last hsize games in a row counts as fully trained
        if recent_wins == hsize:
            print("Reached 100%% win rate at epoch: %d" % (epoch,))
            break

    dt = datetime.datetime.now() - start_time
    t = convert(dt.total_seconds())
    r_t = time.ctime()
    print(r_t, "n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
    return dt.total_seconds()


# simulation of full game according to a given model
def play_game(model, game, rat_cell):
    """Play one game greedily with ``model`` and report whether it was won.

    The game is reset with the rat on ``rat_cell``; in every step the action
    with the highest predicted value is applied until the game reports 'win'
    (returns True) or 'lose' (returns False).
    """
    print("game has started")
    game.reset(rat_cell)
    current = game.observe()
    outcomes = {'win': True, 'lose': False}
    while True:
        # pick the best-rated action for the current state
        best_action = np.argmax(model.predict(current)[0])
        current, _, status = game.act(best_action)
        if status in outcomes:
            return outcomes[status]


# Python Program to Convert seconds
# into hours, minutes and seconds
def convert(seconds):
    """Format a duration in seconds as ``H:MM:SS``.

    Whole days are dropped (the value wraps around after 24 hours) and
    fractional seconds are truncated.
    """
    within_day = seconds % (24 * 3600)
    hour, remainder = divmod(within_day, 3600)
    minutes, secs = divmod(remainder, 60)
    return "%d:%02d:%02d" % (hour, minutes, secs)


if __name__ == '__main__':
    # Build the network, train it on the maze and replay one game from (0, 0).
    model = build_model(maze_map)
    # train() reads its epoch budget from the 'n_epoch' option.
    train(model, maze_map, n_epoch=1000, max_memory=7 * maze_map.size, data_size=32)
    game = Game(maze_map)
    play_game(model, game, (0, 0))
