artificial-intelligencegame-aiq-learning

Q-learning in game not working as expected


I have attempted to implement Q-learning in to a simple game I have written. The game is based around the player having to "jump" to avoid oncoming boxes.

I have designed the system with two actions; jump and do_nothing and the states are the distances from the next block (divided and floored to ensure that there are not a large number of states).

My issue seems to be that my implementation of the algorithm isn't considering "future reward", and so it ends up jumping at the wrong times.

Here is my implementation of the Q-learning algorithm;

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];

            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];

        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // We can't jump while in mid-air
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};

And here are some of the properties used by it:

epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0

I am having to use lastAction/lastDistance to calculate Q, as I cannot use the current data (would be acting on the action performed in the frame before).

The think method is called once every frame after all rendering and game stuff is done (physics, controls, death, etc).

var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);

    Object.defineProperties(this, {
        epsilon: {
            value: 0.05
        },

        alpha: {
            value: 1
        },

        gamma: {
            value: 1
        },

        resolution: {
            value: 0.1
        },

        actions: {
            value: [ 'jump', 'do_nothing' ]
        },

        Q: {
            value: { },
            writable: true
        },

        liveReward: {
            value: 0
        },

        scoreReward: {
            value: 100
        },

        deathReward: {
            value: -1000
        },

        lastAction: {
            value: 'do_nothing',
            writable: true
        },

        lastDistance: {
            value: 0,
            writable: true
        },

        lastScore: {
            value: 0,
            writable: true
        }
    });
};

JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);

JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};

        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];

            this.Q[state][action] = 0;
        }
    }

    return this.Q[state];
};

JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;

    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];

        var distance = block.x - this.playerX;

        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }

    return Math.max(0, Math.floor(closest * this.resolution));
};

JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];

    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }

        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};

JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (!this.canJump()) {
        return this.actions[1];
    }

    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};

JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};

JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;

    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }

    this.drawDistance();

    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];

    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);

    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;

    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};

JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    this.context.save();

    this.context.textAlign = 'center';
    this.context.textBaseline = 'bottom';

    this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.textBaseline = 'top';

    this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);

    this.context.restore();
};

JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);

    this.think();
}

Game.JumpGameAI = JumpGameAIClass;
body {
    background-color: #EEEEEE;
    text-align: center;
}

canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
    <title>jump</title>
</head>
<body>
    <canvas id="game" width="512" height="512">
        <h1>Your browser doesn't support canvas!</h1>
    </canvas>
  
    <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
  
    <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
  
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
    <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>


Solution

  • You basically have simplified version of :

    enter image description here

    Source: Flappy Bird RL

    I used values :

        epsilon: {
            value: 0.01
        },
        alpha: {
            value: 0.7
        },
        gamma: {
            value: 0.9
        },
        resolution: {
            value: 0.1
        },  
        liveReward: {
            value: 10
        },
        scoreReward: {
            value: -100
        },
        deathReward: {
            value: 1000
        },
    

    It had no trouble of getting beyond 100 in first 20 attempts.


    Q-learning can be described with temporal logic

    Q(s, a)=r(s,a)+gamma*max_a'(Q(s', a'))
    

    Where

    You should execute it as

    Select an action a and execute it

    1. For each state-action pair (s, a), initialize the table entry Q(s, a) to zero
    2. Observe the current state s
    3. Do forever:
      • Select an action a and execute it
      • Receive immediate reward r aka Q(s, a)
      • Observe the new state s'
      • Update the table entry for Q(s, a)=r(s,a)+gamma*max_a'(Q(s', a'))
      • s=s'