I have attempted to implement Q-learning into a simple game I have written. The game is based around the player having to "jump" to avoid oncoming boxes.
I have designed the system with two actions: jump and do_nothing. The states are the distances from the next block, scaled down and floored so that there are not a large number of states (for example, with a resolution of 0.1, a raw distance of 137 pixels becomes state Math.floor(137 * 0.1) = 13).
My issue seems to be that my implementation of the algorithm isn't considering "future reward", and so it ends up jumping at the wrong times.
Here is my implementation of the Q-learning algorithm:
JumpGameAIClass.prototype.getQ = function getQ(state) {
    // Lazily create a row of zero-initialised Q values for unseen states.
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};
        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];
            this.Q[state][action] = 0;
        }
    }
    return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    // Distance from the player to the nearest block still ahead,
    // scaled by `resolution` and floored to keep the state space small.
    var closest = -1;
    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];
        var distance = block.x - this.playerX;
        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }
    return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];
    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        // Tie: if we can't jump anyway, do nothing; otherwise pick at random.
        if (!this.canJump()) {
            return this.actions[1];
        }
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    // We can't jump while in mid-air
    if (!this.canJump()) {
        return this.actions[1];
    }
    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};
JumpGameAIClass.prototype.think = function think() {
    // Work out the reward earned by the action taken on the previous frame.
    var reward = this.liveReward;
    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }
    this.drawDistance();
    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];
    // Q-learning update for the previous (state, action) pair.
    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
    // Choose and remember the action for this frame.
    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;
    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};
And here are some of the properties used by it:
epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0
I have to use lastAction/lastDistance to update Q, as I cannot use the current frame's data: the reward I observe on a given frame belongs to the action performed on the previous frame (for example, if the player dies now, the -1000 penalty should be credited to the action chosen at lastDistance, not to whatever action I pick next).
The think method is called once every frame, after all rendering and game logic is done (physics, controls, death, etc.).
var JumpGameAIClass = function JumpGame(canvas) {
    Game.JumpGame.call(this, canvas);
    Object.defineProperties(this, {
        epsilon: {
            value: 0.05
        },
        alpha: {
            value: 1
        },
        gamma: {
            value: 1
        },
        resolution: {
            value: 0.1
        },
        actions: {
            value: [ 'jump', 'do_nothing' ]
        },
        Q: {
            value: { },
            writable: true
        },
        liveReward: {
            value: 0
        },
        scoreReward: {
            value: 100
        },
        deathReward: {
            value: -1000
        },
        lastAction: {
            value: 'do_nothing',
            writable: true
        },
        lastDistance: {
            value: 0,
            writable: true
        },
        lastScore: {
            value: 0,
            writable: true
        }
    });
};
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);
JumpGameAIClass.prototype.getQ = function getQ(state) {
    if (!this.Q.hasOwnProperty(state)) {
        this.Q[state] = {};
        for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
            var action = this.actions[actionIndex];
            this.Q[state][action] = 0;
        }
    }
    return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
    var closest = -1;
    for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
        var block = this.blocks[blockIndex];
        var distance = block.x - this.playerX;
        if (distance >= 0 && (closest === -1 || distance < closest)) {
            closest = distance;
        }
    }
    return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
    var jumpReward = this.getQ(distance)[this.actions[0]];
    var doNothingReward = this.getQ(distance)[this.actions[1]];
    if (jumpReward > doNothingReward) {
        return this.actions[0];
    } else if (doNothingReward > jumpReward) {
        return this.actions[1];
    } else {
        if (!this.canJump()) {
            return this.actions[1];
        }
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    }
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
    if (!this.canJump()) {
        return this.actions[1];
    }
    if (Math.random() < this.epsilon) {
        return this.actions[Math.floor(Math.random() * this.actions.length)];
    } else {
        return this.getActionWithHighestQ(this.getBlockDistance());
    }
};
JumpGameAIClass.prototype.onDeath = function onDeath() {
    this.restart();
};
JumpGameAIClass.prototype.think = function think() {
    var reward = this.liveReward;
    if (this.score !== this.lastScore) {
        this.lastScore = this.score;
        reward = this.scoreReward;
    } else if (!this.playerAlive) {
        reward = this.deathReward;
    }
    this.drawDistance();
    var distance = this.getBlockDistance(),
        maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
        previousQ = this.getQ(this.lastDistance)[this.lastAction];
    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
    this.lastAction = this.getActionEpsilonGreedy();
    this.lastDistance = distance;
    switch (this.lastAction) {
        case this.actions[0]:
            this.jump();
            break;
    }
};
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
    this.context.save();
    this.context.textAlign = 'center';
    this.context.textBaseline = 'bottom';
    this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);
    this.context.textBaseline = 'top';
    this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);
    this.context.restore();
};
JumpGameAIClass.prototype.onFrame = function onFrame() {
    Game.JumpGame.prototype.onFrame.apply(this, arguments);
    this.think();
};
Game.JumpGameAI = JumpGameAIClass;
body {
    background-color: #EEEEEE;
    text-align: center;
}
canvas#game {
    background-color: #FFFFFF;
    border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
    <head>
        <title>jump</title>
    </head>
    <body>
        <canvas id="game" width="512" height="512">
            <h1>Your browser doesn't support canvas!</h1>
        </canvas>
        <script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
        <!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
        <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
        <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
        <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
        <script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
    </body>
</html>
You basically have a simplified version of Flappy Bird RL.
Source: Flappy Bird RL
I used the values:
epsilon: {
    value: 0.01
},
alpha: {
    value: 0.7
},
gamma: {
    value: 0.9
},
resolution: {
    value: 0.1
},
liveReward: {
    value: 10
},
scoreReward: {
    value: -100
},
deathReward: {
    value: 1000
},
It had no trouble getting beyond 100 within the first 20 attempts.
Q-learning can be described by the relation

Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))

where

r(s, a) = r = immediate reward
gamma = relative value of delayed vs. immediate rewards (0 to 1)
s' = the new state after action a
a = action in state s
a' = action in state s'
You should execute it as follows:

1. Select an action a and execute it
2. Receive the immediate reward r
3. Observe the new state s'
4. Update the table entry for Q(s, a) using the rule above
5. Set s = s' and repeat
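A minimal sketch of that loop, reusing the qUpdate helper above (greedyAction and the step callback are illustrative names, not part of your game; step stands for whatever advances the game one frame for a given action and reports the reward and next state):

// Pick the action with the highest Q value for a state (ties go to the first action).
function greedyAction(qTable, actions, state) {
    var best = actions[0];
    var bestQ = (qTable[state] && qTable[state].hasOwnProperty(best)) ? qTable[state][best] : 0;
    for (var i = 1; i < actions.length; i++) {
        var q = (qTable[state] && qTable[state].hasOwnProperty(actions[i])) ? qTable[state][actions[i]] : 0;
        if (q > bestQ) {
            bestQ = q;
            best = actions[i];
        }
    }
    return best;
}

// One full Q-learning step: select, execute, observe, update, advance.
function qLearningStep(agent, state, step) {
    var action = Math.random() < agent.epsilon
        ? agent.actions[Math.floor(Math.random() * agent.actions.length)]  // 1. explore
        : greedyAction(agent.qTable, agent.actions, state);                // 1. exploit
    var result = step(action);                 // 2. receive reward r, 3. observe new state s'
    qUpdate(agent.qTable, agent.actions, state, action,
            result.reward, result.nextState, agent.alpha, agent.gamma);    // 4. update Q(s, a)
    return result.nextState;                   // 5. s <- s'
}

In your code the "execute" and "receive reward" parts are split across two frames, which is why you need lastDistance/lastAction; the important part is that the update always pairs the reward with the state and action that actually produced it.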