I'm using a TensorFlow.js implementation of YOLOv7 object detection, but I can't wrap my head around how to correctly scale the bounding boxes and deal with their ratios. I get the correct spatial output in [x1, y1, width, height] format, but the boxes don't land on the exact locations in my image, because the model input image is 640x640:
[
[
371.74334716796875,
101.12919616699219,
19.002845764160156,
39.24892807006836
],
[
45.18428421020508,
181.0949249267578,
66.98155212402344,
74.84469604492188
],
[
405.8454284667969,
239.25437927246094,
92.49766540527344,
278.452880859375
],
[
292.5257873535156,
264.4200744628906,
77.10870361328125,
263.32293701171875
],
[
102.06099700927734,
249.17529296875,
71.49154663085938,
326.78228759765625
],
[
200.99879455566406,
256.24462890625,
118.14222717285156,
311.6120910644531
]
]
/**
 * Clears the canvas and draws one labelled rectangle per detection.
 *
 * Box coordinates are drawn exactly as returned by `xywh2xyxy`; no scaling is
 * applied, so they are interpreted in the canvas' own coordinate space.
 *
 * @param canvasRef - Object exposing `getContext("2d")` (a canvas element).
 * @param classThreshold - Accepted for interface compatibility; not read here.
 * @param boxes_data - One box per detection, passed to `xywh2xyxy`.
 * @param scores_data - One confidence per detection; shown as a percentage.
 * @param classes_data - One class index per detection, looked up in `labels`.
 * @param ratios - Accepted for interface compatibility; not read here.
 */
export const renderBoxes = (
canvasRef,
classThreshold,
boxes_data,
scores_data,
classes_data,
ratios
) => {
  const BOX_COLOR = "#B033FF";
  const TEXT_COLOR = "#ffffff";

  const ctx = canvasRef.getContext("2d");
  const canvasWidth = ctx.canvas.width;
  const canvasHeight = ctx.canvas.height;

  // Start from an empty canvas on every call.
  ctx.clearRect(0, 0, canvasWidth, canvasHeight);

  // Constructed but never read in this version of the renderer.
  const palette = new Colors();

  // Font size follows the longer canvas side, with a floor of 14px.
  const longestSide = Math.max(canvasWidth, canvasHeight);
  const fontPx = Math.max(Math.round(longestSide / 40), 14);
  const font = `${fontPx}px Arial`;
  ctx.font = font;
  ctx.textBaseline = "top";

  // Height of the label strip, recovered from the leading number of the font string.
  const textHeight = parseInt(font, 10);
  const stripHeight = textHeight + 2;

  for (let idx = 0; idx < scores_data.length; idx += 1) {
    const className = labels[classes_data[idx]];
    const percent = (scores_data[idx] * 100).toFixed(1);
    const caption = className + " - " + percent + "%";

    const corners = xywh2xyxy(boxes_data[idx]);
    const left = corners[0];
    const top = corners[1];
    const boxWidth = corners[2] - left;
    const boxHeight = corners[3] - top;

    // Outline of the detection.
    ctx.strokeStyle = BOX_COLOR;
    ctx.lineWidth = 2;
    ctx.strokeRect(left, top, boxWidth, boxHeight);

    // Filled strip sitting directly above the box's top-left corner.
    ctx.fillStyle = BOX_COLOR;
    const captionWidth = ctx.measureText(caption).width;
    ctx.fillRect(left - 1, top - stripHeight, captionWidth + 2, stripHeight);

    // Caption text on top of the strip.
    ctx.fillStyle = TEXT_COLOR;
    ctx.fillText(caption, left - 1, top - stripHeight);
  }
};
/**
 * Turns an image element into a model input tensor and reports how much each
 * axis was stretched by the square padding.
 *
 * Steps: read the pixels, zero-pad on the right/bottom so the image becomes a
 * `maxSize x maxSize` square (aspect ratio preserved, content anchored at the
 * top-left), resize to the model resolution, scale values to [0, 1], move the
 * channel axis first and add a batch axis.
 *
 * @param source - Image whose pixels are fed to the model.
 * @param modelWidth - Width of the model input in pixels.
 * @param modelHeight - Height of the model input in pixels.
 * @returns `[input, xRatio, yRatio]` where `xRatio = maxSize / width` and
 *   `yRatio = maxSize / height` of the source image. Multiplying a coordinate
 *   of the padded square by the matching ratio maps it back onto the unpadded
 *   image stretched over the same extent.
 */
const preprocess = (source: HTMLImageElement, modelWidth: number, modelHeight: number) => {
  // Assigned inside the tidy callback, which runs synchronously.
  let xRatio = 1;
  let yRatio = 1;

  const input = tf.tidy(() => {
    const img = tf.browser.fromPixels(source);

    // fromPixels yields [height, width, channels].
    const [h, w] = img.shape.slice(0, 2);
    const maxSize = Math.max(w, h);

    // Pad to a square: [n, m] -> [n, n] with n > m; padding is added only at
    // the bottom and the right so the origin stays at the top-left corner.
    const imgPadded = img.pad([
      [0, maxSize - h], // padding y [bottom only]
      [0, maxSize - w], // padding x [right only]
      [0, 0],
    ]);

    xRatio = maxSize / w;
    yRatio = maxSize / h;

    return tf.image
      // resizeBilinear expects the size as [newHeight, newWidth].
      .resizeBilinear(imgPadded, [modelHeight, modelWidth])
      .div(255.0) // normalize to [0, 1]
      // [H, W, C] -> [C, H, W]; presumably the exported model expects a
      // channels-first (NCHW) input -- confirm against the model signature.
      .transpose([2, 0, 1])
      .expandDims(0); // add batch axis
  });

  return [input, xRatio, yRatio] as const;
};
// End-to-end inference: load the graph model, preprocess the <img id="image">
// element, run the model, post-process the raw output and draw the boxes.
const MODEL_URL = '/yolov7_web_model/model.json';
const model = await tf.loadGraphModel(MODEL_URL);
const model_dim = [640, 640]; // [width, height] of the model input

// getElementById may return null or a non-image element; fail early with a
// clear message instead of passing it on to the pixel reader.
const myImage = document.getElementById('image');
if (!(myImage instanceof HTMLImageElement)) {
  throw new Error("Expected an <img> element with id 'image'");
}

const [input, xRatio, yRatio] = preprocess(myImage, model_dim[0], model_dim[1]);
const execution = model.execute(input);
const result = execution.arraySync()[0]; // first (only) batch entry as nested arrays

// The values now live in plain JS arrays; release the tensors explicitly,
// since they are not reclaimed by garbage collection on their own.
tf.dispose([input, execution]);

const detections = non_max_suppression(result);
// Each detection row is read as [box0, box1, box2, box3, score, class].
const boxes = shortenedCol(detections, [0,1,2,3]);
const scores = shortenedCol(detections, [4]);
const class_detect = shortenedCol(detections, [5]);
renderBoxes(canvasRef.value!, 0.2, boxes, scores, class_detect, [xRatio, yRatio]);
I've tried using things like xRatio and yRatio, but I don't understand how I would scale not just the ratios but also the size. How do I scale this so that it can be rendered for the original 1356x904 image?
Canvas scaling solution:
You have to reverse the initial letterboxing that is done in order to achieve a square image for the model, and then scale that to the image dimensions.
// Initial scale for letterbox image -> model.
// ratios = [xRatio, yRatio] = [maxSize / width, maxSize / height]; only the
// padded axis has a ratio > 1, so apply each ratio to its own axis. (Scaling
// y by the larger ratio is only right for landscape images.)
ctx.scale(ratios[0], ratios[1]);
// Post-Process scale for model output -> actual dimensions
const horizontalScaleFactor = imageWidth / 640;
const verticalScaleFactor = imageHeight / 640;
ctx.scale(horizontalScaleFactor, verticalScaleFactor);
/**
 * Resizes the canvas to the source image and draws one labelled rectangle per
 * detection, mapping box coordinates from model space to image pixels.
 *
 * The mapping is done with the canvas transform in two steps:
 *  1. undo the letterbox padding by scaling each axis with its own ratio;
 *  2. scale from the model resolution to the image resolution.
 * Because the whole context is scaled, line width and font size scale as well.
 *
 * @param canvasRef - Object exposing `getContext("2d")` (a canvas element).
 * @param classThreshold - Accepted for interface compatibility; not read here.
 * @param boxes_data - One box per detection, passed to `xywh2xyxy`.
 * @param scores_data - One confidence per detection; shown as a percentage.
 * @param classes_data - One class index per detection, used for label and color.
 * @param ratios - `[xRatio, yRatio]`: how much each axis was stretched when the
 *   image was padded to a square (`maxSize / width`, `maxSize / height`).
 * @param imageWidth - Width of the source image in pixels.
 * @param imageHeight - Height of the source image in pixels.
 * @param modelWidth - Width of the model input; defaults to 640.
 * @param modelHeight - Height of the model input; defaults to 640.
 */
export const renderBoxes = (
canvasRef,
classThreshold,
boxes_data,
scores_data,
classes_data,
ratios,
imageWidth,
imageHeight,
modelWidth = 640,
modelHeight = 640
) => {
  const ctx = canvasRef.getContext("2d");
  ctx.clearRect(0, 0, ctx.canvas.width, ctx.canvas.height); // clean canvas

  // Match the canvas bitmap to the source image. Assigning the canvas size
  // also resets the context state, so the scale() calls below do not
  // accumulate across repeated renders.
  ctx.canvas.width = imageWidth;
  ctx.canvas.height = imageHeight;

  // Initial scale for letterbox image -> model: each axis uses its own ratio,
  // so both landscape (padded bottom) and portrait (padded right) images work.
  ctx.scale(ratios[0], ratios[1]);

  // Post-process scale for model output -> actual dimensions.
  const horizontalScaleFactor = imageWidth / modelWidth;
  const verticalScaleFactor = imageHeight / modelHeight;
  ctx.scale(horizontalScaleFactor, verticalScaleFactor);

  // Font configs (set after the resize, which resets context state).
  const font = `${14}px Arial`;
  ctx.font = font;
  ctx.textBaseline = "top";
  const textHeight = parseInt(font, 10); // base 10
  const labelHeight = textHeight + 2;

  const colors = new Colors();

  for (let i = 0; i < scores_data.length; ++i) {
    const klass = labels[classes_data[i]];
    const color = colors.get(classes_data[i]);
    const score = (scores_data[i] * 100).toFixed(1);
    const caption = klass + " - " + score + "%";

    const [x1, y1, x2, y2] = xywh2xyxy(boxes_data[i]);
    const width = x2 - x1;
    const height = y2 - y1;

    // Draw the bounding box.
    ctx.strokeStyle = Colors.hexToRgba(color, 0.5);
    ctx.lineWidth = 2;
    ctx.strokeRect(x1, y1, width, height);

    // Draw the label background just above the box's top-left corner.
    ctx.fillStyle = color;
    const textWidth = ctx.measureText(caption).width;
    ctx.fillRect(x1 - 1, y1 - labelHeight, textWidth + 2, labelHeight);

    // Draw labels
    ctx.fillStyle = "#ffffff";
    ctx.fillText(caption, x1 - 1, y1 - labelHeight);
  }
};