I want to get the URL from the src attribute of an img tag. This is the part of the HTML with the img tag and its src attribute:
<img alt="" class="responsive-img" src="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=300&quality=85&auto=format&fit=max&s=787aa5ddd44a8a66a06120452e228503">
Below are the HTML and the script I tried:
This is the HTML:
<div class="fc-item__container">
<div class="fc-item__media-wrapper">
<div class="fc-item__image-container u-responsive-ratio">
<picture><!--[if IE 9]><video style="display: none;"><![endif]-->
<source
media="(min-width: 980px) and (-webkit-min-device-pixel-ratio: 1.25), (min-width: 980px) and (min-resolution: 120dpi)"
srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&quality=45&auto=format&fit=max&dpr=2&s=35c7a9a7cc4e5ebd8fcdfcb67177a8f4 280w">
<source media="(min-width: 980px)"
srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&quality=85&auto=format&fit=max&s=f68f029ce1b60ed96581f28a29062e3b 140w">
<source
media="(min-width: 740px) and (-webkit-min-device-pixel-ratio: 1.25), (min-width: 740px) and (min-resolution: 120dpi)"
srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&quality=45&auto=format&fit=max&dpr=2&s=35c7a9a7cc4e5ebd8fcdfcb67177a8f4 280w">
<source media="(min-width: 740px)"
srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&quality=85&auto=format&fit=max&s=f68f029ce1b60ed96581f28a29062e3b 140w">
<!--[if IE 9]></video><![endif]-->
<img alt="" class="responsive-img"
src="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=300&quality=85&auto=format&fit=max&s=787aa5ddd44a8a66a06120452e228503">
</picture>
</div>
</div>
</div>
This is the Puppeteer script:
// Loads a page with Puppeteer (JavaScript disabled, all requests other than
// the page URL aborted), collects data from every ".fc-item__container"
// element, logs it and writes it to img_src.json.
const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.4.1
const url = "https://www.theguardian.com/international";
// Declared outside the async function so the .finally() handler below can close it.
let browser;
(async () => {
browser = await puppeteer.launch();
// Use the first page returned by browser.pages().
const [page] = await browser.pages();
await page.setJavaScriptEnabled(false);
await page.setRequestInterception(true);
// Let only the request whose URL equals `url` through; abort every other request.
page.on("request", req => {
if (req.url() !== url) {
req.abort();
}
else {
req.continue();
}
});
await page.goto(url, { waitUntil: "domcontentloaded" });
// The callback passed to $$eval runs in the page and receives the elements
// matching ".fc-item__container"; it returns one plain object per element.
const img_src = await page.$$eval(".fc-item__container", els =>
els.map(e => {
// Trimmed textContent of the first descendant matching selector `s`,
// or undefined when nothing matches.
const text = s => e.querySelector(s)?.textContent.trim();
return {
// querySelector returns an Element or null. The trailing " src" in this
// selector is a descendant type selector (an element named <src>), not an
// attribute lookup.
src: e.querySelector(".fc-item__media-wrapper .responsive-img src"),
// Text content of the element with class "responsive-img" (an <img> in the
// HTML shown above, which has no text content).
image: text(".fc-item__media-wrapper .responsive-img"),
};
})
);
console.log(img_src);
// Persist the result as pretty-printed JSON (2-space indent).
await fs.writeFile("img_src.json", JSON.stringify(img_src, null, 2));
})()
.catch(err => console.error(err))
.finally(() => browser?.close());
The script runs, but all I get are null values and empty strings, like this:
[
{
"src": null,
"image": ""
},
{
"src": null,
"image": ""
}
]
As you can see, I tried two variations, but neither gives any result.
return {
src: e.querySelector(".fc-item__media-wrapper .responsive-img src"),
image: text(".fc-item__media-wrapper .responsive-img"),
};
Any help is much appreciated.
First of all, great job on blocking requests and disabling JS! This speeds up the script considerably and means we can rely purely on the static HTML (what the browser shows under view-source:), which simplifies matters.
A problem is:
e.querySelector(".fc-item__media-wrapper .responsive-img src"),
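This says "return the <src> element within an element with class="responsive-img" within an element with class="fc-item__media-wrapper"". There is no <src> element, though: src is an attribute of the <img> element, so the selector matches nothing and querySelector returns null. You probably mean: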
e.querySelector(".fc-item__media-wrapper .responsive-img")
?.getAttribute("src")
As for the "text", I'm not sure what that refers to, since there's no text anywhere inside the element with class .fc-item__media-wrapper.
If you're looking for the kicker text or headline, here's one approach:
const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.6.3

const url = "<Your URL>";

// Kept at module scope so the cleanup handler at the bottom can reach it
// even when launching or scraping throws.
let browser;

/**
 * Opens `url` with JavaScript disabled and every request other than the page
 * itself aborted, extracts the image src, kicker and headline from each
 * ".fc-item__container" element, logs the result and writes it to
 * img_src.json.
 */
(async function run() {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);

  // Only the document request for `url` is allowed through.
  page.on("request", request => {
    if (request.url() === url) {
      request.continue();
    }
    else {
      request.abort();
    }
  });

  await page.goto(url, {waitUntil: "domcontentloaded"});

  // This callback is evaluated inside the page, so it must be self-contained.
  const data = await page.$$eval(".fc-item__container", containers => {
    // Trimmed text of an element, or undefined when the element is missing.
    const trimmedText = element => element?.textContent.trim();

    return containers.map(container => {
      const image = container.querySelector(
        ".fc-item__media-wrapper .responsive-img"
      );
      const kickerElement = container.querySelector(".fc-item__kicker");
      const headlineElement = container.querySelector(".fc-item__headline");

      return {
        src: image?.getAttribute("src"),
        kicker: trimmedText(kickerElement),
        headline: trimmedText(headlineElement),
      };
    });
  });

  console.log(data);

  const serialized = JSON.stringify(data, null, 2);
  await fs.writeFile("img_src.json", serialized);
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close());
By the way, once you've gotten to the point where you're blocking all requests and have disabled JS, you can often just use fetch (or axios, if you're not on Node 18 yet) with Cheerio. This simplifies matters and further speeds things up:
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "<Your URL>";

/**
 * Fetches `url`, parses the returned HTML with Cheerio and logs one
 * {src, kicker, headline} record per ".fc-item__container" element.
 * Any failure (network error, non-OK status, parse error) is reported
 * via console.error.
 */
(async () => {
  try {
    const res = await fetch(url);

    // Treat a non-2xx response as a failure instead of parsing an error page.
    if (!res.ok) {
      throw new Error(res.statusText);
    }

    const html = await res.text();
    const $ = cheerio.load(html);

    const data = Array.from($(".fc-item__container"), container => {
      const $container = $(container);

      return {
        src: $container
          .find(".fc-item__media-wrapper .responsive-img")
          .attr("src"),
        kicker: $container.find(".fc-item__kicker").text().trim(),
        headline: $container.find(".fc-item__headline").text().trim(),
      };
    });

    console.log(data);
  } catch (err) {
    console.error(err);
  }
})();
See also this related question from OP.
Disclosure: I'm the author of the linked blog post.