I am using axios to send a GET request to Google Scholar, and cheerio to extract the data from the response. When a title is too long, it comes back truncated: title: 'Comparison of live-birth defects after luteal-phase ovarian stimulation vs. conventional ovarian stimulation for in vitro fertilization and vitrified embryo transfer�…'.
This is the code:
// Scrape a public proxy list, choose one of the collected entries at random,
// then request a Google Scholar results page and collect title/link/year.
const free_proxy_url1 = "https://free-proxy-list.net";
request(free_proxy_url1, (err, response, body) => {
  const $ = cheerio.load(body);
  const candidates = [];
  const rowSelector = ".table-responsive > div > table > tbody > tr";

  // Keep only the table rows whose 7th column reads "yes".
  $(rowSelector).each((i, el) => {
    const row = $(el);
    const httpsFlag = row.find("td:nth-child(7)").text();
    if (httpsFlag !== "yes") {
      return;
    }
    candidates.push({
      ip: row.find("td:nth-child(1)").text(),
      port: Number(row.find("td:nth-child(2)").text()),
      https: httpsFlag,
    });
  });

  // Random index into the collected rows.
  const rand = Math.floor(Math.random() * candidates.length);
  const chosen = candidates[rand];
  const proxy = {
    host: chosen.ip,
    port: chosen.port,
  };

  const searchTerm = "AI";
  const scholarUrl = `https://scholar.google.com/scholar?q=${searchTerm}`;

  // The proxy object is handed to axios.get as its second argument.
  axios
    .get(scholarUrl, proxy)
    .then(result => {
      const $scholar = cheerio.load(result.data);
      $scholar("div.gs_ri").each((i, el) => {
        const entry = $scholar(el);
        // First four-digit run in the byline is taken as the year.
        const yearMatch = entry.find("div.gs_a").text().match(/\d{4}/);
        const title = entry.find("h3.gs_rt a").text().trim();
        const link = entry.find(".gs_rt a").attr("href");
        let year = null;
        if (yearMatch) {
          year = parseInt(yearMatch[0]);
        }
        scholar_results.push({title, link, year});
      });
    })
    .catch(err => {
      console.log(err);
    });
});
Is there any way of getting the full title and not the truncated one?
This is a bit tricky. The full title doesn't appear to be anywhere in the static HTML, so it seems you'll need to follow the link to the external site for each paper, then try to guess what the title is.
Here's an example:
const axios = require("axios");
const cheerio = require("cheerio"); // 1.0.0-rc.12
// Chrome-style User-Agent string sent as the "User-Agent" header on the axios requests in this example.
const ua =
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36";
/**
 * Reduces a string to lowercase ASCII letters separated by single spaces, so
 * two titles can be compared regardless of case, punctuation, digits or spacing.
 *
 * @param {string} s - Text to normalize.
 * @returns {string} Lowercased text where every run of non-letter characters
 *   has become one space, with leading/trailing spaces removed.
 */
const normalizeText = s =>
  s.toLowerCase()
    .replace(/[^a-z]/g, " ")
    // The `g` flag matters: without it only the first whitespace run is
    // collapsed, and differently spaced titles no longer compare equal.
    .replace(/\s+/g, " ")
    .trim();
/**
 * Best-effort attempt to recover a paper's complete title from its landing page.
 *
 * Downloads `link`, walks every text node in the parsed document, and returns
 * the first non-empty text whose normalized form starts with the normalized
 * (truncated) title.
 *
 * @param {string} title - The possibly truncated title.
 * @param {string} link - URL of the page expected to contain the full title.
 * @returns {Promise<string>} The matching page text, or `title` unchanged when
 *   the request fails, nothing matches, or the title has no comparable letters.
 */
const tryFindingFullTitle = async (title, link) => {
  const normalizedTitle = normalizeText(title);

  // An empty prefix would match every text node on the page, so the "first
  // match" would be arbitrary; keep the original title instead.
  if (!normalizedTitle) {
    return title;
  }

  try {
    const {data} = await axios.get(link, {headers: {"User-Agent": ua}});
    const $ = cheerio.load(data);
    const fullTitle = [...$("*")]
      .flatMap(e =>
        [...$(e).contents()].filter(node => node.type === "text")
      )
      .map(node => $(node).text().trim())
      .filter(Boolean)
      .find(text => normalizeText(text).startsWith(normalizedTitle));

    // `find` yields undefined when no text matches; fall back to the given
    // title so callers always receive a string, as in the error path below.
    return fullTitle === undefined ? title : fullTitle;
  }
  catch (err) {
    // Deliberate best-effort: any request or parsing failure simply keeps the
    // title that was passed in.
    return title;
  }
};
/**
 * Fetches one Google Scholar results page for `searchTerm` and extracts the
 * title, link and year of each result.
 *
 * Titles containing an ellipsis ("…") are passed to `tryFindingFullTitle`,
 * one at a time, to try to recover the full text from the linked page.
 *
 * @param {string} searchTerm - Free-text query.
 * @returns {Promise<Array<{title: string, link: (string|undefined), year: (number|null)}>>}
 *   One entry per `div.gs_ri` element; `year` is null when no four-digit
 *   number appears in the result's `div.gs_a` text.
 */
const runSearch = async searchTerm => {
  // encodeURIComponent (not encodeURI) so characters such as "&", "=", "#"
  // and "+" inside the query are escaped instead of altering the URL.
  const url = `https://scholar.google.com/scholar?q=${encodeURIComponent(searchTerm)}`;
  const {data} = await axios.get(url, {headers: {"User-Agent": ua}});
  const $ = cheerio.load(data);
  const result = [];

  for (const el of [...$("div.gs_ri")]) {
    const link = $(el).find(".gs_rt a").attr("href");
    // First four-digit run in the byline is taken as the year.
    const year = $(el).find("div.gs_a").text().match(/\d{4}/);
    const title = $(el).find("h3.gs_rt a").text().trim();

    result.push({
      title: title.includes("…") ?
        await tryFindingFullTitle(title, link) : title,
      link,
      year: year ? parseInt(year[0], 10) : null,
    });
  }

  return result;
};
// Demo run: print the collected results, or only the error message on failure.
runSearch("AI")
  .then(papers => {
    console.log(papers);
  })
  .catch(failure => {
    console.error(failure.message);
  });
The bulk of the work is done by tryFindingFullTitle, which takes an abbreviated title and a URL, navigates to the URL, and tries to find the first text node whose content has the abbreviated title as a prefix. Another approach might be to pick the text with the smallest Levenshtein distance to the abbreviated title.
This works OK on the few ScienceDirect listings in the sample test, but it will probably fail in other cases, so consider it a proof of concept. I'm not familiar with this site, so it's entirely possible that the full title is available in a predictable format on another Google Scholar page.
By the way, your proxy doesn't appear to be passed correctly to the axios.get call. I think it should be something like:
// The proxy settings go under the `proxy` key of the config object, which is axios.get's second argument.
axios.get(url, {proxy: {host: "xxxx", port: 80}})