I have a JSON database containing many links, and I could not find a library that crawls it to tell valid links from broken ones. I want to test every URL it contains in order to find the broken links.
Property example:
{
"Organisation": "British Association for Sexual Health and HIV",
"Abréviation": "BASHH",
"Spécialité": "infectiologie",
"Type": "societe savante",
"Actualités": "https://www.bashh.org/news/news/",
"RSS": "No",
"Publications ouvertes": "https://www.bashh.org/guidelines",
"Publications RSS": "No",
"Social": "https://x.com/BASHH_UK"
},
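I ended up writing a script with the help of Codestral. It works on any JSON layout.
You need Node.js 16+ and the `axios` package (`npm install axios`). Because the script uses ES module `import` syntax, save it with a `.mjs` extension or set `"type": "module"` in your `package.json`.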
import axios from 'axios'
import fs from 'fs/promises'
// Decide whether a string is a link
/**
 * Tells whether a value is an absolute http(s) URL that can be requested.
 *
 * Parsing is delegated to the built-in WHATWG `URL` class rather than a
 * hand-written regular expression:
 *  - every character that is legal in a path, query or fragment is accepted
 *    (a restrictive pattern silently skips valid links);
 *  - the scheme is mandatory, so bare tokens such as `readme.md` are not
 *    mistaken for links and then reported as broken;
 *  - there is no nested-quantifier pattern that could backtrack
 *    exponentially on long non-matching strings.
 *
 * @param {unknown} str - Candidate value, typically a JSON property value.
 * @returns {boolean} `true` only for strings that parse as a URL whose
 *   scheme is `http:` or `https:`; `false` for anything else.
 */
function isUrl(str) {
  if (typeof str !== 'string') {
    return false;
  }

  let parsed;
  try {
    // The constructor throws a TypeError on anything that is not an absolute URL.
    parsed = new URL(str);
  } catch {
    return false;
  }

  // Only web links can be fetched over HTTP; this excludes mailto:, ftp:, file:, etc.
  return parsed.protocol === 'http:' || parsed.protocol === 'https:';
}
// Test links
/**
 * Sends an HTTP GET to a URL and reports whether it answered successfully.
 *
 * Any failure raised by `axios.get` (network error, timeout, or a status
 * that axios rejects — by default anything outside 2xx) is logged to stderr
 * and turned into a `false` result, so this function never rejects.
 *
 * @param {string} url - Absolute URL to check.
 * @param {number} [timeoutMs=15000] - Milliseconds to wait before giving up.
 *   Without a limit, a single unresponsive server can stall the whole run.
 * @returns {Promise<boolean>} `true` if the request succeeded, `false` otherwise.
 */
async function testUrl(url, timeoutMs = 15000) {
  try {
    await axios.get(url, { timeout: timeoutMs });
    return true;
  } catch (error) {
    console.error(`${url} is down. Erreur : ${error.message}`);
    return false;
  }
}
// Look for links in JSON
/**
 * Walks a parsed JSON value, finds every string that `isUrl` accepts and
 * checks each distinct link once with `testUrl`.
 *
 * Links are collected first and tested afterwards, which gives two benefits:
 * a URL that appears several times in the data is only requested once, and
 * the caller gets a promise it can await instead of fire-and-forget requests.
 *
 * @param {unknown} obj - Root of the structure to scan; non-object roots
 *   (including `null` and plain strings) contain nothing to scan.
 * @returns {Promise<Array>} Settles once every check has finished; holds the
 *   value returned by `testUrl` for each distinct URL, in discovery order.
 */
function traverse(obj) {
  // A Set keeps first-seen order while dropping repeated links.
  const urls = new Set();

  const visit = (node) => {
    if (node === null || typeof node !== 'object') {
      return;
    }
    // Object.values only yields own enumerable properties (and array items),
    // unlike for...in, which also walks anything inherited from a prototype.
    for (const value of Object.values(node)) {
      if (typeof value === 'string') {
        if (isUrl(value)) {
          urls.add(value);
        }
      } else {
        visit(value);
      }
    }
  };

  visit(obj);
  return Promise.all([...urls].map((url) => testUrl(url)));
}
// Read JSON and test links
/**
 * Entry point: reads a JSON file, parses it and checks every link it contains.
 *
 * @param {string} [filePath] - JSON file to scan. Defaults to the first
 *   command-line argument when one is given, otherwise to the original
 *   hard-coded location. A relative path is resolved against the current
 *   working directory, not against this script's folder.
 * @returns {Promise<void>} Never rejects: any failure is logged to stderr and
 *   recorded as a non-zero process exit code so that shells and CI notice it.
 */
async function main(filePath = process.argv[2] ?? '../static/data/societes-savantes.json') {
  try {
    const data = await fs.readFile(filePath, 'utf8');
    const jsonData = JSON.parse(data);
    // Await in case traverse returns a promise, so a failure inside it is
    // reported here rather than surfacing as an unhandled rejection.
    await traverse(jsonData);
  } catch (error) {
    console.error(`Erreur lors de la lecture du fichier JSON : ${error.message}`);
    // Let the process finish naturally but signal the failure to the caller.
    process.exitCode = 1;
  }
}
main()