The code below works fine, but I received some code-review feedback: "Why download and save the file to disk, only to read it back into memory?"
However, after spending some hours exploring options with `Buffer` and `stream`, I just don't seem to be getting anywhere.
const fs = require('fs');
const { PdfData } = require('pdfdataextract');
const axios = require('axios').default;
/**
 * Downloads the PDF at `url`, stores it as `my_dir/document.pdf` under the
 * current working directory, then reads it back and runs `PdfData.extract`
 * on its bytes.
 *
 * The `my_dir` directory is not created here, so it must already exist.
 *
 * @param {{ url: string }} params - Object carrying the URL of the PDF.
 * @returns {Promise<*>} Resolves with whatever `PdfData.extract` resolves with.
 *   Rejects if the request, the download stream, the file write, the file
 *   read, or the extraction fails.
 */
const getPDFText = async ({ url }) => {
  const pdfPath = `${process.cwd()}/my_dir/document.pdf`;

  const response = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  });

  // Open the file only after the request succeeded, so a failed request
  // neither truncates an existing file nor leaves a dangling file handle.
  const writer = fs.createWriteStream(pdfPath);

  await new Promise((resolve, reject) => {
    writer.on('finish', resolve);
    writer.on('error', reject);
    // pipe() does not forward source errors to the destination; without this
    // handler an aborted download would leave the promise pending forever.
    response.data.once('error', (err) => {
      writer.destroy();
      reject(err);
    });
    response.data.pipe(writer);
  });

  // Non-blocking read; avoids stalling the event loop on large PDFs.
  const fileData = await fs.promises.readFile(pdfPath);

  return PdfData.extract(fileData, {
    get: {
      // ...
    },
  });
};
How can I avoid saving the file to disk and instead feed it directly into the `PdfData.extract` method?
The signature for `.extract` says it accepts a `Uint8Array`.
Something like
const {PdfData} = require('pdfdataextract');
const axios = require('axios').default;
/**
 * Fetches the PDF at `url` entirely in memory and hands its bytes to
 * `PdfData.extract`, without touching the file system.
 *
 * The extraction result is also written to the console before being returned.
 *
 * @param {{ url: string }} params - Object carrying the URL of the PDF.
 * @returns {Promise<*>} Resolves with whatever `PdfData.extract` resolves with.
 */
async function getPDFText({url}) {
  const requestConfig = {
    url,
    method: 'get',
    // Ask axios for the raw bytes rather than a decoded string or a stream.
    responseType: 'arraybuffer',
  };
  const {data: rawPdf} = await axios(requestConfig);

  // `.extract` is stated to accept a Uint8Array, so wrap the response body.
  const pdfBytes = new Uint8Array(rawPdf);
  const extracted = await PdfData.extract(pdfBytes, /* ... */);

  console.log(extracted);
  return extracted;
}
could do the trick?