javascript node.js pdf-extraction

Node.js: Download file to memory and pass to library function without saving to disk


The code below works fine, but I received some code-review feedback: "Why download and save the file to disk, only to read it back into memory?"

However, after spending some hours exploring options with Buffer and stream, I just don't seem to be getting anywhere.

const fs = require('fs');
const { PdfData } = require('pdfdataextract');
const axios = require('axios').default;

/**
 * Downloads the PDF at `url` into `<cwd>/my_dir/document.pdf`, reads that file
 * back into memory and hands the bytes to `PdfData.extract`.
 *
 * @param {{ url: string }} params - Object holding the URL passed to axios.
 * @returns {Promise<*>} Whatever `PdfData.extract` resolves with.
 * @throws Rejects if the HTTP request, either stream, the file read or the
 *   extraction fails.
 */
const getPDFText = async ({ url }) => {
  const pdfPath = `${process.cwd()}/my_dir/document.pdf`;
  const response = await axios({
    url,
    method: 'get',
    responseType: 'stream'
  });

  await new Promise((resolve, reject) => {
    // Create the write stream only once the response is available and attach
    // its listeners immediately, so an early open failure (e.g. the directory
    // is missing) rejects instead of firing an unhandled 'error' event.
    const writer = fs.createWriteStream(pdfPath);
    writer.on('finish', resolve);
    writer.on('error', reject);

    // pipe() does not forward source errors to the destination; without this
    // listener a broken download would leave the promise pending forever.
    response.data.on('error', (err) => {
      writer.destroy();
      reject(err);
    });

    response.data.pipe(writer);
  });

  // Read outside of the event listener so a failing read rejects the returned
  // promise rather than surfacing as an uncaught exception.
  const fileData = fs.readFileSync(pdfPath);

  return PdfData.extract(fileData, {
    get: {
       // ...
    },
  });
};

How can I avoid saving the file to disk and instead feed the downloaded data directly into the PdfData.extract method?


Solution

  • The signature for .extract says it accepts a Uint8Array.

    Something like

    const {PdfData} = require('pdfdataextract');
    const axios = require('axios').default;
    
    /**
     * Fetches the document at `url` as an array buffer, wraps the payload in a
     * Uint8Array and passes it to `PdfData.extract` without touching the disk.
     * The extraction result is logged and then returned.
     *
     * @param {{url: string}} params - Object holding the URL passed to axios.
     * @returns {Promise<*>} Whatever `PdfData.extract` resolves with.
     */
    async function getPDFText({url}) {
      const requestConfig = {
        url,
        method: 'get',
        responseType: 'arraybuffer',
      };
      const {data: rawBytes} = await axios(requestConfig);

      const extracted = await PdfData.extract(new Uint8Array(rawBytes), /* ... */);
      console.log(extracted);
      return extracted;
    }
    

    could do the trick?