Tags: google-cloud-platform, microservices, pdf-conversion

Send file from GCP bucket to 3rd party pdf converter


I am trying to adapt a Qwiklabs tutorial to use pdfCrowd, rather than LibreOffice.

The service works by downloading a file from one GCP storage bucket for 'uploads', processing it, then uploading it to another bucket for 'processed' files, then deleting the original from the 'uploads' bucket.

This is the handler that downloads the file, sends it off for conversion, uploads the result, then deletes the original. This is the code from the Qwiklabs tutorial and it works great.

// POST handler: decodes the base64 JSON payload found in
// req.body.message.data, downloads the file it names, converts it to PDF,
// uploads the PDF to the bucket named by PDF_BUCKET and finally deletes the
// original object. Any failure is logged and the request is still answered
// with the plain-text OK body.
app.post('/', async (req, res) => {
  try {
    const upload = decodeBase64Json(req.body.message.data);
    const sourceBucket = upload.bucket;
    const sourceName = upload.name;

    await downloadFile(sourceBucket, sourceName);
    const convertedName = await convertFile(sourceName);
    await uploadFile(process.env.PDF_BUCKET, convertedName);
    await deleteFile(sourceBucket, sourceName);
  } catch (ex) {
    console.log(`Error: ${ex}`);
  }

  res.set('Content-Type', 'text/plain').send('\n\nOK\n\n');
});

The original convertFile function is:

    /**
     * Converts /tmp/<fileName> to PDF by running headless LibreOffice, which
     * writes its output into /tmp.
     *
     * @param {string} fileName - Name of the input file inside /tmp.
     * @returns {Promise<string>} fileName with its final extension replaced
     *   by ".pdf".
     * @throws The stderr text of the command (thrown as-is) when it is
     *   non-empty, or whatever exec rejects with.
     */
    async function convertFile(fileName) {
      // NOTE(review): fileName is interpolated into a shell command line. If
      // object names can come from untrusted uploaders, validate them or
      // switch to an argument-array API (e.g. execFile) - confirm with callers.
      const cmd = 'libreoffice --headless --convert-to pdf --outdir /tmp ' +
                  `"/tmp/${fileName}"`;
      console.log(cmd);
      const { stdout, stderr } = await exec(cmd);
      if (stderr) {
        throw stderr;
      }
      console.log(stdout);
      // Keep this binding local: an undeclared assignment would leak a global
      // shared by concurrent requests and throws in strict mode / ES modules.
      const pdfFileName = fileName.replace(/\.\w+$/, '.pdf');
      return pdfFileName;
    }

The problem comes when I change my convertFile function. LibreOffice accepts file.name, but pdfCrowd wants the file path.

So I change the function with pdfCrowd to:

/**
 * Starts a Pdfcrowd conversion of /tmp/<fileName> into /tmp/<name>.pdf and
 * returns the PDF file name (without the /tmp/ prefix).
 *
 * NOTE(review): convertFileToFile reports completion through its callback,
 * and nothing here waits for that callback. The function therefore returns
 * as soon as the conversion has been started, so a caller that awaits it can
 * reach the output path before the file has been written.
 *
 * @param {string} fileName - Name of the input file inside /tmp.
 * @returns {Promise<string>} fileName with its final extension replaced by
 *   ".pdf".
 */
async function convertFile(fileName) {
    // create the API client instance
    const _newPdfPath = `/tmp/${fileName.replace(/\.\w+$/, '.pdf')}`
    const client = new pdfcrowd.HtmlToPdfClient("demo", "ce544b6ea52a5621fb9d55f8b542d14d");
    // run the conversion and write the result to a file
    // NOTE(review): the callback's `fileName` parameter shadows the outer
    // `fileName`; a conversion error is only logged here and never reaches
    // the caller.
    client.convertFileToFile(`/tmp/${fileName}`, _newPdfPath, function (err, fileName) {
        if (err)
            return console.error("Pdfcrowd Error: " + err);
        console.log("Success: the file was created " + fileName);
    });
    // NOTE(review): `pdfFileName` is assigned without const/let, which
    // creates an implicit global (a ReferenceError in strict mode).
    pdfFileName = fileName.replace(/\.\w+$/, '.pdf');
    return pdfFileName;
}

Now the PDF conversion reports success, but only after an error saying that there is no such file or directory at the 'out' file path I passed to convertFileToFile. At the point where that error is raised, the file specified by _newPdfPath doesn't exist yet.

Error: Error: ENOENT: no such file or directory, stat '/tmp/mynew.pdf'
Success: the file was created /tmp/hello (31).pdf

The pdfCrowd function should be creating a file in the tmp directory, but is the async waiting for the file to be created in the tmp directory?

My complete code is:

const {promisify} = require('util');
const {Storage}   = require('@google-cloud/storage');
const exec        = promisify(require('child_process').exec);
const storage     = new Storage();
const express     = require('express');
const bodyParser  = require('body-parser');
const app         = express();

const pdfcrowd = require("pdfcrowd");

// Parse JSON request bodies so handlers can read req.body.
app.use(bodyParser.json());

// Listen on the port given by the PORT environment variable, or 8080.
const port = process.env.PORT || 8080;
app.listen(port, function onListening() {
  console.log('Listening on port', port);
});

// POST handler: decodes the base64 JSON payload found in
// req.body.message.data, downloads the file it names, converts it to PDF,
// uploads the PDF to the bucket named by PDF_BUCKET and finally deletes the
// original object. Any failure is logged and the request is still answered
// with the plain-text OK body.
app.post('/', async (req, res) => {
  try {
    const upload = decodeBase64Json(req.body.message.data);
    const sourceBucket = upload.bucket;
    const sourceName = upload.name;

    await downloadFile(sourceBucket, sourceName);
    const convertedName = await convertFile(sourceName);
    await uploadFile(process.env.PDF_BUCKET, convertedName);
    await deleteFile(sourceBucket, sourceName);
  } catch (ex) {
    console.log(`Error: ${ex}`);
  }

  res.set('Content-Type', 'text/plain').send('\n\nOK\n\n');
});

/**
 * Decodes a base64 string and parses the decoded text as JSON.
 *
 * @param {string} data - Base64-encoded JSON text.
 * @returns {*} The parsed JSON value.
 */
function decodeBase64Json(data) {
  const jsonText = Buffer.from(data, 'base64').toString();
  return JSON.parse(jsonText);
}

/**
 * Downloads an object from a storage bucket to /tmp/<fileName>.
 *
 * @param {string} bucketName - Bucket that holds the object.
 * @param {string} fileName - Object name; also used as the local file name.
 * @returns {Promise<void>}
 */
async function downloadFile(bucketName, fileName) {
  const localPath = `/tmp/${fileName}`;
  const remoteFile = storage.bucket(bucketName).file(fileName);
  await remoteFile.download({ destination: localPath });
}

/**
 * Starts a Pdfcrowd conversion of /tmp/<fileName> into /tmp/<name>.pdf and
 * returns the PDF file name (without the /tmp/ prefix).
 *
 * NOTE(review): convertFileToFile reports completion through its callback,
 * and nothing here waits for that callback. The function therefore returns
 * as soon as the conversion has been started, so a caller that awaits it can
 * reach the output path before the file has been written.
 *
 * @param {string} fileName - Name of the input file inside /tmp.
 * @returns {Promise<string>} fileName with its final extension replaced by
 *   ".pdf".
 */
async function convertFile(fileName) {
    // create the API client instance
    const _newPdfPath = `/tmp/${fileName.replace(/\.\w+$/, '.pdf')}`
    const client = new pdfcrowd.HtmlToPdfClient("demo", "ce544b6ea52a5621fb9d55f8b542d14d");
    // run the conversion and write the result to a file
    // NOTE(review): the callback's `fileName` parameter shadows the outer
    // `fileName`; a conversion error is only logged here and never reaches
    // the caller.
    client.convertFileToFile(`/tmp/${fileName}`, _newPdfPath, function (err, fileName) {
        if (err)
            return console.error("Pdfcrowd Error: " + err);
        console.log("Success: the file was created " + fileName);
    });
    // NOTE(review): `pdfFileName` is assigned without const/let, which
    // creates an implicit global (a ReferenceError in strict mode).
    pdfFileName = fileName.replace(/\.\w+$/, '.pdf');
    return pdfFileName;
}

/**
 * Deletes an object from a storage bucket.
 *
 * @param {string} bucketName - Bucket that holds the object.
 * @param {string} fileName - Name of the object to delete.
 * @returns {Promise<void>}
 */
async function deleteFile(bucketName, fileName) {
  const remoteFile = storage.bucket(bucketName).file(fileName);
  await remoteFile.delete();
}

/**
 * Uploads the local file /tmp/<fileName> to a storage bucket.
 *
 * @param {string} bucketName - Destination bucket.
 * @param {string} fileName - Name of the file inside /tmp.
 * @returns {Promise<void>}
 */
async function uploadFile(bucketName, fileName) {
  const targetBucket = storage.bucket(bucketName);
  const localPath = `/tmp/${fileName}`;
  await targetBucket.upload(localPath);
}

Solution

  • The problem is that your convertFile function finishes before the convertFileToFile callback is invoked.

    I'd pass callbacks for success and failure into convertFile, e.g.

    // POST handler: decodes the base64 JSON payload in req.body.message.data,
    // downloads the named file, then hands it to convertFile together with
    // success/failure callbacks. On success the PDF is uploaded to PDF_BUCKET
    // and the original object is deleted. Every path answers the request:
    // the OK body on success, the ERROR body on any failure.
    app.post('/', async (req, res) => {
      // Single place that writes the plain-text response.
      const reply = (text) => {
        res.set('Content-Type', 'text/plain');
        res.send(text);
      };

      try {
        const file = decodeBase64Json(req.body.message.data);
        await downloadFile(file.bucket, file.name);

        const on_pdf_done = async function(pdfFileName) {
          // This runs later from convertFile's callback, outside the outer
          // try/catch, so failures must be caught here; otherwise they become
          // unhandled rejections and the request is never answered.
          let outcome = '\n\nOK\n\n';
          try {
            await uploadFile(process.env.PDF_BUCKET, pdfFileName);
            await deleteFile(file.bucket, file.name);
          } catch (ex) {
            console.log(`Error: ${ex}`);
            outcome = '\n\nERROR\n\n';
          }
          reply(outcome);
        };

        // err is optional so this works whether or not convertFile passes one.
        const on_pdf_fail = function(err) {
          console.log(`Error: ${err || 'PDF conversion failed'}`);
          reply('\n\nERROR\n\n');
        };

        convertFile(file.name, on_pdf_done, on_pdf_fail);
      }
      catch (ex) {
        // Decode/download failures (or a synchronous throw from convertFile)
        // must still produce a response, or the request hangs.
        console.log(`Error: ${ex}`);
        reply('\n\nERROR\n\n');
      }
    })
    
    /**
     * Converts /tmp/<fileName> to a PDF in /tmp using the Pdfcrowd API and
     * reports the outcome through callbacks.
     *
     * @param {string} fileName - Name of the input file inside /tmp.
     * @param {function(string)} success_callback - Called with the PDF file
     *   name (no /tmp/ prefix) once the file has been written.
     * @param {function(*)} fail_callback - Called with the Pdfcrowd error
     *   when the conversion fails.
     */
    function convertFile(fileName, success_callback, fail_callback) {
        const pdfFileName = fileName.replace(/\.\w+$/, '.pdf');
        // create the API client instance
        const client = new pdfcrowd.HtmlToPdfClient("demo", "ce544b6ea52a5621fb9d55f8b542d14d");
        // run the conversion and write the result to a file
        client.convertFileToFile(`/tmp/${fileName}`, `/tmp/${pdfFileName}`, function (err) {
            if (err)
                return fail_callback(err);
            // Report the bare file name rather than the /tmp/... output path
            // that Pdfcrowd passes to this callback: uploadFile prepends
            // /tmp/ itself, so a full path would resolve to /tmp//tmp/....
            success_callback(pdfFileName);
        });
    }