node.js  google-cloud-functions  pdf.js  pdfjs-dist

PDFJS Firebase Cloud Functions: certificate has expired


I am extracting plain text from PDF documents using PDF.js (pdfjs-dist), and the extraction is scheduled using Firebase Cloud Functions. Everything worked until one day I started getting a "certificate has expired" error for certain PDFs, mostly from two domains.

I have checked that the affected domains still have valid SSL certificates, and running the plain-text extraction code on my local machine works without any problem. Once it is deployed to Firebase Cloud Functions, however, it throws the "certificate has expired" error.

Error
    at BaseExceptionClosure (/srv/node_modules/pdfjs-dist/build/pdf.js:666:29)
    at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:669:2)
    at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
    at Object.defineProperty.value (/srv/node_modules/pdfjs-dist/build/pdf.js:129:23)
    at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
    at pdfjsVersion (/srv/node_modules/pdfjs-dist/build/pdf.js:116:18)
    at /srv/node_modules/pdfjs-dist/build/pdf.js:119:10
    at webpackUniversalModuleDefinition (/srv/node_modules/pdfjs-dist/build/pdf.js:25:20)
    at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:32:3)
    at Module._compile (module.js:653:30)
    at Object.Module._extensions..js (module.js:664:10)
    at Module.load (module.js:566:32)
    at tryModuleLoad (module.js:506:12)
    at Function.Module._load (module.js:498:3)
    at Module.require (module.js:597:17)
    at require (internal/module.js:11:18)
    at Object.<anonymous> (/srv/pdf/pdf.js:7:18)
    at Module._compile (module.js:653:30)
    at Object.Module._extensions..js (module.js:664:10)
    at Module.load (module.js:566:32)
    at tryModuleLoad (module.js:506:12)
    at Function.Module._load (module.js:498:3)
  message: 'certificate has expired',
  name: 'UnknownErrorException',
  details: 'UnknownErrorException: certificate has expired' }" 

Code:

const pdfjslib = require('pdfjs-dist');
const functions = require('firebase-functions');

module.exports = functions.https.onRequest((req, res) => {
    let url = req.query.url

    return extractPlainTextFromPdf(url)
    .then(pb => {
        return res.send(pb)
    })
    .catch(err => {
        console.log(err)
        return res.send("Err occured")
    })
});

/**
 * Extracts the plain text of the PDF at `pdfUrl`.
 *
 * @param {string} pdfUrl - Location of the PDF document.
 * @returns {Promise<*>} Resolves with whatever getPlainBody resolves with.
 *   Rejects with the original error when loading or extraction fails.
 */
function extractPlainTextFromPdf(pdfUrl) {
    const options = setupPdfOptions(pdfUrl);
    return getPlainBody(options).catch((err) => {
        console.log("Err plainBody", err);
        // Re-throw so the caller sees the failure; swallowing it here would
        // make the promise resolve with undefined as if extraction succeeded.
        throw err;
    });
}

/**
 * Loads the document described by `options` and hands it, together with its
 * page count, to extractTexts (defined outside this snippet).
 *
 * @param {object} options - Options forwarded unchanged to getDocument.
 * @returns {Promise<*>} Resolves with the result of extractTexts.
 */
async function getPlainBody(options) {
    const pdfDocument = await getDocument(options);
    return extractTexts(pdfDocument, pdfDocument.numPages);
}

/**
 * Starts a pdfjs loading task for `options` and exposes its completion.
 *
 * @param {object} options - Passed straight to `pdfjslib.getDocument`.
 * @returns {Promise<*>} Resolves with the value the loading task's promise
 *   resolves with.
 */
function getDocument(options) {
    const { promise: documentReady } = pdfjslib.getDocument(options);
    // Pass-through step: forwards the loaded document unchanged.
    return documentReady.then((loadedDocument) => loadedDocument);
}

/**
 * Builds the options object used to load a PDF from `url`.
 *
 * @param {string} url - Location of the PDF document.
 * @returns {{url: string, httpHeaders: Object<string, string>}} The URL plus
 *   a fixed "User-Agent" request header.
 */
function setupPdfOptions(url) {
    const httpHeaders = { "User-Agent": "MY-USER-AGENT" };
    return { url, httpHeaders };
}

Here are two sample PDFs that trigger the above issue.

https://www.nea.gov.sg/docs/default-source/our-services/building-planning/notification-of-new-edition-of-code-of-practice-on-environment-health-(2020-edition).pdf

https://www.nparks.gov.sg/-/media/nparks-real-content/partner-us/developers-architects-and-engineers/circular_2020_0106_nparks.pdf?la=en&hash=F25A74CC8667D5D98EDF3A9C186E235330D228A8

EDIT:

//package.json
{
  "name": "functions",
  "description": "Cloud Functions for Firebase",
  "scripts": {
    "serve": "firebase serve --only functions",
    "shell": "firebase functions:shell",
    "start": "npm run shell",
    "deploy": "firebase deploy --only functions",
    "logs": "firebase functions:log"
  },
  "engines": {
    "node": "8"
  },
  "dependencies": {
    "@google-cloud/functions-framework": "^1.5.1",
    "@google-cloud/vision": "^1.11.0",
    "aws-sdk": "^2.667.0",
    "axios": "^0.19.2",
    "cheerio": "^1.0.0-rc.3",
    "diff-match-patch": "^1.0.4",
    "firebase-admin": "^8.11.0",
    "firebase-functions": "^3.6.1",
    "moment": "^2.25.0",
    "nodemailer": "^6.4.6",
    "pdfjs-dist": "^2.3.200",
    "request": "^2.88.2",
    "request-promise": "^4.2.5"
  },
  "devDependencies": {
    "firebase-functions-test": "^0.1.6"
  },
  "private": true
}

Solution

  • Node.js 8 on Cloud Functions is deprecated. I think some components of the Node 8 runtime, such as OpenSSL, are obsolete and cause odd SSL issues; I have seen the same behavior on some vintage Linux distros (Ubuntu 10.04). Change "engines": { "node": "8" } in package.json to "10" and redeploy.

    "The Node.js 8 runtime will be deprecated on 2020-06-05. To ensure that your functions are on a supported version of Node.js, migrate them to Node.js 10."