javascriptnode.jsgoogle-cloud-vision

How to increase the limit of 20 pages for Google Cloud Vision PDF text detection?


Hi, I am trying to detect text in a 34-page PDF with Google Cloud Vision. I then save the text to Firebase Firestore from the JSON that is generated and saved to my Firebase Storage. All of this runs in a Firebase Cloud Function. When I loop through the pages in the JSON, I only get 20 pages, not all 34. The same thing happens with any PDF over 20 pages. I use the asyncBatchAnnotateFiles function, which is supposed to have a 2,000-page limit. I have searched everywhere for how to increase the 20-page limit but failed to find any relevant info. Please help. Here's my code:

if (contentType === "application/pdf") {

    // Build the GCS URIs: source PDF and a per-file "results" folder for
    // the JSON output that asyncBatchAnnotateFiles will write.
    const orderUri = path.dirname(object.name);
    const fileName = path.basename(object.name);
    const outputPrefix = 'results';
    const gcsSourceUri = `gs://${object.bucket}/${object.name}`;
    const gcsDestinationUri = `gs://${object.bucket}/${orderUri}/${outputPrefix}/${fileName}/`;

    const inputConfig = {
        // Supported mime_types are: 'application/pdf' and 'image/tiff'
        mimeType: 'application/pdf',
        gcsSource: {
            uri: gcsSourceUri,
        },
    };

    const outputConfig = {
        gcsDestination: {
            uri: gcsDestinationUri,
        },
        // FIX: without batchSize, Vision puts at most 20 response protos
        // (pages) into each output JSON file — the documented default —
        // which is why only 20 of 34 pages appeared. Valid range is
        // [1, 100]; 100 keeps PDFs of up to 100 pages in a single file.
        // See https://cloud.google.com/vision/docs/reference/rest/v1/OutputConfig
        batchSize: 100,
    };

    const features = [{ type: 'DOCUMENT_TEXT_DETECTION' }];

    const request = {
        requests: [
            {
                inputConfig,
                features,
                outputConfig,
            },
        ],
    };

    // Start the async OCR operation and wait for it to complete; the
    // JSON results land under gcsDestinationUri.
    const [operation] = await client.asyncBatchAnnotateFiles(request);
    const [filesResponse] = await operation.promise();
}

and here's how I retrieve the JSON from Firebase Storage and loop through it to get the text:

if (path.basename(object.name).startsWith('output') && path.basename(object.name).split('.').pop() === "json") {
        // References to the Vision output JSON that triggered this function.
        const fileBucket = object.bucket; // The Storage bucket that contains the file.
        const filePath = object.name; // File path in the bucket.

        // Download and parse the JSON. download() resolves to a one-element
        // tuple [Buffer]; destructure it instead of relying on the accidental
        // Array.prototype.toString() of the original code.
        const bucket = admin.storage().bucket(fileBucket);
        const [contents] = await bucket.file(filePath).download();
        const jsObject = JSON.parse(contents.toString('utf8'));

        // Full text: `responses` holds one entry per page. Pages with no
        // detected text have no fullTextAnnotation — treat those as ''.
        // NOTE: join() keeps the original comma separator between pages.
        const fullTextReady = jsObject.responses
          .map((response) => response.fullTextAnnotation?.text ?? '')
          .join();

        // Count whitespace-separated words. An empty/blank string is 0 words
        // (''.split(/\s+/) yields [''], which the original counted as 1).
        function countWords(str) {
          const trimmed = str.trim();
          return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
        }
        const words = countWords(fullTextReady);

        // Average per-page OCR confidence as a whole percentage.
        // Math.round avoids the float noise of toFixed(2) * 100
        // ("0.96" * 100 === 96.00000000000001) while keeping the same
        // whole-percent granularity.
        const pageConfidences = jsObject.responses.flatMap((response) =>
          (response.fullTextAnnotation?.pages ?? []).map((page) => page.confidence)
        );
        const sum = pageConfidences.reduce((accumulator, currentValue) => accumulator + currentValue, 0);
        const average = pageConfidences.length > 0 ? sum / pageConfidences.length : 0;
        const textConfidence3 = Math.round(average * 100);

        // Detected languages with confidence. The original rounded the
        // fraction to one decimal before scaling (multiples of 10%); keep
        // that granularity, but compute it without the string round-trip.
        const detectedLanguages = jsObject.responses.flatMap((response) =>
          (response.fullTextAnnotation?.pages ?? []).flatMap(
            (page) => page.property?.detectedLanguages ?? []
          )
        );
        const languages = detectedLanguages.map((language) => ({
          languageCode: language.languageCode,
          languageConfidence: Math.round((language.confidence ?? 0) * 10) * 10,
        }));

        // Derive the Firestore doc path from the object path:
        //   <order>/results/<fileName>/output-*.json  →  Clients/<order>/<fileName>
        const jsonLocation = path.dirname(object.name);
        const fileName = path.basename(jsonLocation);
        const results = path.dirname(jsonLocation);
        const order = path.dirname(results);
        const destination = `${order}/${fileName}`;
        const docRef = db.collection('Clients').doc(destination);
        await docRef.set({
          fullText: fullTextReady,
          textConfidence: textConfidence3,
          type: "application/pdf",
          // NOTE: this counts the page responses in THIS output file only;
          // Vision may split large PDFs across several output JSON files.
          pageCount: jsObject.responses.length,
          languages: languages,
          fileName: fileName,
          location: jsonLocation,
          wordCount: words
          }, { merge: true });

}

Solution

  • After long research, I was able to figure it out. You need to set the `batchSize` field in the `outputConfig`, like this:

    JSON representation
    
    {
      "gcsDestination": {
        object (GcsDestination)
      },
      "batchSize": integer
    }
    

    The max number of response protos to put into each output JSON file on Google Cloud Storage. The valid range is [1, 100]. If not specified, the default value is 20.

    Here's a link to the official docs: https://cloud.google.com/vision/docs/reference/rest/v1/OutputConfig