I have written a Node.js script that extracts tables and forms from PDF documents using AWS Textract. The problem I'm facing is that, with both the asynchronous and the synchronous Textract operations, I am not getting any tables after the first page of documents uploaded to S3. All the textual data and the form key-value pairs seem fine, but the response does not contain any tables after page 1.
The interesting thing, though, is that the tables are recognized and shown in the CSV results of the Textract bulk uploader in the AWS Console, which is very strange!
When I use the aws-sdk, the "Blocks" in the Textract API response don't contain any block with a BlockType of "TABLE" on pages after page 1. Please help me with this, as the results shown in the AWS Console itself do in fact include the tables after page 1. So why is there a difference when I make the API calls from a script? Any help will be much appreciated!
Here is the code I have tried out:
const {
TextractClient,
StartDocumentAnalysisCommand,
GetDocumentAnalysisCommand,
} = require("@aws-sdk/client-textract");
/**
 * Starts an asynchronous Textract document-analysis job (forms and tables)
 * for an object stored in S3, then waits for that job via
 * waitForJobCompletion.
 *
 * Errors are logged to the console and not rethrown, so the returned promise
 * always resolves with undefined.
 *
 * @param {string} file - Key (name) of the document inside the bucket.
 * @param {string} bucket - Name of the S3 bucket holding the document.
 * @returns {Promise<void>}
 */
const startJob = async (file, bucket) => {
  const s3Object = { Bucket: bucket, Name: file };
  const request = {
    DocumentLocation: { S3Object: s3Object },
    FeatureTypes: ["FORMS", "TABLES"],
  };

  try {
    const { JobId: jobId } = await textractClient.send(
      new StartDocumentAnalysisCommand(request)
    );
    console.log("Textract job started with ID:", jobId);

    // Block until the job has finished and its results were handled
    await waitForJobCompletion(jobId, file);
  } catch (err) {
    console.log("Error starting Textract job:", err);
  }
};
/**
 * Polls a Textract document-analysis job until it leaves the IN_PROGRESS
 * state and, on success, collects every part of the paginated result.
 *
 * GetDocumentAnalysis splits large results into several parts: a part that
 * is followed by more data carries a NextToken, which has to be sent back to
 * fetch the next part. Reading only the first part silently drops the blocks
 * (for example TABLE blocks) of later pages.
 *
 * On success the first response, with Blocks replaced by the blocks of all
 * parts, is written to ./s3-textract-results/tabledata.json and the blocks
 * are logged.
 *
 * @param {string} jobId - Id of the job returned when the analysis was started.
 * @param {string} file - Name of the analysed document (unused here; kept so
 *   existing callers keep working).
 * @param {number} [pollIntervalMs=10000] - Delay in milliseconds between two
 *   status checks while the job is IN_PROGRESS.
 * @returns {Promise<Array<object>|undefined>} Blocks of all result parts when
 *   the job succeeded; undefined when the job did not succeed or a request
 *   failed (the error is logged, not rethrown).
 */
const waitForJobCompletion = async (jobId, file, pollIntervalMs = 10000) => {
  try {
    let response;
    let jobStatus;
    do {
      response = await textractClient.send(
        new GetDocumentAnalysisCommand({ JobId: jobId })
      );
      jobStatus = response.JobStatus;
      console.log("Job status:", jobStatus);
      if (jobStatus === "IN_PROGRESS") {
        // Still running: pause before asking again
        await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
      }
    } while (jobStatus === "IN_PROGRESS");

    if (jobStatus !== "SUCCEEDED") {
      // FAILED and PARTIAL_SUCCESS are final states; polling again would
      // return the same status forever, so stop here.
      console.log("Job failed or partially succeeded:", response);
      return undefined;
    }

    // Gather the blocks of the first part, then follow NextToken until the
    // service stops returning one.
    const blocks = [];
    for (const block of response.Blocks || []) {
      blocks.push(block);
    }
    let nextToken = response.NextToken;
    while (nextToken) {
      const part = await textractClient.send(
        new GetDocumentAnalysisCommand({ JobId: jobId, NextToken: nextToken })
      );
      for (const block of part.Blocks || []) {
        blocks.push(block);
      }
      nextToken = part.NextToken;
    }

    // The token is meaningless once every part has been merged
    const combined = { ...response, Blocks: blocks };
    delete combined.NextToken;

    // Writing stays best-effort: a failure is logged and does not reject
    await new Promise((resolve) => {
      fs.writeFile(
        `./s3-textract-results/tabledata.json`,
        JSON.stringify(combined),
        'utf8',
        (err) => {
          if (err) {
            console.error('Error writing to file:', err);
          } else {
            console.log('Data written to file.');
          }
          resolve();
        }
      );
    });
    console.log(blocks);
    return blocks;
  } catch (err) {
    console.log("Error retrieving Textract job results:", err);
    return undefined;
  }
};
I made it work by using the "NextToken" in the response: whenever the status was "SUCCEEDED" but the response still contained a NextToken, the results had not been fully returned yet. You have to pass that NextToken in the jobParams of the subsequent requests to get the remaining parts of the response. Textract returns a multi-part (paginated) response when the result is too big (i.e. when the file is large).