I'm trying to read PDF files that are stored in an S3 bucket, using a Python package called pdfplumber. I tried the following approaches, but none of them worked.
Does anyone know how I can read PDFs from an S3 bucket using pdfplumber?
######################################################################
## attempt 1: supplying S3 URL directly to pdfplumber.open() method ##
######################################################################
import pdfplumber

s3_url = "s3://example-s3-bucket/example.pdf"
# The s3:// URL is handed to pdfplumber.open() unchanged; this is the call
# that produced the error reported below.
# NOTE(review): the message names an 'S3' object rather than a str, so the
# value actually passed may have been an S3 client/resource -- confirm.
with pdfplumber.open(s3_url) as pdf:
    pages = pdf.pages
## ERROR -- AttributeError: 'S3' object has no attribute 'seek'
#####################################################
## attempt 2: opening content using pre-signed URL ##
#####################################################
import pdfplumber
import boto3

s3 = boto3.client('s3')
pdf_url = s3.generate_presigned_url(
    'get_object',
    Params={'Bucket': 'example-s3-bucket', 'Key': 'example.pdf'},
    ExpiresIn=3600  # Set an expiration time for the URL if needed
)
# pdf_url is a plain string; judging by the error below, it is treated as a
# local file name rather than being fetched over HTTP.
with pdfplumber.open(pdf_url) as pdf:
    pages = pdf.pages
## ERROR -- OSError: [Errno 63] File name too long:
############################################################
## attempt 3: supplying bytes to pdfplumber.open() method ##
############################################################
import pdfplumber
import boto3

s3 = boto3.client('s3')
bucket_name = "example-s3-bucket"
file_key = "example.pdf"
response = s3.get_object(Bucket=bucket_name, Key=file_key)
file_bytes = response['Body'].read()
# The raw bytes are passed straight in; per the error below, .seek() is
# called on the argument, and a bytes object does not provide it.
with pdfplumber.open(file_bytes) as pdf:
    pages = pdf.pages
## ERROR -- AttributeError: 'bytes' object has no attribute 'seek'
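The package repo owner (jsvine) responded with a solution: https://github.com/jsvine/pdfplumber/discussions/1081
The fix is to wrap the downloaded bytes in io.BytesIO, so that pdfplumber.open() receives a seekable file-like object instead of raw bytes: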
import pdfplumber
import boto3
import io

s3 = boto3.client('s3')
bucket_name = "example-s3-bucket"
file_key = "example.pdf"

# Download the object and read its body into memory.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
file_bytes = response['Body'].read()

# Wrap the bytes in io.BytesIO so pdfplumber gets a file-like object that
# supports .seek(). The variable must be file_bytes, the name bound above.
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
    pages = pdf.pages