python, amazon-s3, aws-glue, filepattern

How to add file name pattern in AWS Glue ETL job python script


I want to add a file name pattern in an AWS Glue ETL job Python script so that it generates files in the S3 bucket matching the pattern dostrp*.csv.gz, but I could not find a way to specify this file pattern in the Python script:

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Resolve the arguments passed to the job via --JOB_NAME / --target_BucketName.
args = getResolvedOptions(sys.argv, ['target_BucketName', 'JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Target S3 path (e.g. "s3://my-bucket/some/prefix/") for the CSV output.
outputbucketname = args['target_BucketName']

# Read the source table from the Glue Data Catalog.
# BUG FIX: the original script bound the frame to
# AWSGlueDataCatalog_node188777777 but the write call referenced
# AWSGlueDataCatalog_node8877777777 (a different digit run), which raises
# NameError at runtime. A single descriptive name removes the mismatch.
# The transformation_ctx strings are kept byte-identical so existing job
# bookmarks remain valid.
source_dyf = glueContext.create_dynamic_frame.from_catalog(
    database="xxxx",
    table_name="xxxx",
    transformation_ctx="AWSGlueDataCatalog_node887777777",
)

# Write gzip-compressed, pipe-delimited CSV to S3.
# NOTE: Spark/Glue controls the object names here (part-00000-<uuid>.csv.gz);
# a custom pattern such as dostrp*.csv.gz cannot be specified through
# connection_options -- rename the objects afterwards with boto3, or write a
# single object directly (see the pandas solution below).
glueContext.write_dynamic_frame.from_options(
    frame=source_dyf,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={
        "path": outputbucketname,
        "compression": "gzip",
        "partitionKeys": [],
    },
    transformation_ctx="AmazonS3_node5566677777",
)

job.commit()

Solution

  • Use pandas to write a single object with the exact key you want:

    # BUG FIX: the original snippet used io.BytesIO and boto3.client without
    # importing either module, and computed s3_output_path without using it.
    import io

    import boto3
    import pandas as pd

    # Convert the Glue DynamicFrame to a pandas DataFrame so we control the
    # exact output object name (the Spark writer only emits part-* files).
    # NOTE(review): assumes `dynamic_frame` is the DynamicFrame created
    # earlier in the job script.
    df = dynamic_frame.toDF().toPandas()

    # Destination bucket and prefix; the key below fixes the exact file name
    # (use e.g. "dostrp_20240101.csv.gz" for the dostrp*.csv.gz pattern).
    s3_bucket = 'your-s3-bucket'
    s3_prefix = 'your/s3/prefix/'
    s3_key = f"{s3_prefix}output_file.csv.gz"

    # Serialize the DataFrame into an in-memory gzip-compressed CSV.
    # buffer.getvalue() returns the full contents regardless of the stream
    # position, so no seek(0) is required before reading it back.
    buffer = io.BytesIO()
    df.to_csv(buffer, index=False, compression='gzip')

    # Upload the single object under the desired key.
    s3_client = boto3.client('s3')
    s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body=buffer.getvalue())