python, amazon-s3, aws-glue, filepattern

How to add file name pattern in AWS Glue ETL job python script


I want to add a file name pattern in an AWS Glue ETL job Python script so that it generates files in the S3 bucket matching the pattern dostrp*.csv.gz, but I could not find a way to specify this file pattern in the Python script:

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Resolve the arguments passed to the job via --JOB_NAME / --target_BucketName.
args = getResolvedOptions(sys.argv, ['target_BucketName', 'JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Target S3 path (e.g. "s3://my-bucket/some/prefix/") for the CSV output.
outputbucketname = args['target_BucketName']

# Read the source table from the Glue Data Catalog.
# BUG FIX: the original script bound the frame to
# AWSGlueDataCatalog_node188777777 but the write call referenced
# AWSGlueDataCatalog_node8877777777 (a different digit run), which raises
# NameError at runtime. A single descriptive name removes the mismatch.
# The transformation_ctx strings are kept byte-identical so existing job
# bookmarks remain valid.
source_dyf = glueContext.create_dynamic_frame.from_catalog(
    database="xxxx",
    table_name="xxxx",
    transformation_ctx="AWSGlueDataCatalog_node887777777",
)

# Write gzip-compressed, pipe-delimited CSV to S3.
# NOTE: Spark/Glue controls the object names here (part-00000-<uuid>.csv.gz);
# a custom pattern such as dostrp*.csv.gz cannot be specified through
# connection_options -- rename the objects afterwards with boto3, or write a
# single object directly (see the pandas solution below).
glueContext.write_dynamic_frame.from_options(
    frame=source_dyf,
    connection_type="s3",
    format="csv",
    format_options={"separator": "|"},
    connection_options={
        "path": outputbucketname,
        "compression": "gzip",
        "partitionKeys": [],
    },
    transformation_ctx="AmazonS3_node5566677777",
)

job.commit()

Solution

  • Use pandas to write a single object with the exact key you want:

    # BUG FIX: the original snippet used io.BytesIO and boto3.client without
    # importing either module, and computed s3_output_path without using it.
    import io

    import boto3
    import pandas as pd

    # Convert the Glue DynamicFrame to a pandas DataFrame so we control the
    # exact output object name (the Spark writer only emits part-* files).
    # NOTE(review): assumes `dynamic_frame` is the DynamicFrame created
    # earlier in the job script.
    df = dynamic_frame.toDF().toPandas()

    # Destination bucket and prefix; the key below fixes the exact file name
    # (use e.g. "dostrp_20240101.csv.gz" for the dostrp*.csv.gz pattern).
    s3_bucket = 'your-s3-bucket'
    s3_prefix = 'your/s3/prefix/'
    s3_key = f"{s3_prefix}output_file.csv.gz"

    # Serialize the DataFrame into an in-memory gzip-compressed CSV.
    # buffer.getvalue() returns the full contents regardless of the stream
    # position, so no seek(0) is required before reading it back.
    buffer = io.BytesIO()
    df.to_csv(buffer, index=False, compression='gzip')

    # Upload the single object under the desired key.
    s3_client = boto3.client('s3')
    s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body=buffer.getvalue())