I am trying to fill empty strings with a '0' when the column data type is BIGINT/DOUBLE/INTEGER in a DataFrame using PySpark.
data = [("James","","Smith","36","M",3000,"1.2"),
("Michael","Rose"," ","40","M",4000,"2.0"),
("Robert","","Williams","42","M",4000,"5.0"),
("Maria","Anne"," ","39","F", ," "),
("Jen","Mary","Brown"," ","F",-1,"")
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("age", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True),
StructField("amount", DoubleType(), True)
])
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()
This is what I have tried:
df.select( *[ F.when(F.dtype in ('integertype','doubletype') and F.col(column).ishaving(" "),'0').otherwise(F.col(column)).alias(column) for column in df.columns]).show()
Expected output:
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| 0| 0|
| Jen| Mary| Brown| | F| -1| 0|
+---------+----------+--------+---+------+------+------+
You can use reduce to accomplish this; it makes the code cleaner and easier to understand.
Additionally, create a to_fill list containing the columns that match your condition; it can be modified further to fit your scenario.
data = [("James","","Smith","36","M",3000,1.2),
("Michael","Rose"," ","40","M",4000,2.0),
("Robert","","Williams","42","M",4000,5.0),
("Maria","Anne"," ","39","F",None,None),
("Jen","Mary","Brown"," ","F",-1,None)
]
schema = StructType([
StructField("firstname",StringType(),True),
StructField("middlename",StringType(),True),
StructField("lastname",StringType(),True),
StructField("age", StringType(), True),
StructField("gender", StringType(), True),
StructField("salary", IntegerType(), True),
StructField("amount", DoubleType(), True)
])
sparkDF = spark.createDataFrame(data=data, schema=schema)
sparkDF.show()
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| null| null|
| Jen| Mary| Brown| | F| -1| null|
+---------+----------+--------+---+------+------+------+
to_fill = [ c for c,d in sparkDF.dtypes if d in ['int','bigint','double']]
# to_fill --> ['salary','amount']
# Chain withColumn over every column in to_fill, replacing nulls with 0
sparkDF = reduce(
    lambda df, x: df.withColumn(x, F.when(F.col(x).isNull(), 0).otherwise(F.col(x))),
    to_fill,
    sparkDF,
)
sparkDF.show()
+---------+----------+--------+---+------+------+------+
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
| James| | Smith| 36| M| 3000| 1.2|
| Michael| Rose| | 40| M| 4000| 2.0|
| Robert| |Williams| 42| M| 4000| 5.0|
| Maria| Anne| | 39| F| 0| 0.0|
| Jen| Mary| Brown| | F| -1| 0.0|
+---------+----------+--------+---+------+------+------+
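As a side note, the same null filling can also be done without reduce. Below is a minimal sketch assuming the sparkDF and to_fill defined above (sparkDF_filled is just an illustrative name): one version uses DataFrame.fillna with a subset, the other uses a single select that stays closer to the list-comprehension attempt in the question. Both leave the string columns untouched.

# Option 1: fillna only replaces nulls in the columns listed in subset
sparkDF_filled = sparkDF.fillna(0, subset=to_fill)

# Option 2: a single select, closer to the original attempt in the question
sparkDF_filled = sparkDF.select(
    *[
        F.when(F.col(c).isNull(), 0).otherwise(F.col(c)).alias(c)
        if c in to_fill else F.col(c)
        for c in sparkDF.columns
    ]
)

sparkDF_filled.show()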