pyspark, pyspark-schema

PySpark - Fill empty strings with '0' if the data type is BIGINT/DOUBLE/Integer


I am trying to fill empty strings with '0' when the column data type is BIGINT/DOUBLE/Integer in a DataFrame using PySpark.

data = [("James","","Smith","36","M",3000,"1.2"),
    ("Michael","Rose"," ","40","M",4000,"2.0"),
    ("Robert","","Williams","42","M",4000,"5.0"),
    ("Maria","Anne"," ","39","F", ," "),
    ("Jen","Mary","Brown"," ","F",-1,"")
  ]

schema = StructType([ 
    StructField("firstname",StringType(),True), 
    StructField("middlename",StringType(),True), 
    StructField("lastname",StringType(),True), 
    StructField("age", StringType(), True), 
    StructField("gender", StringType(), True), 
    StructField("salary", IntegerType(), True),
    StructField("amount", DoubleType(), True) 
  ])
 
df = spark.createDataFrame(data=data,schema=schema)
df.printSchema()

This is what I have tried, but it does not work:

df.select(*[
    F.when(F.dtype in ('integertype', 'doubletype') and F.col(column).ishaving(" "), '0')
     .otherwise(F.col(column))
     .alias(column)
    for column in df.columns
]).show()

Expected output:

+---------+----------+--------+---+------+------+------+                        
|firstname|middlename|lastname|age|gender|salary|amount|
+---------+----------+--------+---+------+------+------+
|    James|          |   Smith| 36|     M|  3000|   1.2|
|  Michael|      Rose|        | 40|     M|  4000|   2.0|
|   Robert|          |Williams| 42|     M|  4000|   5.0|
|    Maria|      Anne|        | 39|     F|     0|     0|
|      Jen|      Mary|   Brown|   |     F|    -1|     0|
+---------+----------+--------+---+------+------+------+

Solution

  • You can use functools.reduce to accomplish this; it makes the code cleaner and easier to understand.

    Additionally, create a to_fill list of the columns matching your condition; it can be adapted to other scenarios. Note that F.dtype and Column.ishaving do not exist; column types are exposed as (name, type) pairs via df.dtypes. Also, an IntegerType or DoubleType column cannot hold an empty string, so the blank values end up as null and a null check is sufficient.

    Data Preparation

    data = [("James","","Smith","36","M",3000,1.2),
        ("Michael","Rose"," ","40","M",4000,2.0),
        ("Robert","","Williams","42","M",4000,5.0),
        ("Maria","Anne"," ","39","F",None,None),
        ("Jen","Mary","Brown"," ","F",-1,None)
      ]
    
    schema = StructType([ 
        StructField("firstname",StringType(),True), 
        StructField("middlename",StringType(),True), 
        StructField("lastname",StringType(),True), 
        StructField("age", StringType(), True), 
        StructField("gender", StringType(), True), 
        StructField("salary", IntegerType(), True),
        StructField("amount", DoubleType(), True) 
      ])
     
    sparkDF = spark.createDataFrame(data=data, schema=schema)
    sparkDF.show()
    
    +---------+----------+--------+---+------+------+------+
    |firstname|middlename|lastname|age|gender|salary|amount|
    +---------+----------+--------+---+------+------+------+
    |    James|          |   Smith| 36|     M|  3000|   1.2|
    |  Michael|      Rose|        | 40|     M|  4000|   2.0|
    |   Robert|          |Williams| 42|     M|  4000|   5.0|
    |    Maria|      Anne|        | 39|     F|  null|  null|
    |      Jen|      Mary|   Brown|   |     F|    -1|  null|
    +---------+----------+--------+---+------+------+------+
    

    Reduce

    from functools import reduce

    to_fill = [c for c, d in sparkDF.dtypes if d in ['int', 'bigint', 'double']]

    # to_fill --> ['salary', 'amount']
    
    # Thread sparkDF through reduce, replacing nulls with 0
    # in each column listed in to_fill
    sparkDF = reduce(
        lambda df, x: df.withColumn(x, F.when(F.col(x).isNull(), 0).otherwise(F.col(x))),
        to_fill,
        sparkDF,
    )
    
    sparkDF.show()
    
    +---------+----------+--------+---+------+------+------+
    |firstname|middlename|lastname|age|gender|salary|amount|
    +---------+----------+--------+---+------+------+------+
    |    James|          |   Smith| 36|     M|  3000|   1.2|
    |  Michael|      Rose|        | 40|     M|  4000|   2.0|
    |   Robert|          |Williams| 42|     M|  4000|   5.0|
    |    Maria|      Anne|        | 39|     F|     0|   0.0|
    |      Jen|      Mary|   Brown|   |     F|    -1|   0.0|
    +---------+----------+--------+---+------+------+------+
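
    As an alternative, the same null filling can be done in a single call with DataFrame.fillna, passing the to_fill list as the subset. A minimal sketch, assuming the same sparkDF and to_fill as above:

    # fillna replaces nulls with 0, but only in the columns listed in subset;
    # the fill value is coerced per column type (0 for salary, 0.0 for amount)
    sparkDF = sparkDF.fillna(0, subset=to_fill)
    sparkDF.show()

    This produces the same output as the reduce version.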