Tags: python, pyspark

"Column is not iterable" when doing operations with dataframe as part of function


Good day, everyone. I'm trying to perform a comparison, with a calculation, between two dataframes and produce output in the form of a table with two columns: "Type of Change" and "Number of Occurences". To do this I've merged the two dataframes, adding "_New" and "_Old" suffixes to the column names to identify them, and wrote a little function to compare them, sum the differences, and return the result as a dataframe. The code looks like this:

# Loop to go through columns and count changes
def Form_Change_Report(
    ColumnList,
    Comparison_Dataframe: Comparison_DF,
    ChangeReport_Dataframe: ChangeReport_DF,
):
    """Count per-column differences between matching *_Old and *_New columns.

    For each name ``x`` in ColumnList (expected to be plain strings), flags
    rows where ``x_Old`` differs from ``x_New``, sums the flags, and appends
    a one-row summary to ChangeReport_Dataframe, which is returned.

    NOTE(review): the parameter annotations are DataFrame *instances*, not
    types — presumably intended as documentation; confirm intent.
    NOTE(review): ``sum`` here must resolve to pyspark.sql.functions.sum,
    not the Python builtin — confirm the imports in the full script.
    """
    for x in ColumnList:
        # 0 when the old and new values match, 1 when they differ
        Comparison_Dataframe = Comparison_Dataframe.withColumn(
            x + "_Change", when(col(x + "_Old") == col(x + "_New"), 0).otherwise(1)
        )
        # One-row summary: label + total number of changed rows for column x
        Change_DF = Comparison_Dataframe.select(
            lit("Change to " + x).alias("Type of Change"),
            sum(x + "_Change").alias("Number of Occurences"),
        )
        ChangeReport_Dataframe = ChangeReport_Dataframe.unionByName(Change_DF)

    return ChangeReport_Dataframe


# Forming blank dataframe for change report
# NOTE(review): ChangeReport_RDD is never used — createDataFrame below
# starts from an empty list instead.
ChangeReport_RDD = spark.sparkContext.emptyRDD()
ChangeReport_Columns = [
    StructField("Type of Change", StringType(), True),
    StructField("Number of Occurences", IntegerType(), True),
]
# Empty report frame with the two target columns; rows are appended by
# Form_Change_Report via unionByName.
ChangeReport_DF = spark.createDataFrame([], schema=StructType(ChangeReport_Columns))

ChangeReport_DF = Form_Change_Report(Columns_To_Compare, Comparison_DF, ChangeReport_DF)

When trying to run it, it returns the error "Column is not iterable". I tried to browse the net in search of a solution, but all the examples I found were based on usage of the MAX function, which isn't the case in this scenario. The dataframes are built in PySpark, and all operations are done in Databricks, if that is necessary information. Would someone be able to point out what the issue might be?

Update 1

I've been asked to add a reprex. The code I've got looks like this:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.sql.functions import sum as fsum
from pyspark.sql.types import StructType, StructField


# Sample "new" and "old" datasets: (postcode, count1, count2), all strings
data1 = [
    ("Postcode1", "100", "150"),
    ("Postcode2", "200", "250"),
    ("Postcode3", "300", "350"),
    ("Postcode4", "400", "450"),
]
data2 = [
    ("Postcode1", "150", "150"),
    ("Postcode2", "200", "200"),
    ("Postcode3", "350", "350"),
    ("Postcode4", "400", "450"),
]
Columns = ["Postcode", "Count1", "Count2"]

# NOTE(review): `spark` is assumed to already exist in scope (Databricks
# provides a session) — SparkSession is imported above but never built.
rdd1 = spark.sparkContext.parallelize(data1)
rdd2 = spark.sparkContext.parallelize(data2)

df1 = spark.createDataFrame(rdd1, schema=Columns)
df2 = spark.createDataFrame(rdd2, schema=Columns)

# Rename the "new" frame's count columns with a _New suffix
ColumnNames_New = [
    f.col("Postcode"),
    f.col("Count1").alias("Count1_New"),
    f.col("Count2").alias("Count2_New"),
]

df1_NewData = df1.select(ColumnNames_New)

# Rename the "old" frame's count columns with an _Old suffix
ColumnNames_Old = [
    f.col("Postcode"),
    f.col("Count1").alias("Count1_Old"),
    f.col("Count2").alias("Count2_Old"),
]

df2_OldData = df2.select(ColumnNames_Old)

# Joining two dataframes on postcode
Comparison_DF = df1_NewData.join(df2_OldData, "Postcode")

# NOTE(review): these are Column objects, but Form_Change_Report below
# treats each element as a string (x + "_Old", "Change to " + x) — this
# mismatch is what raises "Column is not iterable"; plain name strings
# ("Count1", "Count2") are needed here.
Columns_to_Compare = [f.col("Count1"), f.col("Count2")]

# Forming blank dataframe for change report
# NOTE(review): ChangeReport_RDD is unused.
ChangeReport_RDD = spark.sparkContext.emptyRDD()
ChangeReport_Columns = [
    # NOTE(review): StringType/IntegerType are not imported above
    # (only StructType/StructField are) — confirm the imports.
    StructField("Type of Change", StringType(), True),
    StructField("Number of Occurences", IntegerType(), True),
]
ChangeReport_DF = spark.createDataFrame([], schema=StructType(ChangeReport_Columns))


def Form_Change_Report(
    ColumnList,
    Comparison_Dataframe: df1_NewData,
    ChangeReport_Dataframe: ChangeReport_DF,
):
    """Append one summary row per compared column to the change report.

    NOTE(review): ColumnList is passed Column objects (Columns_to_Compare)
    but every use of ``x`` below requires a string (x + "_Old",
    "Change to " + x) — this is the source of the "Column is not
    iterable" error; pass plain column-name strings instead.
    NOTE(review): the parameter annotations are DataFrame instances,
    not types — presumably intended as documentation.
    NOTE(review): ``lit`` is not imported in the reprex's import block —
    use f.lit or add it to the imports.
    """
    for x in ColumnList:
        # Flag rows where old and new values differ (1) or match (0)
        Comparison_Dataframe = Comparison_Dataframe.withColumn(
            x + "_Change", when(col(x + "_Old") == col(x + "_New"), 0).otherwise(1)
        )
        # One-row summary: label + total changed rows for column x
        Change_DF = Comparison_Dataframe.select(
            lit("Change to " + x).alias("Type of Change"),
            fsum(x + "_Change").alias("Number of Occurences"),
        )
        ChangeReport_Dataframe = ChangeReport_Dataframe.unionByName(Change_DF)

    return ChangeReport_Dataframe


ChangeReport_DF = Form_Change_Report(Columns_to_Compare, Comparison_DF, ChangeReport_DF)

Solution

  • Your reprex cannot run currently. I made some adjustments to make it work; I hope it will help you find the problem. I believe it comes from this line:

    # fix this line: plain name strings, not Column objects — the function
    # concatenates each element with "_Old"/"_New" suffixes.
    # Columns_to_Compare = [f.col("Count1"), f.col("Count2")]
    Columns_to_Compare = ["Count1", "Count2"]

    # import pyspark.sql.functions as f
    # from pyspark.sql import SparkSession
    # from pyspark.sql.functions import col, when
    # from pyspark.sql.functions import sum as fsum
    # from pyspark.sql.types import StructType, StructField

    # simplified your import and added reduce for later
    from pyspark.sql import functions as F
    from functools import reduce


    # Sample "new" and "old" datasets: (postcode, count1, count2)
    data1 = [
        ("Postcode1", "100", "150"),
        ("Postcode2", "200", "250"),
        ("Postcode3", "300", "350"),
        ("Postcode4", "400", "450"),
    ]
    data2 = [
        ("Postcode1", "150", "150"),
        ("Postcode2", "200", "200"),
        ("Postcode3", "350", "350"),
        ("Postcode4", "400", "450"),
    ]
    Columns = ["Postcode", "Count1", "Count2"]

    # rdd1 = spark.sparkContext.parallelize(data1)
    # rdd2 = spark.sparkContext.parallelize(data2)

    # df1 = spark.createDataFrame(rdd1, schema=Columns)
    # df2 = spark.createDataFrame(rdd2, schema=Columns)

    # Create directly the DF (no need for the intermediate RDDs)
    df1 = spark.createDataFrame(data1, schema=Columns)
    df2 = spark.createDataFrame(data2, schema=Columns)

    # rename f to F; suffix the "new" frame's count columns with _New
    ColumnNames_New = [
        F.col("Postcode"),
        F.col("Count1").alias("Count1_New"),
        F.col("Count2").alias("Count2_New"),
    ]

    df1_NewData = df1.select(ColumnNames_New)

    # rename f to F; suffix the "old" frame's count columns with _Old
    ColumnNames_Old = [
        F.col("Postcode"),
        F.col("Count1").alias("Count1_Old"),
        F.col("Count2").alias("Count2_Old"),
    ]

    df2_OldData = df2.select(ColumnNames_Old)

    # Joining two dataframes on postcode
    Comparison_DF = df1_NewData.join(df2_OldData, "Postcode")

    # fix this line
    # Columns_to_Compare = [f.col("Count1"), f.col("Count2")]
    Columns_to_Compare = ["Count1", "Count2"]

    # useless — the function now builds the report itself via reduce
    # Forming blank dataframe for change report
    # ChangeReport_RDD = spark.sparkContext.emptyRDD()
    # ChangeReport_Columns = [
    #     StructField("Type of Change", StringType(), True),
    #     StructField("Number of Occurences", IntegerType(), True),
    # ]
    # ChangeReport_DF = spark.createDataFrame([], schema=StructType(ChangeReport_Columns))
    # update the function with reduce and simplified imports
    def Form_Change_Report(
        ColumnList,
        Comparison_Dataframe: df1_NewData,
    ):
        """Build a change report with one row per compared column.

        Parameters
        ----------
        ColumnList : list of plain column-name strings; for each name ``x``
            the input frame must contain ``x_Old`` and ``x_New`` columns.
        Comparison_Dataframe : the joined old/new DataFrame.
            (NOTE(review): the annotation above is a DataFrame instance,
            kept only as documentation of the expected argument.)

        Returns a DataFrame with columns "Type of Change" and
        "Number of Occurences", one row per entry in ColumnList.

        Raises ValueError if ColumnList is empty.
        """
        if not ColumnList:
            # reduce() on an empty sequence with no initializer raises an
            # opaque TypeError — fail early with a clear message instead.
            raise ValueError("ColumnList must contain at least one column name")
        summaries = []
        for x in ColumnList:
            # 1 where the old and new values differ, else 0
            Comparison_Dataframe = Comparison_Dataframe.withColumn(
                x + "_Change",
                F.when((F.col(x + "_Old") == F.col(x + "_New")), 0).otherwise(1),
            )
            # One-row frame: label + count of changed rows for this column
            summaries.append(
                Comparison_Dataframe.select(
                    F.lit("Change to " + x).alias("Type of Change"),
                    F.sum(x + "_Change").alias("Number of Occurences"),
                )
            )

        # Stack the per-column one-row summaries into a single report
        return reduce(lambda acc, nxt: acc.union(nxt), summaries)
    
    # remove not used anymore ChangeReport_DF (the function builds the
    # report itself, so no seed frame argument is needed)
    # ChangeReport_DF = Form_Change_Report(Columns_to_Compare, Comparison_DF, ChangeReport_DF)
    ChangeReport_DF = Form_Change_Report(Columns_to_Compare, Comparison_DF)
    

    The result is:

    Type of Change Number of Occurences
    Change to Count1 2
    Change to Count2 1