dataframe, apache-spark, pyspark, unpivot, melt

Unpivot the data frame from wide to long in PySpark using melt


I am trying to perform melting operation on my data frame. I have tried the code below, but I am getting an error:

A DataFrame object does not have an attribute melt. Please check the spelling and/or the datatype of the object.

df_pivot_jp = JP_ch.melt(id_vars=['c_id'], var_name='views_on_character', value_name='answer')
df_pivot_gj = GJ_ch.melt(id_vars=['c_id'], var_name='views_on_character', value_name='answer')

Can someone please tell me what I am missing here?


Solution

  • Input dataframe:

    from pyspark.sql import functions as F

    # Sample wide-format data: one id column plus two value columns to unpivot.
    rows = [
        ('c1', 111, 1111),
        ('c2', 222, 2222),
        ('c3', 333, 3333),
    ]
    JP_ch = spark.createDataFrame(rows, ['c_id', 'col2', 'col3'])
    

    Pandas' melt returns this:

    # Convert to pandas first — pandas DataFrames do have .melt().
    JP_ch = JP_ch.toPandas()
    df_melted_jp = JP_ch.melt(
        id_vars=['c_id'],
        var_name='views_on_character',
        value_name='answer',
    )

    print(df_melted_jp)
    #   c_id views_on_character  answer
    # 0   c1               col2     111
    # 1   c2               col2     222
    # 2   c3               col2     333
    # 3   c1               col3    1111
    # 4   c2               col3    2222
    # 5   c3               col3    3333
    

    Spark 3.4+

    # Spark's native DataFrame.melt (added in 3.4) — keyword form for clarity.
    df = JP_ch.melt(
        ids=['c_id'],
        values=['col2', 'col3'],
        variableColumnName='views_on_character',
        valueColumnName='answer',
    )

    df.show()
    # +----+------------------+------+
    # |c_id|views_on_character|answer|
    # +----+------------------+------+
    # |  c1|              col2|   111|
    # |  c1|              col3|  1111|
    # |  c2|              col2|   222|
    # |  c2|              col3|  2222|
    # |  c3|              col2|   333|
    # |  c3|              col3|  3333|
    # +----+------------------+------+
    

    or

    # Compute the value columns as "everything except the id columns".
    id_cols = ['c_id']
    to_melt = [c for c in JP_ch.columns if c not in id_cols]
    df = JP_ch.melt(id_cols, to_melt, 'views_on_character', 'answer')

    df.show()
    # +----+------------------+------+
    # |c_id|views_on_character|answer|
    # +----+------------------+------+
    # |  c1|              col2|   111|
    # |  c1|              col3|  1111|
    # |  c2|              col2|   222|
    # |  c2|              col3|  2222|
    # |  c3|              col2|   333|
    # |  c3|              col3|  3333|
    # +----+------------------+------+
    

    Older Spark versions:

    # Fallback for Spark < 3.4: emulate melt with the SQL stack() function.
    # Use lists (not sets) so the column order — and hence the row order of
    # the result — is deterministic across runs; set iteration order is not.
    id_cols = ['c_id']
    to_melt = [c for c in JP_ch.columns if c not in id_cols]
    new_names = '(views_on_character, answer)'

    # stack(n, 'name1', val1, ..., 'nameN', valN) emits one row per pair.
    melt_list = [f"'{c}', `{c}`" for c in to_melt]
    df = JP_ch.select(
        *id_cols,
        F.expr(f"stack({len(melt_list)}, {', '.join(melt_list)}) {new_names}")
    )
    df.show()
    # +----+------------------+------+
    # |c_id|views_on_character|answer|
    # +----+------------------+------+
    # |  c1|              col2|   111|
    # |  c1|              col3|  1111|
    # |  c2|              col2|   222|
    # |  c2|              col3|  2222|
    # |  c3|              col2|   333|
    # |  c3|              col3|  3333|
    # +----+------------------+------+