Tags: python, pandas, matplotlib, seaborn, pdfpages

How to create groups of different plots based on column dtype


I have a dataframe

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: 16 observations with four categorical columns, an integer
# identifier (UID) and four float measurement columns (Time2..Time5).
df = pd.DataFrame(
    {
        'Gen': list('MMMMFFFF' * 2),
        'Site': 'FRX FX FRX FRX FRX FX FRX FX FX FX FX FRX FRX FRX FRX FRX'.split(),
        'Type': ['L'] * 8 + ['R'] * 8,
        'UID': [1001, 1002, 1003, 1004] * 4,
        'color': list('RRGGBGBBRGRGBBRG'),
        'Time2': [
            150.78, 162.34, 188.53, 197.69, 208.07, 217.76, 229.48, 139.51,
            146.87, 182.54, 189.57, 199.97, 229.28, 244.73, 269.91, 249.19,
        ],
        'Time3': [
            250.78, 262.34, 288.53, 297.69, 308.07, 317.7, 329.81, 339.15,
            346.87, 382.54, 369.59, 399.97, 329.28, 347.73, 369.91, 349.12,
        ],
        'Time4': [
            240.18, 232.14, 258.53, 276.69, 338.07, 307.74, 359.16, 339.25,
            365.87, 392.48, 399.97, 410.75, 429.08, 448.39, 465.15, 469.33,
        ],
        'Time5': [
            270.84, 282.14, 298.53, 306.69, 318.73, 327.47, 369.63, 389.59,
            398.75, 432.18, 449.78, 473.55, 494.85, 509.39, 515.52, 539.23,
        ],
    },
    # Explicit column order, identical to the insertion order of the dict above.
    columns=['Gen', 'Site', 'Type', 'UID', 'color', 'Time2', 'Time3', 'Time4', 'Time5'],
)
# Print the per-column dtype / non-null summary.
df.info()

enter image description here

I want to write a function that takes in a dataframe and does the following:

  1. countplots for columns with the object dtype (4 countplots, for the Gen, Site, Type and color columns)

  2. boxplots for columns with the float dtype (4 boxplots, for the Time2, ..., Time5 columns)

  3. export the graphs to one or more PDF files, with two graphs per page

My attempt :

# I am open to other approaches
def data_explorer(data):
    """Draw one plot per column of ``data``, chosen by the column's dtype.

    Columns whose dtype compares equal to ``'object'`` get a seaborn
    countplot, columns whose dtype compares equal to ``'float'`` get a
    seaborn boxplot, and every other column only prints a message.

    After *every* column (including skipped ones) the current matplotlib
    figure is written to ``'data_exploration.pdf'``.
    """
    for col in data.columns:
        # 1. countplots for columns with the object dtype
        if data[col].dtype == 'object':
            sns.countplot(x = col, data = data)
        # 2. boxplots for columns with the float dtype
        elif data[col].dtype == 'float':
            sns.boxplot(data[col])

        else:
            # Reached for any dtype that is neither object nor float,
            # not only for integer columns.
            print("skip integer dtype")
        # 3. save the graphs as pdf
        # NOTE(review): this runs once per column and always targets the same
        # file name, so each save replaces the previous file. No new figure or
        # axes is created inside the loop, so presumably every plot is drawn
        # onto the same current axes -- confirm against the seaborn docs.
        plt.savefig('data_exploration.pdf')


Please note: the final output should contain a total of 8 graphs.


Solution

  • Option 1: 4 figures with 2 plots per page

    1. Select all the columns of the dataframe by dtype with .select_dtypes
    2. Separate the columns into chunks based on the number of plots per page using a list comprehension. Adjust the chunk size n as needed.
    3. Iterate through each group of columns
    4. Create a figure with a number of rows equal to the number of plots per page
    5. Add the plots to the figure and save the figure
    def _save_paged_plots(frame, plot_func, per_page):
        """Plot each column of ``frame`` and save ``per_page`` plots per PDF.

        Parameters
        ----------
        frame : pandas.DataFrame
            Columns to plot; one axes is used per column.
        plot_func : callable
            Seaborn axes-level function called as
            ``plot_func(data=..., x=..., ax=...)``.
        per_page : int
            Number of stacked plots on each saved figure.
        """
        for start in range(0, len(frame.columns), per_page):
            cols = frame.columns[start:start + per_page]
            # squeeze=False keeps `axes` an array even when per_page == 1,
            # where plt.subplots would otherwise return a bare Axes.
            fig, axes = plt.subplots(
                per_page, 1, figsize=(15, 15 * per_page), squeeze=False
            )
            axes = axes.ravel()
            for col, ax in zip(cols, axes):
                plot_func(data=frame[[col]], x=col, ax=ax)
            # The last page may hold fewer columns than axes; hide the
            # leftovers instead of exporting empty frames.
            for ax in axes[len(cols):]:
                ax.set_visible(False)
            # map(str, ...) so non-string column labels do not break the join.
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # Release the figure; otherwise every page stays open in pyplot.
            # In a notebook this also means the figure is not echoed inline.
            plt.close(fig)


    def data_explorer(df):
        """Export countplots and boxplots of ``df`` as PDFs, two plots per page.

        Object-dtype columns are drawn as countplots and float-dtype columns
        as boxplots. Columns of each kind are grouped in pairs; each pair is
        saved to ``data_exploration_<col1>_<col2>.pdf`` in the current working
        directory. Columns of any other dtype are ignored.

        Parameters
        ----------
        df : pandas.DataFrame
            The data to explore.
        """
        # get object and float data
        dobj = df.select_dtypes(include=['object'])
        dflo = df.select_dtypes(include=['float'])

        # plots per page; adjust as needed
        n = 2

        _save_paged_plots(dobj, sns.countplot, n)
        _save_paged_plots(dflo, sns.boxplot, n)


    data_explorer(df)
    

  • Option 2: 2 figures with 4 plots per page

    1. Select all the columns of the dataframe by dtype with .select_dtypes
    2. Create a figure to match the number of plots per page, equal to the total number of columns per group.
    3. Add each group of columns to a plot figure, and save the figure.
    def _save_grid_plot(frame, plot_func, ncols=2):
        """Plot every column of ``frame`` on one grid figure and save it as a PDF.

        Parameters
        ----------
        frame : pandas.DataFrame
            Columns to plot; one axes is used per column.
        plot_func : callable
            Seaborn axes-level function called as
            ``plot_func(data=..., x=..., ax=...)``.
        ncols : int, default 2
            Number of grid columns; the number of rows is derived from the
            number of dataframe columns.
        """
        if frame.columns.empty:
            # Nothing to draw; avoid writing an empty 'data_exploration_.pdf'.
            return

        # Ceiling division: enough rows that no column is silently dropped.
        nrows = (len(frame.columns) + ncols - 1) // ncols
        # With four columns this is the 2x2 grid with figsize=(20, 30).
        # squeeze=False keeps `axes` 2-D even for a single row.
        fig, axes = plt.subplots(
            nrows, ncols, figsize=(20, 15 * nrows), squeeze=False
        )
        axes = axes.ravel()
        for col, ax in zip(frame.columns, axes):
            plot_func(data=frame[[col]], x=col, ax=ax)
        # Hide grid cells that have no column assigned to them.
        for ax in axes[len(frame.columns):]:
            ax.set_visible(False)
        # map(str, ...) so non-string column labels do not break the join.
        fig.savefig(f'data_exploration_{"_".join(map(str, frame.columns))}.pdf')
        # Release the figure; otherwise it stays open in pyplot.
        # In a notebook this also means the figure is not echoed inline.
        plt.close(fig)


    def data_explorer(df):
        """Export countplots and boxplots of ``df`` as two PDF files.

        All object-dtype columns are drawn as countplots on one grid figure
        and all float-dtype columns as boxplots on a second one. Each figure
        is saved to ``data_exploration_<col1>_<col2>_....pdf`` in the current
        working directory. Columns of any other dtype are ignored.

        Parameters
        ----------
        df : pandas.DataFrame
            The data to explore.
        """
        # get object and float data
        dobj = df.select_dtypes(include=['object'])
        dflo = df.select_dtypes(include=['float'])

        # one grid figure holding a countplot for every object column
        _save_grid_plot(dobj, sns.countplot)

        # one grid figure holding a boxplot for every float column
        _save_grid_plot(dflo, sns.boxplot)


    data_explorer(df)
    

    enter image description here

    enter image description here