python legend altair

Altair chart legend for subset of data


As an exercise for learning more advanced altair, I'm trying to generate a simplified version of this chart: https://climatereanalyzer.org/clim/t2_daily/?dm_id=world.

To simplify, I'm using gray for all years prior to 2023 and then red and black for 2023 and 2024, respectively. I'd like to have a legend that is either just for 2023 & 2024 or is "1940-2022", "2023", "2024".

Right now I'm focused on getting a compact legend that reflects either subset of years, but I'd take any advice on how to improve the code / approach.

import pandas as pd
import altair as alt

# Function to fetch and prepare the data
def fetch_and_prep_data(
    url='https://climatereanalyzer.org/clim/t2_daily/json/era5_world_t2_day.json',
):
    """Download the daily temperature series and return them as one long DataFrame.

    The payload is expected to be a JSON list of objects, each with a 'name'
    (the series label, used as the year) and a 'data' list of daily values.

    Args:
        url: Location of the JSON payload. Defaults to the Climate Reanalyzer
            ERA5 world 2 m temperature feed.

    Returns:
        A DataFrame with one row per series and day, with columns 'Year'
        (the series name), 'Day' (1-based position within the series) and
        'Temperature' (missing values stored as NaN). Only series whose name
        is at most four characters long are kept.
    """
    # Imported locally from the standard library: this file never imports
    # ``requests``, so the previous ``requests.get`` call raised NameError.
    import json
    from urllib.request import urlopen

    with urlopen(url, timeout=30) as response:
        data = json.load(response)

    frames = []
    for year_data in data:
        # JSON null marks a missing reading; store it as NaN so the column stays numeric.
        temperatures = [
            float('nan') if temp is None else temp for temp in year_data['data']
        ]
        frames.append(pd.DataFrame({
            'Year': [year_data['name']] * len(temperatures),
            'Day': list(range(1, len(temperatures) + 1)),
            'Temperature': temperatures,
        }))

    # pd.concat raises on an empty list, so return an empty frame with the same columns.
    if not frames:
        return pd.DataFrame(columns=['Year', 'Day', 'Temperature'])

    df_at = pd.concat(frames)

    # Keep only series whose name is at most four characters long
    # (presumably this drops non-year series such as multi-year averages).
    df_at = df_at[df_at['Year'].str.len() <= 4]

    return df_at

# Function to create the last day in month labels
def get_last_day_in_month_labels():
    """Map the day-of-year of each month's last day in 2023 to its month abbreviation.

    Returns:
        A dict whose keys are day-of-year numbers and whose values are the
        '%b'-formatted month names of the corresponding month-end dates.
    """
    year_days = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
    month_ends = year_days[year_days.is_month_end]
    return dict(zip(month_ends.day_of_year, month_ends.strftime('%b')))

# Functions to determine opacity, color, and stroke width
def determine_opacity(year):
    """Return the line opacity for a year label.

    Years before 2023 get 0.01; 2023 and later get 1.0. A label that
    ``int()`` rejects with ValueError also gets 1.0.
    """
    try:
        is_background_year = int(year) < 2023
    except ValueError:
        return 1.0
    if is_background_year:
        return 0.01
    return 1.0

def determine_color(year):
    """Return the line color for a year label.

    2023 maps to 'red' and 2024 to 'black'; every other integer year maps to
    'gray'. A label that ``int()`` rejects with ValueError maps to 'black'.
    """
    try:
        numeric_year = int(year)
    except ValueError:
        return 'black'
    if numeric_year == 2023:
        return 'red'
    if numeric_year == 2024:
        return 'black'
    return 'gray'

def determine_strokewidth(year):
    """Return the line stroke width for a year label.

    Years before 2023 get width 1; 2023 and later get width 4. A label that
    ``int()`` rejects with ValueError also gets width 4.
    """
    try:
        return 1 if int(year) < 2023 else 4
    except ValueError:
        return 4

# Fetch and prepare the data, then derive the per-year styling columns
df_at = fetch_and_prep_data()
df_all = df_at.copy()
df_all['Opacity'] = df_all['Year'].apply(determine_opacity)
df_all['Color'] = df_all['Year'].apply(determine_color)
df_all['Width'] = df_all['Year'].apply(determine_strokewidth)

# Ensure 'Day' is correctly interpreted as a quantitative variable
df_all['Day'] = pd.to_numeric(df_all['Day'], errors='coerce')

# Keep only days 1-365 so every year spans the same x-axis range
df_filtered = df_all[df_all['Day'] <= 365]

# Create last day in month labels
last_day_in_month_labels = get_last_day_in_month_labels()

# Extract the keys and values for tick marks and labels
tick_values = list(last_day_in_month_labels.keys())
tick_labels = list(last_day_in_month_labels.values())

# Vega expression that maps each month-end tick value to its month abbreviation:
# a chain of "datum.value == <day> ? '<Mon>' : ..." ending in the empty string.
month_label_expr = " : ".join(
    f"datum.value == {tick} ? '{label}'"
    for tick, label in zip(tick_values, tick_labels)
) + " : ''"

# Plotting the main data using Altair with the existing Color and Opacity columns
line_chart = alt.Chart(df_filtered).mark_line().encode(
    x=alt.X(
        'Day:Q',
        title='Month',
        scale=alt.Scale(domain=(0, 365), clamp=True),
        axis=alt.Axis(
            labels=True,
            tickCount=12,
            values=tick_values,
            labelExpr=month_label_expr,
            labelOffset=-30,  # Shift the x-axis labels to the left by 30 units
        ),
    ),
    y=alt.Y(
        'Temperature:Q',
        title='Temperature (C)',
        scale=alt.Scale(domain=(11, 18), clamp=True),
    ),
    color=alt.Color('Color:N', legend=None, scale=None),  # Use the "Color" column for line colors
    opacity=alt.Opacity('Opacity:Q', legend=None),  # Use the "Opacity" column
    detail=alt.Detail('Year:N'),  # Add detail encoding for Year, otherwise you get vertical lines
    # legend=None belongs inside the channel; the original closed StrokeWidth(...)
    # too early, leaving an unmatched parenthesis (SyntaxError).
    strokeWidth=alt.StrokeWidth('Width:N', legend=None),  # Use the "Width" column
).properties(
    width=800,
    height=600
)

line_chart

Solution

  • This can be done without storing plotting information in the dataframe by using conditions. Altair also has powerful built-in functionality for processing dates, so there's no need to manually assign the labels or shift dates for leap years.

    # Function to fetch and prepare the data
    def fetch_and_prep_data(
        url="https://climatereanalyzer.org/clim/t2_daily/json/era5_world_t2_day.json",
    ):
        """Download the daily temperature series and return them as one long DataFrame.

        The payload is expected to be a JSON list of objects, each with a "name"
        (the series label, used as the year) and a "data" list of daily values.

        Args:
            url: Location of the JSON payload. Defaults to the Climate Reanalyzer
                ERA5 world 2 m temperature feed.

        Returns:
            A DataFrame with one row per series and day, with columns "Year"
            (the series name), "Day" (1-based position within the series) and
            "Temperature" (missing values stored as NaN). Only series whose name
            is at most four characters long are kept.
        """
        # Imported locally from the standard library: this snippet never imports
        # ``requests``, so the previous ``requests.get`` call raised NameError.
        import json
        from urllib.request import urlopen

        with urlopen(url, timeout=30) as response:
            data = json.load(response)

        frames = []
        for year_data in data:
            # JSON null marks a missing reading; store it as NaN so the column stays numeric.
            temperatures = [
                float("nan") if temp is None else temp for temp in year_data["data"]
            ]
            frames.append(
                pd.DataFrame(
                    {
                        "Year": [year_data["name"]] * len(temperatures),
                        "Day": list(range(1, len(temperatures) + 1)),
                        "Temperature": temperatures,
                    }
                )
            )

        # pd.concat raises on an empty list, so return an empty frame with the same columns.
        if not frames:
            return pd.DataFrame(columns=["Year", "Day", "Temperature"])

        df_at = pd.concat(frames)

        # Keep only series whose name is at most four characters long
        # (presumably this drops non-year series such as multi-year averages).
        df_at = df_at[df_at["Year"].str.len() <= 4]

        return df_at
    
    
    # Fetch and prepare the data
    df_at = fetch_and_prep_data()
    df_all = df_at.copy()
    
    # Ensure 'Day' is correctly interpreted as a quantitative variable
    df_all["Day"] = pd.to_numeric(df_all["Day"], errors="coerce")
    # Build a real date from the year and the day-of-year (%j) so Altair can
    # place the points and label the axis with its own date handling.
    df_all["Date"] = pd.to_datetime(
        df_all["Year"] + "-" + df_all["Day"].astype(str), format="%Y-%j"
    )
    
    # Plotting the main data using Altair
    line_chart = (
        alt.Chart(df_all)
        .mark_line()
        .encode(
            # The "monthdate" time unit keeps only month and day, so every year
            # is drawn over the same January-December axis.
            x=alt.X("Date:T", title="Date", timeUnit="monthdate").axis(format="%b"),
            y=alt.Y(
                "Temperature:Q",
                title="Temperature (C)",
                scale=alt.Scale(domain=(11, 18), clamp=True),
            ),
            # Years before 2023 are drawn in a fixed gray; only 2023 and 2024 go
            # through the color scale, so only they appear in the legend.
            color=alt.condition(
                alt.datum.Year < 2023,
                alt.value("gray"),
                alt.Color("Year:N").scale(
                    domain=["2023", "2024"], range=["black", "orange"]
                ),
            ),
            # Use constant values on both branches: the dataframe has no
            # "Opacity" column, so the previous field encoding pointed at
            # nothing.
            opacity=alt.condition(
                alt.datum.Year < 2023, alt.value(0.1), alt.value(1.0)
            ),
            strokeWidth=alt.condition(
                alt.datum.Year < 2023,
                alt.value(1),
                alt.value(4),
            ),
        )
        .properties(width=800, height=600)
    )
    
    line_chart
    

    enter image description here