Let's say I have the following time data for 1 month or (31 days) in January:
import os, holoviews as hv
os.environ['HV_DOC_HTML'] = 'true'
hv.extension('bokeh')
import pandas as pd
import pandas_bokeh
from pandas_bokeh import plot_bokeh
pandas_bokeh.output_notebook()
#-----------------------------------------------------------
# Libs
#-----------------------------------------------------------
#!pip install hvplot
#!pip install pandas-bokeh
#-----------------------------------------------------------
# LOAD THE DATASET
#-----------------------------------------------------------
df = pd.read_csv('azure.csv')
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.rename(columns={'min cpu': 'min_cpu',
'max cpu': 'max_cpu',
'avg cpu': 'avg_cpu',})
#df = df.set_index('timestamp')
df.head()
# Data preparation
# ==============================================================================
sliced_df = df[['timestamp', 'avg_cpu']]
# convert column to datetime object
#sliced_df['timestamp'] = pd.to_datetime(sliced_df['timestamp'], format='%Y-%m-%d %H:%M:%S')
# get the hour, day month
sliced_df['hour'] = sliced_df['timestamp'].dt.hour
sliced_df['day'] = sliced_df['timestamp'].dt.day
sliced_df['month'] = sliced_df['timestamp'].dt.month
sliced_df['year'] = sliced_df['timestamp'].dt.year
year_input=2017
month_input=1
day_input=21
# Retrive average CPU usage for hourly
df_avg = sliced_df.groupby('hour').agg({'avg_cpu': 'mean'}).reset_index()
df_21 = sliced_df[(sliced_df.year == year_input) & (sliced_df.month == month_input) & (sliced_df.day == day_input)]
df_21 = df_21.groupby('hour').agg({'avg_cpu': 'max'}).reset_index()
df_above = pd.merge(df_21, df_avg, on='hour', suffixes=('_hour','_avg'))
df_above['above'] = df_above.loc[df_above[f"avg_cpu_hour"] >= df_above["avg_cpu_avg"], f"avg_cpu_hour"]
df_above['above_value'] = df_above['avg_cpu_hour'] - df_above['avg_cpu_avg']
df_below = pd.merge(df_21, df_avg, on='hour', suffixes=('_hour','_avg'))
df_below['below'] = df_below.loc[df_below[f"avg_cpu_hour"] < df_below["avg_cpu_avg"], f"avg_cpu_hour"]
df_below['below_value'] = df_below['avg_cpu_hour'] - df_below['avg_cpu_avg']
above_count = df_above['above'].value_counts().sum()
below_count = df_below['below'].value_counts().sum()
dark_red = "#FF5555"
dark_blue = "#5588FF"
plot = sliced_df.hvplot( x="hour", y="avg_cpu", by="day", color="grey", alpha=0.02, legend=False, hover=False)
plot_avg = df_avg.hvplot( x="hour", y="avg_cpu", color="grey", legend=False)
plot_21th = df_21.hvplot( x="hour", y="avg_cpu", color="black", legend=False)
plot_above = df_above.hvplot.area(x="hour", y="avg_cpu_avg", y2="avg_cpu_hour").opts(fill_alpha=0.2, line_alpha=0.8, line_color=dark_red, fill_color=dark_red)
plot_below = df_below.hvplot.area(x="hour", y="avg_cpu_avg", y2="avg_cpu_hour").opts(fill_alpha=0.2, line_alpha=0.8, line_color=dark_blue, fill_color=dark_blue)
text_days_above = hv.Text(5, df_21["avg_cpu"].max(), f"{above_count}", fontsize=14).opts(text_align="right", text_baseline="bottom", text_color=dark_red, text_alpha=0.8)
text_days_below = hv.Text(5, df_21["avg_cpu"].max(), f"{below_count}", fontsize=14).opts(text_align="right", text_baseline="top", text_color=dark_blue, text_alpha=0.8)
text_above = hv.Text(5, df_21["avg_cpu"].max(), "DAYS ABOVE", fontsize=7).opts(text_align="left", text_baseline="bottom", text_color="lightgrey", text_alpha=0.8)
text_below = hv.Text(5, df_21["avg_cpu"].max(), "DAYS BELOW", fontsize=7).opts(text_align="left", text_baseline="above", text_color="lightgrey", text_alpha=0.8)
hv.renderer("bokeh").theme = theme
final = (
plot
* plot_21th
* plot_avg
* plot_above
* plot_below
* text_days_above
* text_days_below
* text_above
* text_below
).opts(
xlabel="hourly",
ylabel="CPU [Hz]",
title=f"{day_input}th Jan data vs AVERAGE",
gridstyle={"ygrid_line_alpha": 0},
xticks=[
(0, "00:00"),
(1, "01:00"),
(2, "02:00"),
(3, "03:00"),
(4, "04:00"),
(5, "05:00"),
(6, "06:00"),
(7, "07:00"),
(8, "08:00"),
(9, "09:00"),
(10, "10:00"),
(11, "11:00"),
(12, "12:00"),
(13, "13:00"),
(14, "14:00"),
(15, "15:00"),
(16, "16:00"),
(17, "17:00"),
(18, "18:00"),
(19, "19:00"),
(20, "20:00"),
(21, "21:00"),
(22, "22:00"),
(23, "23:00"),
],
xrotation=45,
show_grid=True,
fontscale=1.18,
)
hv.save(final, "final.html")
final
Update II:
I have tried the following code unsuccessfully inspired by the answer of @chitown88 using bokeh instead of seaborn package to get the desired output for output1 at least. My current output:
Example output for years:
As seen in the above example, I want to retrieve, reflect, and visualize my data in smaller time resolution (hourly\daily\monthly) rather than annually. I need to retrieve the number of CPU usage of column avg cpu
within dataframe observations that exceeding\below:
the hourly average for one certain day ( i.e., 21st Jan. 2024-01-21 00:00:00 till 2024-01-22 00:00:00)
- x-axis: hourly timestamp of records for 21 Jan
- y-axis: 'avg CPU' usage for 21 Jan
- Threshold line: average CPU usage for 21 Jan
The daily average for Jan.
- x-axis: daily timestamp of Jan (1th-31th)
- y-axis: 'avg CPU' usage for Jan
- Threshold line: average CPU usage for Jan month
What is the elegant way to treat my dataframe to achieve this and the find threshold for interested outpu1 and output2 accordingly?
Let's try something more concrete:
# read data
df = pd.read_csv('https://raw.githubusercontent.com/amcs1729/Predicting-cloud-CPU-usage-on-Azure-data/master/azure.csv')
# parse date time
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.normalize()
df['hour'] = df['timestamp'].dt.hour
# store the hourly average for the entire dataset
hourly_avg = df.groupby('hour')['avg_cpu'].mean()
# compute the max by hour-date
max_hour_daily = df.groupby(['date', 'hour'])['avg_cpu'].max().unstack('hour')
# compare these maxes to the hourly_avg
# convert to DataFrame for easy manipulation
labels = pd.DataFrame(np.where(max_hour_daily.ge(hourly_avg, axis=1), 'above', 'below'),
columns=max_hour_daily.columns,
index=max_hour_daily.index)
# the expected output - a dataframe
# indexed by the date, with columns being `above`, `below`
output = labels.stack().groupby(level='date').value_counts().unstack(fill_value=0)
Sample output:
# first five days
print(output.head())
Output:
above below
date
2017-01-01 13 11
2017-01-02 22 2
2017-01-03 21 3
2017-01-04 24 0
2017-01-05 16 8
Or you can get data for a specific day
# yes, string date works with `loc`
print(output.loc['2017-01-21'])
Output:
above 17
below 7
Name: 2017-01-21 00:00:00, dtype: int64
If you want the plot, it's pretty straightforward from the DataFrames/Series above:
day = '2017-01-21'
day_max_hourly = max_hour_daily.loc[day]
fig, ax = plt.subplots(figsize=(10, 6))
plt.fill_between(day_max_hourly.index, day_max_hourly, hourly_avg, where=day_max_hourly >= hourly_avg, facecolor='red', interpolate=True, alpha=0.5)
plt.fill_between(day_max_hourly.index, day_max_hourly, hourly_avg, where=day_max_hourly < hourly_avg, facecolor='blue', interpolate=True, alpha=0.5)
plt.plot(hourly_avg, label='Hourly Average', color='black')
plt.plot(day_max_hourly, label='Max Hourly', color='green')
plt.xlabel('Hour')
plt.xticks(hourly_avg.index)
plt.legend()
Output: