I am trying to use the data science tool Kedro,
following this tutorial.
I followed the instructions (wrote config.yaml, nodes.py, pipeline.py, etc., exactly as in the documentation) and could run kedro run
successfully.
As the next step, I tried kedro viz.
It shows the pipelines, but I cannot see the Plotly chart.
Here is the result of the visualization. Please see the left pane: I can see Shuttle Passenger Capacity Plot,
but it is not activated and the plot does not show up.
Also, I configured conf/base/catalog.yaml
to output a JSON file for Plotly to load, but I cannot see any file in the 08_reporting
directory. Could this be the cause of the issue?
nodes.py
and pipeline.py
are located here.
nodes.py
import pandas as pd
def _is_true(x: pd.Series) -> pd.Series:
return x == "t"
def _parse_percentage(x: pd.Series) -> pd.Series:
x = x.str.replace("%", "")
x = x.astype(float) / 100
return x
def _parse_money(x: pd.Series) -> pd.Series:
x = x.str.replace("$", "").str.replace(",", "")
x = x.astype(float)
return x
def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Clean up the raw companies table.

    The ``iata_approved`` column is replaced by a boolean flag and the
    ``company_rating`` column by a float fraction. The columns are
    overwritten on the frame that was passed in.

    Args:
        companies: Raw data.

    Returns:
        The same ``companies`` frame with both columns converted.
    """
    approved_flags = _is_true(companies["iata_approved"])
    rating_fractions = _parse_percentage(companies["company_rating"])

    companies["iata_approved"] = approved_flags
    companies["company_rating"] = rating_fractions
    return companies
def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Clean up the raw shuttles table.

    ``d_check_complete`` and ``moon_clearance_complete`` are replaced by
    boolean flags and ``price`` by a float. The columns are overwritten on
    the frame that was passed in.

    Args:
        shuttles: Raw data.

    Returns:
        The same ``shuttles`` frame with the three columns converted.
    """
    # Both completion columns go through the same "t" -> True conversion.
    for flag_column in ("d_check_complete", "moon_clearance_complete"):
        shuttles[flag_column] = _is_true(shuttles[flag_column])

    shuttles["price"] = _parse_money(shuttles["price"])
    return shuttles
def create_model_input_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Join shuttles, reviews and companies into one model input table.

    Args:
        shuttles: Preprocessed data for shuttles.
        companies: Preprocessed data for companies.
        reviews: Raw data for reviews.

    Returns:
        The joined table with every row containing a missing value removed.
    """
    # Attach each review to its shuttle (shuttles.id == reviews.shuttle_id).
    shuttles_with_reviews = pd.merge(
        shuttles, reviews, left_on="id", right_on="shuttle_id"
    )
    # Attach the owning company (company_id == companies.id).
    joined = pd.merge(
        shuttles_with_reviews, companies, left_on="company_id", right_on="id"
    )
    return joined.dropna()
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
# the below function uses plotly.express
def compare_passenger_capacity(preprocessed_shuttles: pd.DataFrame):
    """Build a bar chart of the mean passenger capacity per shuttle type.

    Args:
        preprocessed_shuttles: Shuttles table containing at least the
            ``shuttle_type`` and ``passenger_capacity`` columns.

    Returns:
        The figure returned by ``plotly.express.bar`` with ``shuttle_type``
        on the x axis and the mean ``passenger_capacity`` on the y axis.
    """
    # numeric_only=True: without it, recent pandas versions raise a TypeError
    # as soon as the frame holds a non-numeric column that cannot be averaged.
    capacity_by_type = (
        preprocessed_shuttles.groupby(["shuttle_type"])
        .mean(numeric_only=True)
        .reset_index()
    )
    fig = px.bar(
        data_frame=capacity_by_type,
        x="shuttle_type",
        y="passenger_capacity",
    )
    return fig
pipeline.py
from kedro.pipeline import Pipeline, node
from kedro.pipeline.modular_pipeline import pipeline
from .nodes import create_model_input_table, preprocess_companies, preprocess_shuttles, compare_passenger_capacity
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the data processing pipeline inside a namespace.

    Four nodes are created (two preprocessing nodes, the model input table
    node and the passenger capacity plot node) and wrapped with
    ``namespace="data_processing"``. ``companies``, ``shuttles`` and
    ``reviews`` are declared as the pipeline inputs and ``model_input_table``
    as its only output.

    NOTE(review): datasets not listed in ``inputs``/``outputs`` (for example
    ``shuttle_passenger_capacity_plot``) are presumably prefixed with the
    namespace, so an un-prefixed catalog entry would not match them. Confirm
    against the Kedro modular pipeline documentation.

    Returns:
        The namespaced pipeline.
    """
    companies_node = node(
        func=preprocess_companies,
        inputs="companies",
        outputs="preprocessed_companies",
        name="preprocess_companies_node",
    )
    shuttles_node = node(
        func=preprocess_shuttles,
        inputs="shuttles",
        outputs="preprocessed_shuttles",
        name="preprocess_shuttles_node",
    )
    model_input_node = node(
        func=create_model_input_table,
        inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"],
        outputs="model_input_table",
        name="create_model_input_table_node",
    )
    capacity_plot_node = node(
        func=compare_passenger_capacity,
        inputs="preprocessed_shuttles",
        outputs="shuttle_passenger_capacity_plot",
    )

    return pipeline(
        [companies_node, shuttles_node, model_input_node, capacity_plot_node],
        namespace="data_processing",
        inputs=["companies", "shuttles", "reviews"],
        outputs="model_input_table",
    )
Reference: https://kedro.readthedocs.io/en/stable/tutorial/visualise_pipeline.html
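I passed the wrong arguments to pipeline(): with a namespace set, datasets that are not listed in inputs/outputs are most likely prefixed with the namespace, so the plot output no longer matched its catalog entry and was never saved.
After I deleted the three lines of code below, it worked.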
namespace="data_processing",
inputs=["companies", "shuttles", "reviews"],
outputs="model_input_table",
Here is the corrected code:
pipeline.py
from .nodes import create_model_input_table, preprocess_companies, preprocess_shuttles, compare_passenger_capacity
def create_pipeline(**kwargs) -> Pipeline:
    """Assemble the data processing pipeline without a namespace.

    Four nodes are created (two preprocessing nodes, the model input table
    node and the passenger capacity plot node) and passed to ``pipeline``
    with no ``namespace``, ``inputs`` or ``outputs`` arguments.

    Returns:
        The pipeline made of the four nodes.
    """
    companies_node = node(
        func=preprocess_companies,
        inputs="companies",
        outputs="preprocessed_companies",
        name="preprocess_companies_node",
    )
    shuttles_node = node(
        func=preprocess_shuttles,
        inputs="shuttles",
        outputs="preprocessed_shuttles",
        name="preprocess_shuttles_node",
    )
    model_input_node = node(
        func=create_model_input_table,
        inputs=["preprocessed_shuttles", "preprocessed_companies", "reviews"],
        outputs="model_input_table",
        name="create_model_input_table_node",
    )
    capacity_plot_node = node(
        func=compare_passenger_capacity,
        inputs="preprocessed_shuttles",
        outputs="shuttle_passenger_capacity_plot",
    )

    return pipeline(
        [companies_node, shuttles_node, model_input_node, capacity_plot_node],
    )