trying to develop a multiple page app on data analysis and regression
- i got an error notification i have been trying to fix for day
Attempting to connect a callback Input item to component:
“month_checklist”
but no components with that id exist in the layout.
If you are assigning callbacks to components that are
generated by other callbacks (and therefore not in the
initial layout), you can suppress this exception by setting
suppress_callback_exceptions=True
.
This ID was used in the callback(s) for Output(s):
age_group_and_loan_distribution.figure
job_and_marital.figure
Features_Deposit.figure
Month_and_week_conversion.figure
Models_Performance_chart.figure
Actual_vs_Predicted.figure
Feature_Importances.figure
model_performance_test.figure
model_performance_test_table.figure
2. i filtered my dataset in order to be interactive with the drop to see if i can get more insight rgards to month dropdown.
AttributeError: module ‘pages.Overview’ has no attribute ‘layout’
Traceback (most recent call last)
File “C:\Users\Moritus Peters\Documents\Bank_Market_Cap\main.py”, line 48, in display_page
return Overview.layout
^^^^^^^^^^^^^^^
AttributeError: module ‘pages.Overview’ has no attribute ‘layout’
This is the Copy/Paste friendly version of the traceback.
Traceback (most recent call last):
File “C:\Users\Moritus Peters\Documents\Bank_Market_Cap\main.py”, line 48, in display_page
return Overview.layout
^^^^^^^^^^^^^^^
AttributeError: module ‘pages.Overview’ has no attribute ‘layout’
- the regression refuse to display i dont know if is from the capacity of my locat machine to load the regression analysis or from my code
this is my code
main.py
import dash
from dash import dcc, html, Input, Output, callback
import dash_bootstrap_components as dbc
from pages import Overview, Table, Data_Exploration, Regression
app = dash.Dash(name, external_stylesheets=[dbc.themes.LUMEN, ‘/assets/style2.css’])
Define navigation links
navbar = dbc.Navbar(
children=[
dbc.Nav(
[
dbc.NavItem(dbc.NavLink(“Overview”, href=“/”)),
dbc.NavItem(dbc.NavLink(“Dataset Details”, href=“/page-2”)),
dbc.NavItem(dbc.NavLink(“Data Exploration”, href=“/page-3”)),
dbc.NavItem(dbc.NavLink(“Regression”, href=“/page-4”)),
],
pills=True,
className=“ml-auto text-center”,
),
],
color=“light”,
dark=False,
className=“mb-4 text-center”,
)
App layout
app.layout = dbc.Container([
dbc.Row([
dbc.Col(html.H1(“Bank Marketing Campaign”, className=“dashboard-title text-center”), width=12)
]),
navbar,
dcc.Location(id=‘url’, refresh=False),
html.Div(id=‘page-content’)
], fluid=True)
Callback to update page content based on URL
@callback(Output(‘page-content’, ‘children’),
[Input(‘url’, ‘pathname’)])
def display_page(pathname):
if pathname == ‘/page-2’:
return Table.layout
elif pathname == ‘/page-3’:
return Data_Exploration.layout
elif pathname == ‘/page-4’:
return Regression.layout
else:
return Overview.layout
Run the app
if name == ‘main’:
app.run_server(debug=True, port=5050)
data exploration.py
import dash
from dash import dcc, html, Input, Output, callback
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
Load and preprocess data
data = pd.read_csv(‘C:\Users\Moritus Peters\Documents\Datasets\bank-additional-full.csv’, delimiter=‘;’)
month_Matching = {
‘jan’: ‘January’,
‘feb’: ‘February’,
‘mar’: ‘March’,
‘apr’: ‘April’,
‘may’: ‘May’,
‘jun’: ‘June’,
‘jul’: ‘July’,
‘aug’: ‘August’,
‘sep’: ‘September’,
‘oct’: ‘October’,
‘nov’: ‘November’,
‘dec’: ‘December’
}
data[‘month’] = data[‘month’].replace(month_Matching)
data[‘month’] = pd.Categorical(data[‘month’], categories=[‘January’, ‘February’, ‘March’, ‘April’, ‘May’, ‘June’, ‘July’, ‘August’, ‘September’, ‘October’, ‘November’, ‘December’], ordered=True)
data[‘day_of_week’] = pd.Categorical(data[‘day_of_week’], categories=[‘mon’, ‘tue’, ‘wed’, ‘thu’, ‘fri’, ‘sat’, ‘sun’], ordered=True)
data = data.rename(columns={
‘previous’: “Previous_contacts_distribution”,
‘poutcome’: ‘Outcome_of_previous_marketing_campaign’,
‘emp.var.rate’: ‘Employment_variation_rate’,
‘cons.price.idx’: ‘Consumer_price_index’,
‘cons.conf.idx’: ‘Consumer_confidence_index’,
‘euribor3m’: ‘Euribor 3 month rate’,
‘nr.employed’: ‘Number of employees’,
‘y’: ‘Deposit’
})
available_indicators = [‘age’, ‘job’, ‘marital’, ‘education’, ‘default’, ‘housing’, ‘loan’, ‘contact’, ‘month’, ‘day_of_week’,
‘pdays’, ‘Previous_contacts_distribution’, ‘Outcome_of_previous_marketing_campaign’,
‘Employment_variation_rate’, ‘Consumer_price_index’, ‘Consumer_confidence_index’,
‘Euribor 3 month rate’, ‘Number of employees’, ‘Deposit’]
layout = html.Div([
html.H2(‘Data Exploration’),
# Add your overview content here
dbc.Row([
dbc.Col([
html.Div([
html.Label(“Month Checklist”, className=‘dropdown-label text’),
dbc.Checklist(
id=‘month_checklist’,
options=[{‘label’: str(month), ‘value’: month} for month in sorted(data[‘month’].unique(), key=lambda x: data[‘month’].cat.categories.tolist().index(x))],
value=,
inline=True,
className=‘text-center px-2’
)
], className=‘metric-container’)
], width=12)
]),
dbc.Row([
dbc.Col(
dbc.Card(
dbc.CardBody([
html.H5(‘Conversion Rate by Age Group and Loan’, className='text-center '),
dcc.Graph(id=‘age_group_and_loan_distribution’, figure={}),
])
), width=6
),
dbc.Col(
dbc.Card([
dbc.CardHeader(“Conversion Rate by Job and Marital Status”, className=‘text-center fs-2’),
dbc.CardBody([
# html.H5(‘Conversion Rate by Job and Marital Status’, className=‘text-center’),
dcc.Graph(id=‘job_and_marital’, figure={}),
]),
dbc.CardFooter("Thia bar chart shows the conversion rate of the percentage of ‘yes’ deposits based on job type and marital status. more insight is gotten by selecting each monthor more for actionable descision making "),
]), width=6
),
]),
dbc.Row([
dbc.Col(
dbc.Card(
dbc.CardBody([
html.H5(‘Correlation of Features with Deposit’,
className=‘text-center’,),
dcc.Graph(id=‘Features_Deposit’, figure={}),
])
),width=7
),
dbc.Col([
dbc.Card(
dbc.CardBody([
html.H5(‘Conversion Rate by Month and Week’,
className=‘text-center’),
dcc.Graph(id=‘Month_and_week_conversion’, figure={}),
])
),
], width=5),
])
])
Callbacks for charts
@callback(Output(‘age_group_and_loan_distribution’, ‘figure’),
Input(‘month_checklist’, ‘value’),
suppress_callback_exceptions=True)
def update_age_group_and_loan_distribution(selected_month=None):
if not selected_month:
return {}
filtered_data = data[data[‘month’].isin(selected_month)] #
filtered_data[‘age_group’] = pd.cut(filtered_data[‘age’], bins=[0, 20, 30, 40, 50, 60, np.inf], labels=[‘<20’, ‘20-30’, ‘30-40’, ‘40-50’, ‘50-60’, ‘60+’])
conversion_rate_by_age_housing = filtered_data.groupby([‘age_group’, ‘loan’])[‘Deposit’].apply(lambda x: (x == ‘yes’).mean() * 100).unstack(fill_value=0)
fig = px.bar(conversion_rate_by_age_housing, barmode=‘group’)
fig.update_layout(legend=dict(orientation=‘h’, yanchor=‘bottom’, y=1.02, xanchor=‘right’, x=1))
return fig
@callback(Output(‘job_and_marital’, ‘figure’),
Input(‘month_checklist’, ‘value’),
suppress_callback_exceptions=True)
def update_job_and_marital(selected_month=None):
if not selected_month:
return {}
filtered_data = data[data[‘month’].isin(selected_month)]
conversion_rate_by_job_marital = filtered_data.groupby([‘job’, ‘marital’])[‘Deposit’].apply(lambda x: (x == ‘yes’).mean() * 100).unstack(fill_value=0)
fig = px.bar(conversion_rate_by_job_marital, barmode=‘group’)
fig.update_layout(legend=dict(orientation=‘h’, yanchor=‘bottom’, y=1.02, xanchor=‘right’, x=1))
return fig
@callback(Output(‘Features_Deposit’, ‘figure’),
Input(‘month_checklist’, ‘value’),
suppress_callback_exceptions=True)
def update_features_deposit(selected_month=None):
if not selected_month:
return {}
filtered_data = data[data[‘month’].isin(selected_month)]
# Encode the Deposit column
filtered_data['Deposit'] = filtered_data['Deposit'].map({'yes': 1, 'no': 0})
categorical_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'Outcome_of_previous_marketing_campaign']
# Convert categorical variables to numerical for correlation calculation
data_encoded = pd.get_dummies(filtered_data, columns=categorical_columns, drop_first=True)
# Calculate the correlation matrix
correlation_matrix = data_encoded.corr()
# Extract correlations with 'Deposit'
corr_with_deposit = correlation_matrix['Deposit'].sort_values(ascending=False)
# Create a table to display the correlation values with 'Deposit'
fig = go.Figure(data=[go.Table(
header=dict(values=['Feature', 'Correlation with Deposit'],
fill_color='paleturquoise',
align='left'),
cells=dict(values=[corr_with_deposit.index, corr_with_deposit.values],
fill_color='lavender',
align='left'))
])
return fig
@callback(Output(‘Month_and_week_conversion’, ‘figure’),
Input(‘month_checklist’, ‘value’),
suppress_callback_exceptions=True)
def update_month_week_conversion(selected_month=None):
if not selected_month:
return {}
filtered_data = data[data[‘month’].isin(selected_month)]
conversion_rate_by_month_week = filtered_data.groupby([‘month’, ‘day_of_week’])[‘Deposit’].apply(lambda x: (x == ‘yes’).mean() * 100).unstack(fill_value=0)
fig = go.Figure(data=go.Heatmap(
z=conversion_rate_by_month_week.values,
x=conversion_rate_by_month_week.columns.tolist(),
y=conversion_rate_by_month_week.index.tolist(),
colorscale=‘Viridis’))
fig.update_layout( xaxis_title=‘Weekday’, yaxis_title=‘Month’)
return fig
regression.py
import dash
from dash import dcc, html, Input, Output, callback
import dash_bootstrap_components as dbc
from dash import dash_table
import plotly.express as px
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
Load and preprocess data
Load and preprocess data
data = pd.read_csv(‘C:\Users\Moritus Peters\Documents\Datasets\bank-additional-full.csv’, delimiter=‘;’)
month_Matching = {
‘jan’: ‘January’,
‘feb’: ‘February’,
‘mar’: ‘March’,
‘apr’: ‘April’,
‘may’: ‘May’,
‘jun’: ‘June’,
‘jul’: ‘July’,
‘aug’: ‘August’,
‘sep’: ‘September’,
‘oct’: ‘October’,
‘nov’: ‘November’,
‘dec’: ‘December’
}
data[‘month’] = data[‘month’].replace(month_Matching)
data[‘month’] = pd.Categorical(data[‘month’], categories=[‘January’, ‘February’, ‘March’, ‘April’, ‘May’, ‘June’, ‘July’, ‘August’, ‘September’, ‘October’, ‘November’, ‘December’], ordered=True)
data[‘day_of_week’] = pd.Categorical(data[‘day_of_week’], categories=[‘mon’, ‘tue’, ‘wed’, ‘thu’, ‘fri’, ‘sat’, ‘sun’], ordered=True)
data = data.rename(columns={
‘previous’: “Previous_contacts_distribution”,
‘poutcome’: ‘Outcome_of_previous_marketing_campaign’,
‘emp.var.rate’: ‘Employment_variation_rate’,
‘cons.price.idx’: ‘Consumer_price_index’,
‘cons.conf.idx’: ‘Consumer_confidence_index’,
‘euribor3m’: ‘Euribor 3 month rate’,
‘nr.employed’: ‘Number of employees’,
‘y’: ‘Deposit’
})
layout = html.Div([
html.H2(‘Regression Analysis’),
dbc.Row([
dbc.Col([
html.Div([
html.Label("Month Checklist", className='dropdown-label text'),
dbc.Checklist(
id='month_checklist',
options=[{'label': str(month), 'value': month} for month in sorted(data['month'].unique(), key=lambda x: data['month'].cat.categories.tolist().index(x))],
value=[],
inline=True,
className='text-center px-2'
)
], className='metric-container')
], width=12)
]),
dbc.Row([
dbc.Col(
dbc.Card(
dbc.CardBody([
html.H5(‘Model Performance Chart’,
className=‘text-center’,),
dcc.Graph(id=‘Models_Performance_chart’, figure={}),
])
),width=4
),
dbc.Col([
dbc.Card(
dbc.CardBody([
html.H5(‘Actual vs Predicted Values (xgboost)’,
className=‘text-center’),
dcc.Graph(id=‘Actual_vs_Predicted’, figure={}),
])
),
], width=4),
dbc.Col([
dbc.Card(
dbc.CardBody([
html.H5(‘Top 10 Feature Importances’,
className=‘text-center’),
dcc.Graph(id=’ Feature_Importances’, figure={}),
])
),
], width=4),
]),
dbc.Row([
dbc.Col(
dbc.Card(
dbc.CardBody([
html.H5(‘Model Performance Chart’,
className=‘text-center’,),
dcc.Graph(id=‘Model_Performance’, figure={}),
])
),width=4
),
dbc.Col([
dbc.Card(
dbc.CardBody([
html.H5(‘Model Evaluation Results (Cross-Validation Scores)’,
className=‘text-center’),
dcc.Graph(id=‘Hyperparameter_Tuning’, figure={}),
])
),
], width=8),
]),
dbc.Row([
dbc.Col(
dbc.Card(
dbc.CardBody([
html.H5(‘Model Performance Chart’,
className=‘text-center’,),
dcc.Graph(id=‘model_performance_test_table’, figure={}),
])
),width=12
),
])
])
@callback(Output(‘Models_Performance_chart’, ‘figure’),
Input(‘month_checklist’, ‘value’))
def model_performance_chart(selected_month):
if not selected_month:
return {}
filtered_data = data[data('month')].isin(selected_month)
filtered_data.reset_index(drop=True, inplace = True)
le = LabelEncoder()
filtered_data['Deposit'] = le.fit_transform(filtered_data['Deposit'].values.ravel())
filtered_data['Outcome_of_previous_marketing_campaign'] = pd.to_numeric(filtered_data['Outcome_of_previous_marketing_campaign'],errors='coerce' )
filtered_data['duration'] = pd.to_numeric(filtered_data['duration'], errors='coerce')
filtered_data['Outcome_of_previous_marketing_campaign'].fillna(0, inplace=True)
filtered_data['duration'].fillna(0, inplace=True)
filtered_data['Outcome_of_previous_marketing_campaign_squared'] = filtered_data['Outcome_of_previous_marketing_campaign'] **2
filtered_data['Deposit'] = data['duration'] **2
filtered_data = pd.get_dummies(filtered_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'])
X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
y = filtered_data['Deposit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# using minMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {
'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'Ridge Regression': Ridge(),
'Lasso Regression': Lasso(),
'XGBoost': XGBRegressor(random_state=42)
}
# Hyperparameter tuning parameters for XGBoost
xgb_params = {
'n_estimators': [100, 200, 300],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 0.9, 1.0],
'colsample_bytree': [0.8, 0.9, 1.0],
}
# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
grid_search_xgb.fit(X_train_scaled, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_
print(f"Best XGBoost Parameters: {best_params_xgb}")
print(f"Best XGBoost R² Score: {best_score_xgb}")
# Evaluate on test set
y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost - R²: {r2_xgb}, MSE: {mse_xgb}")
cv_results = {}
model_performance = {}
for name, model in models.items():
if name == 'XGBoost':
model = grid_search_xgb.best_estimator_ # Use best XGBoost model from GridSearchCV for evaluation
pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
# Cross-validation scores
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
cv_results[name] = cv_scores
# Model performance on test set
pipeline.fit(X_train, y_train) # Fit on unscaled data
y_pred = pipeline.predict(X_test) # Predict on unscaled data
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
model_performance[name] = {'R2': r2, 'MSE': mse}
model_performance_data = pd.DataFrame(model_performance).T.reset_index.rename(columns={'index': 'Model'})
fig =px.bar(model_performance_data,
x='Model', y='R2', # Use 'R2' to match the column name
labels={'R2': 'R² Score'})
return fig
@callback(Output(‘Actual_vs_Predicted’, ‘figure’),
Input(‘month_checklist’, ‘value’))
def model_performance_chart(selected_month):
if not selected_month:
return {}
filtered_data = data[data('month').isin(selected_month)]
filtered_data.reset_index(drop=True, inplace = True)
le = LabelEncoder()
filtered_data['Deposit'] = le.fit_transform(filtered_data['Deposit'].values.ravel())
filtered_data['Outcome_of_previous_marketing_campaign'] = pd.to_numeric(filtered_data['Outcome_of_previous_marketing_campaign'],errors='coerce' )
filtered_data['duration'] = pd.to_numeric(filtered_data['duration'], errors='coerce')
filtered_data['Outcome_of_previous_marketing_campaign'].fillna(0, inplace=True)
filtered_data['duration'].fillna(0, inplace=True)
filtered_data['Outcome_of_previous_marketing_campaign_squared'] = filtered_data['Outcome_of_previous_marketing_campaign'] **2
filtered_data['Deposit'] = data['duration'] **2
filtered_data = pd.get_dummies(filtered_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'])
X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
y = filtered_data['Deposit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# using minMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {
'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'Ridge Regression': Ridge(),
'Lasso Regression': Lasso(),
'XGBoost': XGBRegressor(random_state=42)
}
# Hyperparameter tuning parameters for XGBoost
xgb_params = {
'n_estimators': [100, 200, 300],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 0.9, 1.0],
'colsample_bytree': [0.8, 0.9, 1.0],
}
# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
grid_search_xgb.fit(X_train_scaled, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_
print(f"Best XGBoost Parameters: {best_params_xgb}")
print(f"Best XGBoost R² Score: {best_score_xgb}")
# Evaluate on test set
y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
residuals = y_test - y_pred_xgb
fig = px.scatter(x=y_pred_xgb, y=residuals,
labels={'x': 'Predicted Values', 'y': 'Residuals'})
fig.add_hline(y=0, line_dash='dash',
line_color='red')
return fig
@callback(Output(‘Feature_Importances’, ‘figure’),
Input(‘month_checklist’, ‘value’))
def model_performance_chart(selected_month):
if not selected_month:
return {}
filtered_data = data.copy()
filtered_data = filtered_data[filtered_data('month').isin(selected_month)]
filtered_data.reset_index(drop=True, inplace = True)
le = LabelEncoder()
filtered_data['Deposit'] = le.fit_transform(filtered_data['Deposit'].values.ravel())
filtered_data['Outcome_of_previous_marketing_campaign'] = pd.to_numeric(filtered_data['Outcome_of_previous_marketing_campaign'],errors='coerce' )
filtered_data['duration'] = pd.to_numeric(filtered_data['duration'], errors='coerce')
filtered_data['Outcome_of_previous_marketing_campaign'].fillna(0, inplace=True)
filtered_data['duration'].fillna(0, inplace=True)
filtered_data['Outcome_of_previous_marketing_campaign_squared'] = filtered_data['Outcome_of_previous_marketing_campaign'] **2
filtered_data['Deposit'] = data['duration'] **2
filtered_data = pd.get_dummies(filtered_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'])
X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
y = filtered_data['Deposit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# using minMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {
'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'Ridge Regression': Ridge(),
'Lasso Regression': Lasso(),
'XGBoost': XGBRegressor(random_state=42)
}
# Hyperparameter tuning parameters for XGBoost
xgb_params = {
'n_estimators': [100, 200, 300],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 0.9, 1.0],
'colsample_bytree': [0.8, 0.9, 1.0],
}
# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
grid_search_xgb.fit(X_train_scaled, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_
print(f"Best XGBoost Parameters: {best_params_xgb}")
print(f"Best XGBoost R² Score: {best_score_xgb}")
# Evaluate on test set
y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost - R²: {r2_xgb}, MSE: {mse_xgb}")
cv_results = {}
model_performance = {}
for name, model in models.items():
if name == 'XGBoost':
model = grid_search_xgb.best_estimator_ # Use best XGBoost model from GridSearchCV for evaluation
pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
# Cross-validation scores
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
cv_results[name] = cv_scores
# Model performance on test set
pipeline.fit(X_train, y_train) # Fit on unscaled data
y_pred = pipeline.predict(X_test) # Predict on unscaled data
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
model_performance[name] = {'R2': r2, 'MSE': mse}
# Train Random forest for feature importance
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)
feature_importances = xgb_model.feature_importance_
features = X.columns
feature_importance_data = pd.DataFrame({'Feature': features, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)
fig =px.bar(feature_importance_data.head(10),
x='Importance',
y='Feature',
orientation='h',
)
return fig
@callback(Output(‘model_performance_test’, ‘figure’),
Input(‘month_checklist’, ‘figure’),
allow_duplicate=True)
def model_performance_test(selected_month):
if not selected_month:
return {}
filtered_data = data.copy()
filtered_data = filtered_data[filtered_data('month').isin(selected_month)]
filtered_data.reset_index(drop=True, inplace = True)
le = LabelEncoder()
filtered_data['Deposit'] = le.fit_transform(filtered_data['Deposit'].values.ravel())
filtered_data['Outcome_of_previous_marketing_campaign'] = pd.to_numeric(filtered_data['Outcome_of_previous_marketing_campaign'],errors='coerce' )
filtered_data['duration'] = pd.to_numeric(filtered_data['duration'], errors='coerce')
filtered_data['Outcome_of_previous_marketing_campaign'].fillna(0, inplace=True)
filtered_data['duration'].fillna(0, inplace=True)
filtered_data['Outcome_of_previous_marketing_campaign_squared'] = filtered_data['Outcome_of_previous_marketing_campaign'] **2
filtered_data['Deposit'] = data['duration'] **2
filtered_data = pd.get_dummies(filtered_data, columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'])
X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
y = filtered_data['Deposit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# using minMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
models = {
'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42),
'Ridge Regression': Ridge(),
'Lasso Regression': Lasso(),
'XGBoost': XGBRegressor(random_state=42)
}
# Hyperparameter tuning parameters for XGBoost
xgb_params = {
'n_estimators': [100, 200, 300],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 0.9, 1.0],
'colsample_bytree': [0.8, 0.9, 1.0],
}
# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
grid_search_xgb.fit(X_train_scaled, y_train)
best_params_xgb = grid_search_xgb.best_params_
best_score_xgb = grid_search_xgb.best_score_
print(f"Best XGBoost Parameters: {best_params_xgb}")
print(f"Best XGBoost R² Score: {best_score_xgb}")
# Evaluate on test set
y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test_scaled)
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost - R²: {r2_xgb}, MSE: {mse_xgb}")
cv_results = {}
model_performance = {}
for name, model in models.items():
if name == 'XGBoost':
model = grid_search_xgb.best_estimator_ # Use best XGBoost model from GridSearchCV for evaluation
pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
# Cross-validation scores
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
cv_results[name] = cv_scores
# Model performance on test set
pipeline.fit(X_train, y_train) # Fit on unscaled data
y_pred = pipeline.predict(X_test) # Predict on unscaled data
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
model_performance[name] = {'R2': r2, 'MSE': mse}
# Create data for the tables
cv_table_data = []
for name, scores in cv_results.items():
cv_table_data.append({
'Model': name,
'Fold 1': scores[0],
'Fold 2': scores[1],
'Fold 3': scores[2],
'Fold 4': scores[3],
'Fold 5': scores[4],
'Mean CV R² Score': scores.mean()
})
model_table_data = []
for name, metrics in model_performance.items():
model_table_data.append({
'Model': name,
'R²': metrics['R2'],
'MSE': metrics['MSE']
})
hyperparam_table_data = []
for name, result in {'XGBoost': {'best_params': best_params_xgb, 'best_score': best_score_xgb}}.items():
hyperparam_table_data.append({
'Model': name,
'Best Parameters': result['best_params'],
'Best CV R² Score': result['best_score']
})
fig = dash_table.DataTable(
columns =[{'name': i, 'id':i} for i in model_table_data[0].keys()],
data=model_table_data,
style_table={'overflowX': 'auto'},
style_cell={'textAlign': 'left', 'padding': '5px'},
style_header={
'backgroundColor': 'rgb(230, 230, 230)',
'fontWeight': 'bold'
},
),
return fig