Tip on developing a regression dash App

Moritus · July 7, 2024, 7:52am

Please, I need help. I am trying to develop a Dash app for regression analysis, but it is not coming up. Can regression analysis work in a Dash app, or do I need to add any additional dependencies for it to run and display?

import dash
from dash import dcc, html, Input, Output, callback
import dash_bootstrap_components as dbc
from dash import dash_table
import plotly.express as px
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor


# Read the semicolon-delimited bank-marketing CSV into a DataFrame.
data = pd.read_csv(
    'C:\\Users\\Moritus Peters\\Documents\\Datasets\\bank-additional-full.csv',
    delimiter=';',
)

# Three-letter month codes as stored in the file -> full month names,
# listed in calendar order.
month_Matching = dict(
    jan='January',
    feb='February',
    mar='March',
    apr='April',
    may='May',
    jun='June',
    jul='July',
    aug='August',
    sep='September',
    oct='October',
    nov='November',
    dec='December',
)

# Expand the month codes and make the column an ordered categorical; the
# mapping's values are already in calendar order, so they double as the
# category list.
data['month'] = pd.Categorical(
    data['month'].replace(month_Matching),
    categories=list(month_Matching.values()),
    ordered=True,
)

# Weekdays become an ordered categorical as well (Monday first).
data['day_of_week'] = pd.Categorical(
    data['day_of_week'],
    categories=['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'],
    ordered=True,
)

# Replace the terse source column names with descriptive ones.
data = data.rename(
    columns={
        'previous': "Previous_contacts_distribution",
        'poutcome': 'Outcome_of_previous_marketing_campaign',
        'emp.var.rate': 'Employment_variation_rate',
        'cons.price.idx': 'Consumer_price_index',
        'cons.conf.idx': 'Consumer_confidence_index',
        'euribor3m': 'Euribor 3 month rate',
        'nr.employed': 'Number of employees',
        'y': 'Deposit',
    }
)


def _graph_card(title, graph_id, width):
    """Return a column holding a card with a centred heading above an empty graph.

    Args:
        title: Heading text shown at the top of the card.
        graph_id: Component id given to the ``dcc.Graph``; callbacks refer
            to the graph by this id, so it must match their ``Output`` exactly.
        width: Bootstrap column width (out of 12).
    """
    return dbc.Col(
        dbc.Card(
            dbc.CardBody([
                html.H5(title, className='text-center'),
                dcc.Graph(id=graph_id, figure={}),
            ])
        ),
        width=width,
    )


# Page layout: a month checklist on top, then three rows of graph cards.
layout = html.Div([
    html.H2('Regression Analysis'),

    dbc.Row([
        dbc.Col([
            html.Div([
                html.Label("Month Checklist", className='dropdown-label text'),
                dbc.Checklist(
                    id='month_checklist',
                    # One option per month present in the data, ordered by the
                    # categorical's category order rather than alphabetically.
                    options=[
                        {'label': str(month), 'value': month}
                        for month in sorted(
                            data['month'].unique(),
                            key=lambda x: data['month'].cat.categories.tolist().index(x),
                        )
                    ],
                    value=[],
                    inline=True,
                    className='text-center px-2'
                )
            ], className='metric-container')
        ], width=12)
    ]),

    dbc.Row([
        _graph_card('Model Performance Chart', 'Models_Performance_chart', 4),
        _graph_card('Actual vs Predicted Values (xgboost)', 'Actual_vs_Predicted', 4),
        # The id must not carry a leading space: the callback registers
        # Output('Feature_Importances', 'figure') and ids are matched exactly.
        _graph_card('Top 10 Feature Importances', 'Feature_Importances', 4),
    ]),
    dbc.Row([
        _graph_card('Model Performance Chart', 'Model_Performance', 4),
        _graph_card('Model Evaluation Results (Cross-Validation Scores)',
                    'Hyperparameter_Tuning', 8),
    ]),
    dbc.Row([
        _graph_card('Model Performance Chart', 'model_performance_test_table', 12),
    ]),
])

@callback(Output('Models_Performance_chart', 'figure'),
          Input('month_checklist', 'value'))
def model_performance_chart(selected_month):
    """Compare the regression models by their test-set R² score.

    Args:
        selected_month: Month names currently ticked in ``month_checklist``.

    Returns:
        A Plotly bar figure with one bar per model, or ``{}`` when no month
        is selected.
    """
    if not selected_month:
        return {}

    # A DataFrame is indexed with [], not called; copy the filtered rows so
    # the column assignments below never write into a view of `data`.
    filtered_data = data[data['month'].isin(selected_month)].copy()
    filtered_data.reset_index(drop=True, inplace=True)

    # Non-numeric entries become NaN and then 0. The result is assigned back
    # rather than calling fillna(inplace=True) on a column selection.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Take the target from the filtered rows: after reset_index the
    # unfiltered `data` no longer lines up with them row for row.
    # NOTE(review): 'duration' stays among the features while the target is
    # duration squared, so scores will look near-perfect - confirm the
    # intended target.
    filtered_data['Deposit'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    # No stratify=: the target is continuous, and a stratified split needs
    # class labels with at least two rows each.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tune XGBoost on min-max scaled training features.
    # NOTE: 3**5 = 243 candidates x 5 folds = 1215 fits on every callback run.
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }
    grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
    grid_search_xgb.fit(MinMaxScaler().fit_transform(X_train), y_train)
    print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
    print(f"Best XGBoost R² Score: {grid_search_xgb.best_score_}")

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        # XGBoost is evaluated with the tuned hyperparameters.
        'XGBoost': grid_search_xgb.best_estimator_,
    }

    # Fit each model behind its own scaler and score it on the held-out rows.
    model_performance = {}
    for name, model in models.items():
        pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        model_performance[name] = {
            'R2': r2_score(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
        }

    # reset_index must be called; the model names then land in an 'index'
    # column, which is renamed to 'Model' for the x axis.
    model_performance_data = (
        pd.DataFrame(model_performance).T.reset_index().rename(columns={'index': 'Model'})
    )
    return px.bar(model_performance_data, x='Model', y='R2', labels={'R2': 'R² Score'})

@callback(Output('Actual_vs_Predicted', 'figure'),
          Input('month_checklist', 'value'))
def model_performance_chart(selected_month):
    """Plot the tuned XGBoost model's residuals against its predictions.

    Args:
        selected_month: Month names currently ticked in ``month_checklist``.

    Returns:
        A Plotly scatter figure (predicted value on x, residual on y, with a
        dashed red zero line), or ``{}`` when no month is selected.
    """
    if not selected_month:
        return {}

    # A DataFrame is indexed with [], not called; copy the filtered rows so
    # the column assignments below never write into a view of `data`.
    filtered_data = data[data['month'].isin(selected_month)].copy()
    filtered_data.reset_index(drop=True, inplace=True)

    # Non-numeric entries become NaN and then 0. The result is assigned back
    # rather than calling fillna(inplace=True) on a column selection.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Take the target from the filtered rows: after reset_index the
    # unfiltered `data` no longer lines up with them row for row.
    # NOTE(review): 'duration' stays among the features while the target is
    # duration squared - confirm the intended target.
    filtered_data['Deposit'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    # No stratify=: the target is continuous, and a stratified split needs
    # class labels with at least two rows each.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # NOTE: 3**5 = 243 candidates x 5 folds = 1215 fits on every callback run.
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }
    grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
    grid_search_xgb.fit(X_train_scaled, y_train)
    print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
    print(f"Best XGBoost R² Score: {grid_search_xgb.best_score_}")

    # Residual = actual - predicted on the held-out rows.
    y_pred_xgb = grid_search_xgb.best_estimator_.predict(X_test_scaled)
    residuals = y_test - y_pred_xgb

    fig = px.scatter(x=y_pred_xgb, y=residuals,
                     labels={'x': 'Predicted Values', 'y': 'Residuals'})
    fig.add_hline(y=0, line_dash='dash', line_color='red')
    return fig






@callback(Output('Feature_Importances', 'figure'),
          Input('month_checklist', 'value'))
def model_performance_chart(selected_month):
    """Show the ten most important features of a default XGBoost regressor.

    Args:
        selected_month: Month names currently ticked in ``month_checklist``.

    Returns:
        A horizontal Plotly bar figure of the ten largest feature
        importances, or ``{}`` when no month is selected.
    """
    if not selected_month:
        return {}

    # A DataFrame is indexed with [], not called; copy the filtered rows so
    # the column assignments below never write into a view of `data`.
    filtered_data = data[data['month'].isin(selected_month)].copy()
    filtered_data.reset_index(drop=True, inplace=True)

    # Non-numeric entries become NaN and then 0. The result is assigned back
    # rather than calling fillna(inplace=True) on a column selection.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Take the target from the filtered rows: after reset_index the
    # unfiltered `data` no longer lines up with them row for row.
    # NOTE(review): 'duration' stays among the features while the target is
    # duration squared, so it will likely dominate the ranking - confirm the
    # intended target.
    filtered_data['Deposit'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    # Only the training part is needed here. No stratify=: the target is
    # continuous, and a stratified split needs class labels.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # The figure depends only on this default-parameter XGBoost model, so no
    # grid search or other models are fitted in this callback.
    xgb_model = XGBRegressor(random_state=42)
    xgb_model.fit(MinMaxScaler().fit_transform(X_train), y_train)

    # The fitted attribute is `feature_importances_` (plural); scaling keeps
    # the column order, so importances pair up with X.columns.
    feature_importance_data = pd.DataFrame({
        'Feature': X.columns,
        'Importance': xgb_model.feature_importances_,
    }).sort_values(by='Importance', ascending=False)

    return px.bar(feature_importance_data.head(10),
                  x='Importance', y='Feature', orientation='h')


# NOTE(review): the layout defines no 'model_performance_test' component;
# 'Model_Performance' is the graph under a 'Model Performance Chart' heading
# that has no callback in the code shown - confirm it is the intended target.
# The checklist selection is read from its 'value' property, and
# allow_duplicate is not passed because it is an Output option, not a
# callback() option, and this output is not shared.
@callback(Output('Model_Performance', 'figure'),
          Input('month_checklist', 'value'))
def model_performance_test(selected_month):
    """Tabulate test-set R² and MSE for every regression model.

    Args:
        selected_month: Month names currently ticked in ``month_checklist``.

    Returns:
        A Plotly figure containing one table with a row per model
        (columns ``Model``, ``R²``, ``MSE``), or ``{}`` when no month is
        selected.
    """
    # The target property is a dcc.Graph 'figure', so the table is built as a
    # Plotly table trace rather than a dash_table.DataTable component.
    import plotly.graph_objects as go

    if not selected_month:
        return {}

    # A DataFrame is indexed with [], not called; copy the filtered rows so
    # the column assignments below never write into a view of `data`.
    filtered_data = data[data['month'].isin(selected_month)].copy()
    filtered_data.reset_index(drop=True, inplace=True)

    # Non-numeric entries become NaN and then 0. The result is assigned back
    # rather than calling fillna(inplace=True) on a column selection.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Take the target from the filtered rows: after reset_index the
    # unfiltered `data` no longer lines up with them row for row.
    # NOTE(review): 'duration' stays among the features while the target is
    # duration squared - confirm the intended target.
    filtered_data['Deposit'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    # No stratify=: the target is continuous, and a stratified split needs
    # class labels with at least two rows each.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Tune XGBoost on min-max scaled training features.
    # NOTE: 3**5 = 243 candidates x 5 folds = 1215 fits on every callback run.
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }
    grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
    grid_search_xgb.fit(MinMaxScaler().fit_transform(X_train), y_train)
    print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
    print(f"Best XGBoost R² Score: {grid_search_xgb.best_score_}")

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        # XGBoost is evaluated with the tuned hyperparameters.
        'XGBoost': grid_search_xgb.best_estimator_,
    }

    # Score every model first; the table is assembled only after the loop so
    # that it contains all models rather than just the first one.
    model_performance = {}
    for name, model in models.items():
        pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        model_performance[name] = {
            'R2': r2_score(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
        }

    # go.Table takes one list per column.
    model_names = list(model_performance)
    return go.Figure(go.Table(
        header={
            'values': ['<b>Model</b>', '<b>R²</b>', '<b>MSE</b>'],
            'align': 'left',
            'fill_color': 'rgb(230, 230, 230)',
        },
        cells={
            'values': [
                model_names,
                [model_performance[name]['R2'] for name in model_names],
                [model_performance[name]['MSE'] for name in model_names],
            ],
            'align': 'left',
        },
    ))

# The checklist selection is read from its 'value' property; a checklist has
# no 'figure' property.
@callback(Output('model_performance_test_table', 'figure'),
          Input('month_checklist', 'value'))
def model_performance_test(selected_month):
    """Tabulate the 5-fold cross-validation R² scores of every model.

    Args:
        selected_month: Month names currently ticked in ``month_checklist``.

    Returns:
        A Plotly figure containing one table with a row per model
        (``Model``, ``Fold 1`` .. ``Fold 5``, ``Mean CV R² Score``), or ``{}``
        when no month is selected.
    """
    # The target property is a dcc.Graph 'figure', so the table is built as a
    # Plotly table trace rather than a dash_table.DataTable component.
    import plotly.graph_objects as go

    if not selected_month:
        return {}

    # A DataFrame is indexed with [], not called; copy the filtered rows so
    # the column assignments below never write into a view of `data`.
    filtered_data = data[data['month'].isin(selected_month)].copy()
    filtered_data.reset_index(drop=True, inplace=True)

    # Non-numeric entries become NaN and then 0. The result is assigned back
    # rather than calling fillna(inplace=True) on a column selection.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Take the target from the filtered rows: after reset_index the
    # unfiltered `data` no longer lines up with them row for row.
    # NOTE(review): 'duration' stays among the features while the target is
    # duration squared - confirm the intended target.
    filtered_data['Deposit'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    # The split is used only to tune XGBoost on the training part. No
    # stratify=: the target is continuous, and a stratified split needs
    # class labels.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE: 3**5 = 243 candidates x 5 folds = 1215 fits on every callback run.
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }
    grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
    grid_search_xgb.fit(MinMaxScaler().fit_transform(X_train), y_train)
    print(f"Best XGBoost Parameters: {grid_search_xgb.best_params_}")
    print(f"Best XGBoost R² Score: {grid_search_xgb.best_score_}")

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        # XGBoost is evaluated with the tuned hyperparameters.
        'XGBoost': grid_search_xgb.best_estimator_,
    }

    # Cross-validate every model first; the table is assembled only after the
    # loop so that it contains all models rather than just the first one.
    n_folds = 5
    cv_results = {}
    for name, model in models.items():
        pipeline = Pipeline([('scaler', MinMaxScaler()), ('model', model)])
        cv_results[name] = cross_val_score(pipeline, X, y, cv=n_folds, scoring='r2')

    # go.Table takes one list per column; rows and header both come from the
    # cross-validation results so the columns and the data agree.
    model_names = list(cv_results)
    header = (
        ['Model']
        + [f'Fold {fold + 1}' for fold in range(n_folds)]
        + ['Mean CV R² Score']
    )
    columns = (
        [model_names]
        + [[cv_results[name][fold] for name in model_names] for fold in range(n_folds)]
        + [[cv_results[name].mean() for name in model_names]]
    )
    return go.Figure(go.Table(
        header={
            'values': [f'<b>{title}</b>' for title in header],
            'align': 'left',
            'fill_color': 'rgb(230, 230, 230)',
        },
        cells={'values': columns, 'align': 'left'},
    ))
        ```
This is a multi-page Dash app.

davidharris · July 7, 2024, 10:13am

It’s mightily difficult to follow that much code formatted in that way, but one possible thing is that a Dash app needs to have as a minimum something like the following, and I can’t see any of this in your code:

from dash import Dash
app = Dash()
app.layout = ...

if __name__ == "__main__":
    app.run_server()

Moritus · July 7, 2024, 11:49am

Thanks for the correction; I will add it and run it again. Note, though, that this structure is already defined in my main.py file, which drives the other .py files of the multi-page Dash app.

Moritus · July 7, 2024, 7:49pm

I have tried adding app = Dash()
and app.layout,
but I got an error,

maybe because they are already in the main.py file.

My main challenge now is the error notification below. I have checked my code against the template format from Adam on YouTube and made sure all my ids and callbacks are linked properly, so maybe there is something I am not doing right. I need help with this too.

Attempting to connect a callback Input item to component:
“month_checklist”
but no components with that id exist in the layout.

If you are assigning callbacks to components that are
generated by other callbacks (and therefore not in the
initial layout), you can suppress this exception by setting
suppress_callback_exceptions=True.
This ID was used in the callback(s) for Output(s):
age_group_and_loan_distribution.figure
job_and_marital.figure
Features_Deposit.figure
Month_and_week_conversion.figure
Models_Performance_chart.figure
Actual_vs_Predicted.figure
Feature_Importances.figure
model_performance_test.figure
model_performance_test_table.figure

please help @davidharris , @AIMPED

AIMPED · July 7, 2024, 9:20pm

Hey @Moritus I just formatted the code for you.

Please try to create a minimal app which reproduces your error as suggested by @davidharris.

In general:
run your app with debug=True and check for errors.

If you do a lot of heavy lifting in the callbacks, you might run into timeout errors. Consider using background callbacks.

You can do a regression analysis, including models, in Dash, so this is not an issue.

Moritus · July 7, 2024, 9:28pm

Thanks. Let me see whether joining all the callbacks together, to reduce the amount of code, will work.
As for the suggestion from @davidharris: I tried it, but I got an error indication in my VS Code IDE.

let me try it again to see what was wrong

thanks for the help

Moritus · July 8, 2024, 5:27am

I have applied the feedback from @davidharris and @AIMPED, which reduced the amount of code, but the charts still do not display, even after a long loading period.

import dash
from dash import dcc, html, Input, Output, callback
import dash_bootstrap_components as dbc
from dash import dash_table
import plotly.express as px
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Load and preprocess data

# Read the semicolon-delimited bank-marketing CSV. String literals need
# straight quotes, and each backslash in the Windows path is escaped.
data = pd.read_csv('C:\\Users\\Moritus Peters\\Documents\\Datasets\\bank-additional-full.csv', delimiter=';')

# Three-letter month codes as stored in the file -> full month names.
month_Matching = {
    'jan': 'January',
    'feb': 'February',
    'mar': 'March',
    'apr': 'April',
    'may': 'May',
    'jun': 'June',
    'jul': 'July',
    'aug': 'August',
    'sep': 'September',
    'oct': 'October',
    'nov': 'November',
    'dec': 'December'
}
data['month'] = data['month'].replace(month_Matching)
# Ordered categoricals keep months and weekdays in calendar order.
data['month'] = pd.Categorical(data['month'], categories=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'], ordered=True)
data['day_of_week'] = pd.Categorical(data['day_of_week'], categories=['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun'], ordered=True)
# Replace the terse source column names with descriptive ones.
data = data.rename(columns={
    'previous': "Previous_contacts_distribution",
    'poutcome': 'Outcome_of_previous_marketing_campaign',
    'emp.var.rate': 'Employment_variation_rate',
    'cons.price.idx': 'Consumer_price_index',
    'cons.conf.idx': 'Consumer_confidence_index',
    'euribor3m': 'Euribor 3 month rate',
    'nr.employed': 'Number of employees',
    'y': 'Deposit'
})

def _card_column(title, body, width):
    """Return a column holding a card with a centred heading above `body`.

    Args:
        title: Heading text shown at the top of the card.
        body: Component placed under the heading (a graph or a container).
        width: Bootstrap column width (out of 12).
    """
    return dbc.Col(
        dbc.Card(
            dbc.CardBody([
                html.H5(title, className='text-center'),
                body,
            ])
        ),
        width=width,
    )


# Dash expects the importing module's name, i.e. __name__ (the bare
# identifier `name` is undefined).
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.LUMEN])

# Page layout: a month checklist on top, then three rows of cards.
app.layout = html.Div([
    html.H2('Regression Analysis'),

    dbc.Row([
        dbc.Col([
            html.Div([
                html.Label("Month Checklist", className='dropdown-label text'),
                dbc.Checklist(
                    id='month_checklist',
                    # One option per month present in the data, ordered by the
                    # categorical's category order rather than alphabetically.
                    options=[
                        {'label': str(month), 'value': month}
                        for month in sorted(
                            data['month'].unique(),
                            key=lambda x: data['month'].cat.categories.tolist().index(x),
                        )
                    ],
                    value=[],
                    inline=True,
                    className='text-center px-2'
                )
            ], className='metric-container')
        ], width=12)
    ]),

    dbc.Row([
        _card_column('Model Performance Chart',
                     dcc.Graph(id='Models_Performance_chart', figure={}), 4),
        _card_column('Actual vs Predicted Values (xgboost)',
                     dcc.Graph(id='Actual_vs_Predicted', figure={}), 4),
        _card_column('Top 10 Feature Importances',
                     dcc.Graph(id='Feature_Importances', figure={}), 4),
    ]),
    dbc.Row([
        _card_column('Model Performance Chart',
                     dcc.Graph(id='Model_Performance', figure={}), 4),
        _card_column('Model Evaluation Results (Cross-Validation Scores)',
                     dcc.Graph(id='Hyperparameter_Tuning', figure={}), 8),
    ]),
    dbc.Row([
        # A plain container: the callback fills its 'children'.
        _card_column('Model Performance Chart',
                     html.Div(id='model_performance_test_table'), 12),
    ]),
])

developing the callback

@callback(
    [
        Output('Models_Performance_chart', 'figure'),
        Output('Actual_vs_Predicted', 'figure'),
        Output('Feature_Importances', 'figure'),
        Output('Hyperparameter_Tuning', 'figure'),
        Output('model_performance_test_table', 'children'),
    ],
    Input('month_checklist', 'value'),
)
def model_performance_chart(selected_month):
    """Train and compare regression models on the rows of the selected months.

    Filters the module-level ``data`` frame to ``selected_month``, engineers
    features, tunes an XGBoost regressor with a grid search, then scores every
    model with 10-fold cross-validation and on a 20% hold-out split.

    Args:
        selected_month: Value of the ``month_checklist`` component, i.e. the
            month labels whose rows should be kept.

    Returns:
        A 5-tuple matching the declared outputs, in order: the per-model R²
        bar chart, the actual-vs-predicted scatter for the tuned XGBoost
        model, the top-10 feature-importance bar chart, the best grid-search
        score bar chart, and a ``DataTable`` of per-fold cross-validation
        scores. Empty figures and an empty children list are returned when
        nothing is selected or no rows match.
    """
    # One value per declared Output; the last slot is `children`, so it gets
    # an empty list rather than an empty figure dict.
    empty_outputs = ({}, {}, {}, {}, [])
    if not selected_month:
        return empty_outputs

    filtered_data = data[data['month'].isin(selected_month)].reset_index(drop=True)
    if filtered_data.empty:
        return empty_outputs

    # Encode the target labels as integers.
    filtered_data['Deposit'] = LabelEncoder().fit_transform(filtered_data['Deposit'])

    # Coerce to numeric and replace unparseable values with 0. Assigning the
    # result back avoids chained `fillna(inplace=True)` on a column selection.
    # NOTE(review): if 'Outcome_of_previous_marketing_campaign' holds text
    # categories, coercion turns every value into NaN and then 0, leaving a
    # constant column -- confirm this is intended.
    for column in ('Outcome_of_previous_marketing_campaign', 'duration'):
        filtered_data[column] = pd.to_numeric(filtered_data[column], errors='coerce').fillna(0)

    filtered_data['Outcome_of_previous_marketing_campaign_squared'] = (
        filtered_data['Outcome_of_previous_marketing_campaign'] ** 2
    )
    # Stored as its own feature: the original assigned this to 'Deposit',
    # which overwrote the encoded target with a function of a predictor.
    filtered_data['duration_squared'] = filtered_data['duration'] ** 2

    filtered_data = pd.get_dummies(
        filtered_data,
        columns=['job', 'marital', 'education', 'contact', 'month', 'day_of_week'],
    )

    # The second get_dummies pass encodes any remaining non-numeric columns.
    X = pd.get_dummies(filtered_data.drop(columns=['Deposit']), drop_first=True)
    y = filtered_data['Deposit']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Hyperparameter grid for XGBoost.
    # NOTE: 3**5 = 243 candidates x 5 folds are fitted on every checklist
    # change, which makes this callback slow to respond.
    xgb_params = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
    }
    grid_search_xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='r2')
    grid_search_xgb.fit(X_train_scaled, y_train)

    tuned_xgb = grid_search_xgb.best_estimator_
    best_params_xgb = grid_search_xgb.best_params_
    best_score_xgb = grid_search_xgb.best_score_

    # Hold-out predictions of the tuned model, used for the scatter plot.
    y_pred_xgb = tuned_xgb.predict(X_test_scaled)

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Ridge Regression': Ridge(),
        'Lasso Regression': Lasso(),
        # A fresh estimator with the tuned parameters: Pipeline.fit refits its
        # steps in place, so passing `tuned_xgb` itself would overwrite the
        # fitted model whose predictions and importances are plotted below.
        'XGBoost': XGBRegressor(random_state=42, **best_params_xgb),
    }

    cv_results = {}
    model_performance = {}
    for name, model in models.items():
        pipeline = Pipeline([('scaler', StandardScaler()), (name, model)])
        cv_results[name] = cross_val_score(pipeline, X_train, y_train, cv=10, scoring='r2')
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        model_performance[name] = {
            'R2': r2_score(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
        }

    # Bar chart of hold-out R² per model.
    model_performance_data = pd.DataFrame(model_performance).T.reset_index()
    fig_models_performance = px.bar(
        model_performance_data,
        x='index',
        y='R2',
        labels={'index': 'Model', 'R2': 'R² Score'},
        title='Model Performance',
    )

    # Scatter of predicted vs actual values for the tuned XGBoost model.
    fig_actual_vs_predicted = px.scatter(
        x=y_pred_xgb,
        y=y_test,
        labels={'x': 'Predicted Values', 'y': 'Actual Values'},
        title='Actual vs Predicted Values',
    )

    # Ten largest feature importances of the tuned XGBoost model.
    feature_importance_data = (
        pd.DataFrame({'Feature': X.columns, 'Importance': tuned_xgb.feature_importances_})
        .sort_values(by='Importance', ascending=False)
        .head(10)
    )
    fig_feature_importances = px.bar(
        feature_importance_data,
        x='Importance',
        y='Feature',
        orientation='h',
        title='Top 10 Feature Importances',
    )

    # Best mean cross-validated R² found by the grid search.
    hyperparam_table_data = pd.DataFrame({
        'Model': ['XGBoost'],
        'Best CV R² Score': [best_score_xgb],
    })
    fig_hyperparameter_tuning = px.bar(
        hyperparam_table_data,
        x='Model',
        y='Best CV R² Score',
        title='Hyperparameter Tuning Results',
    )

    # One column per model, one row per cross-validation fold.
    cv_table_data = pd.DataFrame(cv_results)
    table_model_performance_test = dash_table.DataTable(
        columns=[{'name': col, 'id': col} for col in cv_table_data.columns],
        data=cv_table_data.to_dict('records'),
        style_table={'overflowX': 'auto'},
        style_cell={'textAlign': 'left', 'padding': '5px'},
        style_header={
            'backgroundColor': 'rgb(230, 230, 230)',
            'fontWeight': 'bold',
        },
    )

    # Five values, one per declared Output (the table was previously omitted).
    return (
        fig_models_performance,
        fig_actual_vs_predicted,
        fig_feature_importances,
        fig_hyperparameter_tuning,
        table_model_performance_test,
    )

# Start the development server only when this file is executed as a script.
if __name__ == '__main__':
    app.run_server(debug=True, port=6020)

Moritus · July 11, 2024, 7:10pm

Thanks, I reduced the amount of code and it worked, but the ID issues still persist. I think I have to use the model way of multiple dashboards. Lastly, is there any way I can develop my metrics card to be interactive?

AIMPED · July 12, 2024, 7:06am

Sorry, I can’t follow your thoughts.

Moritus · July 15, 2024, 7:05am

OK, sorry about that. I want to learn how to build an interactive metric card with icons. Any idea on how to go about it?

I am still experiencing the ID issue I complained about, even though I have checked all my IDs to verify that they are properly linked to the layout.

Topic		Replies	Views
Error ID not found in the layout and others Dash Python question	6	69	July 8, 2024
Show and Tell - Community Thread :tada: Dash Python show-and-tell	87	48576	June 7, 2023
Unable to replicate a simple chart from plotly into dash. Dash not showing any warning or error signs Dash Python	5	425	February 22, 2021
Dash Online Course Final Project: Bank Churn Dash Python dash-online-course	4	400	October 23, 2023
Plotly Dash Example Apps Challenge Dash Python announcements	32	8315	May 9, 2023

Tip on developing a regression dash App

Load and preprocess data

developing the callback

Related topics