My approach for FF Week 29 was to design an application named “Budget Optimizer.”
Imagine an organization (like Wake County in North Carolina) that has numerous expenses, classified into hundreds of categories. Sometimes, these categories can be very similar or even refer to the same thing, but with slightly different names (for example, “Office Supplies” and “Stationery”).
This application acts like a smart assistant that helps find these highly similar expense categories. Its main goals are:
The code
import dash
from dash import dcc, html, Input, Output, dash_table
import dash_bootstrap_components as dbc
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
Inicializar la app Dash con tema Bootstrap
app = dash.Dash(name, external_stylesheets=[dbc.themes.UNITED])
app.title = “Budget Optimizer”
Función para cargar y procesar datos
def load_and_process_data():
# Aquí cargarías tu dataset real
# Para la demostración, se asume que ‘wake_county2_budget2023.csv’ existe.
# Si no tienes el archivo, puedes crear un DataFrame de ejemplo para probar.
try:
df = pd.read_csv(‘wake_county2_budget2023.csv’)
except FileNotFoundError:
print(“Error: ‘wake_county2_budget2023.csv’ no encontrado. Creando datos de ejemplo.”)
data = {
‘Record_Date’: pd.to_datetime([‘2023-01-01’, ‘2023-01-05’, ‘2023-01-10’, ‘2023-01-15’, ‘2023-01-20’,
‘2023-02-01’, ‘2023-02-05’, ‘2023-02-10’, ‘2023-02-15’, ‘2023-02-20’]),
‘Consolidated_Expenditure’: [
‘Office Supplies’, ‘IT Services’, ‘Consulting Fees’, ‘Travel Expenses’, ‘Training Programs’,
‘Office Furniture’, ‘Software Licenses’, ‘Legal Services’, ‘Marketing Campaigns’, ‘Employee Benefits’
],
‘Actual_Amount’: [150.00, 1200.50, 5000.00, 300.75, 800.00,
450.00, 2500.00, 7000.00, 1500.00, 10000.00]
}
df = pd.DataFrame(data)
# Añadir más datos para tener suficientes categorías y entradas para las recomendaciones
more_data = {
‘Record_Date’: pd.to_datetime([‘2023-03-01’, ‘2023-03-05’, ‘2023-03-10’, ‘2023-03-15’, ‘2023-03-20’]),
‘Consolidated_Expenditure’: [
‘Stationery’, ‘Computer Support’, ‘Advisory Services’, ‘Business Travel’, ‘Professional Development’
],
‘Actual_Amount’: [100.00, 1500.00, 6000.00, 400.00, 900.00]
}
df = pd.concat([df, pd.DataFrame(more_data)], ignore_index=True)
# Añadir algunas categorías duplicadas o similares para que TF-IDF funcione
df = pd.concat([df, pd.DataFrame({
‘Record_Date’: pd.to_datetime([‘2023-04-01’, ‘2023-04-05’]),
‘Consolidated_Expenditure’: [‘Office Supplies’, ‘IT Services’],
‘Actual_Amount’: [75.00, 900.00]
})], ignore_index=True)
df['Record_Date'] = pd.to_datetime(df['Record_Date'])
return df
Función para calcular similitudes usando TF-IDF
def calculate_similarities(categories):
# Limpiar y procesar texto
cleaned_categories = [cat.lower().replace(‘:’, ’ ‘).replace(’-', ’ ') for cat in categories]
# Crear vectores TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = vectorizer.fit_transform(cleaned_categories)
# Calcular similitud coseno
cosine_sim = cosine_similarity(tfidf_matrix)
return cosine_sim, vectorizer
Función para obtener recomendaciones
def get_recommendations(category, df, cosine_sim, categories, top_n=5):
try:
idx = categories.index(category)
except ValueError:
return pd.DataFrame()
# Obtener puntuaciones de similitud
sim_scores = list(enumerate(cosine_sim[idx]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# Obtener top N similares (excluyendo el mismo)
sim_indices = [i[0] for i in sim_scores[1:top_n+1]]
similar_categories = [categories[i] for i in sim_indices]
similarity_scores = [sim_scores[i+1][1] for i in range(top_n)]
# Crear DataFrame con información adicional
recommendations = []
for cat, score in zip(similar_categories, similarity_scores):
cat_data = df[df['Consolidated_Expenditure'] == cat]
total_amount = cat_data['Actual_Amount'].sum()
avg_amount = cat_data['Actual_Amount'].mean()
frequency = len(cat_data)
recommendations.append({
'Categoria_Similar': cat,
'Puntuacion_Similitud': round(score, 3),
'Monto_Total': round(total_amount, 2),
'Monto_Promedio': round(avg_amount, 2),
'Frecuencia': frequency
})
return pd.DataFrame(recommendations)
Cargar datos
df = load_and_process_data()
categories = df[‘Consolidated_Expenditure’].unique().tolist()
cosine_sim, vectorizer = calculate_similarities(categories)
Estilo para los encabezados de las tarjetas
CARD_HEADER_STYLE = {
‘background-color’: ‘#f8f8f8’, # Un gris muy claro para los encabezados de las tarjetas
‘color’: ‘#333333’, # Texto gris oscuro para contraste con el fondo claro
‘border-bottom’: ‘1px solid rgba(0,0,0,.125)’,
‘border-top-left-radius’: ‘calc(.25rem - 1px)’,
‘border-top-right-radius’: ‘calc(.25rem - 1px)’,
‘padding’: ‘1rem 1.25rem’
}
Layout con narrativa profesional
app.layout = dbc.Container([
# Header con hero section
dbc.Row([
dbc.Col([
html.Div([
html.H1(“Budget Optimizer”,
className=“display-4 text-white fw-bold mb-3”),
html.P(“Wake County North Carolina • Fiscal Year 2023”,
className=“lead text-white-50 mb-4”),
html.Hr(className=“border-light”),
html.P("Using Machine Learning to identify consolidation and budget optimization "
“opportunities through semantic analysis of 534 expenditure categories.”,
className=“text-white fs-5”)
], className=“p-5”, style={
# Nuevo gradiente para el encabezado principal con un toque de púrpura
‘background’: ‘linear-gradient(45deg, #4CAF50 0%, #673AB7 100%)’, # Verde a Púrpura
‘borderRadius’: ‘15px’,
‘marginBottom’: ‘30px’,
‘boxShadow’: ‘0 4px 8px rgba(0,0,0,0.2)’ # Sombra para resaltarlo
})
])
]),
# Sección de contexto y proceso
dbc.Row([
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H4("🎯 The Challenge", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("With over 132,000 expense records distributed across 534 unique categories, "
"identifying consolidation opportunities manually is virtually impossible. "
"Our Machine Learning algorithm analyzes semantic similarities to reveal "
"hidden patterns.", className="card-text")
])
], className="h-100 shadow-sm") # Añadir sombra a las tarjetas
], width=4),
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H4("🤖 The Solution", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("We use TF-IDF (Term Frequency-Inverse Document Frequency) combined with "
"cosine similarity to analyze the text of each expense category. The algorithm "
"automatically identifies related categories that could benefit from "
"consolidation or joint negotiation.", className="card-text")
])
], className="h-100 shadow-sm") # Añadir sombra a las tarjetas
], width=4),
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H4("💡 The Impact", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("Recommendations enable administrators to quickly identify "
"savings opportunities, improve procurement efficiency, and optimize "
"budget structure based on quantitative evidence.", className="card-text")
])
], className="h-100 shadow-sm") # Añadir sombra a las tarjetas
], width=4)
], className="mb-5"),
# Panel de control principal
dbc.Row([
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H5("🔍 Similarity Analysis", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.Label("Select Expenditure Category for Analysis:",
className="fw-bold mb-2"),
dcc.Dropdown(
id='category-dropdown',
options=[{'label': cat, 'value': cat} for cat in sorted(categories)],
value=categories[0] if categories else None,
className="mb-4"
),
html.Label("Number of Recommendations:", className="fw-bold mb-2"),
dbc.InputGroup([
dbc.InputGroupText("#"),
dbc.Input(
id='num-recommendations',
type="number",
min=3,
max=10,
step=1,
value=5,
className="text-center"
)
], className="mb-4"),
html.Div(id='selected-category-info')
])
], className="shadow-sm") # Añadir sombra a la tarjeta
], width=4),
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H5("📊 Identified Similar Categories", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("The algorithm has identified the following categories with highest semantic similarity. "
"Scores close to 1.0 indicate higher similarity.",
className="text-muted small mb-3"),
dash_table.DataTable(
id='recommendations-table',
columns=[
{'name': 'Similar Category', 'id': 'Categoria_Similar'},
{'name': 'Similarity', 'id': 'Puntuacion_Similitud', 'type': 'numeric', 'format': {'specifier': '.3f'}},
{'name': 'Total Amount ($)', 'id': 'Monto_Total', 'type': 'numeric', 'format': {'specifier': ',.0f'}},
{'name': 'Frecuencia', 'id': 'Frecuencia', 'type': 'numeric'}
],
style_cell={
'textAlign': 'left',
'fontSize': 12,
'fontFamily': 'Arial, sans-serif'
},
style_header={
'backgroundColor': '#e0e0e0', # Un gris un poco más oscuro para el encabezado de la tabla
'fontWeight': 'bold',
'border': '1px solid #dee2e6'
},
style_data_conditional=[
{
'if': {'filter_query': '{Puntuacion_Similitud} > 0.7'},
'backgroundColor': '#d1ecf1',
'color': 'black',
},
{
'if': {'filter_query': '{Puntuacion_Similitud} > 0.5 && {Puntuacion_Similitud} <= 0.7'},
'backgroundColor': '#d4edda',
'color': 'black',
}
],
page_size=10,
sort_action="native",
sort_by=[{'column_id': 'Monto_Total', 'direction': 'desc'}]
)
])
], className="shadow-sm") # Añadir sombra a la tarjeta
], width=8)
], className="mb-4"),
# Visualizaciones con narrativa
dbc.Row([
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H5("💰 Financial Analysis", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("Comparison of total amounts between selected category and similar ones. "
"This helps prioritize consolidation opportunities.",
className="text-muted small mb-3"),
dcc.Graph(id='amount-comparison', style={'height': '400px'})
])
], className="shadow-sm") # Añadir sombra a la tarjeta
], width=6),
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H5("📈 Similarity Distribution", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.P("Visualization of similarity scores. Higher values indicate greater "
"potential for consolidation or joint negotiation.",
className="text-muted small mb-3"),
dcc.Graph(id='similarity-distribution', style={'height': '400px'})
])
], className="shadow-sm") # Añadir sombra a la tarjeta
], width=6)
], className="mb-4"),
# Sección de insights y recomendaciones
dbc.Row([
dbc.Col([
dbc.Card([
dbc.CardHeader([
html.H5("🎯 Strategic Recommendations", className="mb-0", style={'color': CARD_HEADER_STYLE['color']})
], style=CARD_HEADER_STYLE),
dbc.CardBody([
html.Div(id='consolidation-insights')
])
], className="shadow-sm") # Añadir sombra a la tarjeta
])
], className="mb-4"),
# Footer con información técnica
dbc.Row([
dbc.Col([
dbc.Card([
dbc.CardBody([
html.Small([
"📊 Web Application developed using Python Plotly-Dash | Thanks to Raleigh Open Data Portal for Dataset",
], className="text-muted")
])
], color="light", className="shadow-sm") # Añadir sombra a la tarjeta del footer
])
])
], fluid=True, className=“px-4 py-3”, style={‘background-color’: ‘#f0f2f5’}) # Color de fondo para toda la aplicación
Callbacks con narrativa mejorada
@app.callback(
[Output(‘selected-category-info’, ‘children’),
Output(‘recommendations-table’, ‘data’),
Output(‘amount-comparison’, ‘figure’),
Output(‘similarity-distribution’, ‘figure’),
Output(‘consolidation-insights’, ‘children’)],
[Input(‘category-dropdown’, ‘value’),
Input(‘num-recommendations’, ‘value’)]
)
def update_recommendations(selected_category, num_recs):
# Manejar el caso donde selected_category es None si no hay categorías
if not selected_category:
return (
dbc.Alert(“No category selected or available data.”, color=“warning”),
, {}, {}, dbc.Alert(“No recommendations available.”, color=“warning”)
)
# Información de categoría seleccionada
selected_data = df[df['Consolidated_Expenditure'] == selected_category]
total_amount = selected_data['Actual_Amount'].sum()
avg_amount = selected_data['Actual_Amount'].mean()
frequency = len(selected_data)
category_info = dbc.Alert([
html.H6("📋 Selected Category:", className="alert-heading"),
html.P(f"💵 Total Amount: ${total_amount:,.2f}", className="mb-1"),
html.P(f"📊 Average Amount: ${avg_amount:,.2f}", className="mb-1"),
html.P(f"🔢 Frequency: {frequency} records", className="mb-0")
], color="info")
# Obtener recomendaciones
recommendations_df = get_recommendations(selected_category, df, cosine_sim, categories, num_recs)
if recommendations_df.empty:
return category_info, [], {}, {}, dbc.Alert("No recommendations found.", color="warning")
# Ordenar el DataFrame para el gráfico de montos (mayor a menor)
recommendations_df_amounts_sorted = recommendations_df.sort_values(by='Monto_Total', ascending=True)
# Gráfico de comparación de montos mejorado (barras horizontales)
fig_amounts = px.bar(
recommendations_df_amounts_sorted,
x='Monto_Total',
y='Categoria_Similar',
color='Puntuacion_Similitud',
title='Consolidation Potential by Amount',
color_continuous_scale='purples',
labels={'Monto_Total': 'Total Amount ($)', 'Puntuacion_Similitud': 'Similarity Score','Categoria_Similar': ''},
text_auto='.2s', template='seaborn'
)
fig_amounts.update_layout(
height=500,
font={'size': 10},
plot_bgcolor='rgba(0,0,0,0)',
paper_bgcolor='rgba(0,0,0,0)',
coloraxis_showscale=False,
yaxis={'categoryorder':'total ascending'}
)
fig_amounts.update_xaxes(visible=False)
fig_amounts.update_traces(hovertemplate='<b>%{y}</b><br>Amount: $%{x:,.0f}<br>Similarity: %{marker.color:.3f}')
# Ordenar el DataFrame para el gráfico de similitudes (mayor a menor)
recommendations_df_similarity_sorted = recommendations_df.sort_values(by='Puntuacion_Similitud', ascending=True)
# Gráfico de distribución de similitudes mejorado (barras horizontales)
fig_similarity = px.bar(
recommendations_df_similarity_sorted,
x='Puntuacion_Similitud',
y='Categoria_Similar',
title='Semantic Similarity Scores',
color='Puntuacion_Similitud',
color_continuous_scale='purples',
labels={'Puntuacion_Similitud': 'Similarity Score', 'Categoria_Similar': ''},
text_auto='.2f', template='seaborn'
)
fig_similarity.update_layout(
height=500,
font={'size': 10},
plot_bgcolor='rgba(0,0,0,0)',
paper_bgcolor='rgba(0,0,0,0)',
yaxis={'categoryorder':'total ascending'}
)
fig_similarity.update_xaxes(visible=False)
fig_similarity.update_traces(hovertemplate='<b>%{y}</b><br>Similarity: %{x:.3f}')
high_similarity = recommendations_df[recommendations_df['Puntuacion_Similitud'] > 0.5]
total_consolidation_potential = high_similarity['Monto_Total'].sum()
top_recommendation = recommendations_df.iloc[0]
if len(high_similarity) > 0:
alert_color = "success"
icon = "🎉"
message = "Excellent opportunities identified!"
else:
alert_color = "warning"
icon = "⚠️"
message = "Limited opportunities detected"
insights = dbc.Alert([
html.H6(f"{icon} {message}", className="alert-heading"),
html.Hr(),
html.Ul([
html.Li(f"🔍 Categories with high similarity (>0.5): {len(high_similarity)} of {len(recommendations_df)}"),
html.Li(f"💰 Consolidation potential: ${total_consolidation_potential:,.2f}"),
html.Li(f"🏆 Best match: '{top_recommendation['Categoria_Similar']}' "
f"(similarity: {top_recommendation['Puntuacion_Similitud']:.3f})"),
html.Li(f"📊 Estimated savings from consolidation: "
f"${(total_consolidation_potential * 0.05):,.2f} - ${(total_consolidation_potential * 0.15):,.2f} (5-15%)")
], className="mb-0")
], color=alert_color)
return (category_info,
recommendations_df.to_dict('records'),
fig_amounts,
fig_similarity,
insights)
if name == ‘main’:
app.run_server(debug=True)