Hello,
I am trying to develop a simple app that uses data from a user-provided csv file and plots its content, given two column names specified from a dropdown menu.
For static data this is very easy:
from dash import Dash, html, dcc, Input, Output
from dash.exceptions import PreventUpdate
import plotly.express as px
import pandas as pd

# Load the sample data once at startup; its columns feed both dropdowns.
df = pd.read_csv("samples.csv")

app = Dash(__name__)

# Two column selectors plus the target graph, stacked vertically.
x_selector = dcc.Dropdown(df.columns, id='xaxis-dropdown')
y_selector = dcc.Dropdown(df.columns, id='yaxis-dropdown')
graph = dcc.Graph(id='plot')
app.layout = html.Div([x_selector, y_selector, graph])
@app.callback(Output('plot', 'figure'),
              Input('xaxis-dropdown', 'value'),
              Input('yaxis-dropdown', 'value'),
              )
def update_graph(xvalue, yvalue):
    """Scatter the two selected columns; do nothing until both are chosen."""
    # Dropdowns start out unset — wait for the user to pick both axes.
    if None in (xvalue, yvalue):
        raise PreventUpdate
    return px.scatter(x=df[xvalue], y=df[yvalue])
# Start the development server only when executed as a script.
if __name__ == '__main__':
    app.run(debug=True)
However when adding the capability to upload custom csv data, I get a bit confused.
My first intuition is to try to use the dcc.Store component to store my data and retrieve it when needed, I produced this code:
from dash import Dash, html, dcc, Input, Output, State
from dash.exceptions import PreventUpdate
import base64
from io import StringIO
import plotly.express as px
import pandas as pd

app = Dash(__name__)

# Dashed, clickable drop-zone styling for the csv upload widget.
_UPLOAD_STYLE = {
    "border-style": "dashed",
    "border-width": "1px",
    "border-radius": "5px",
    "text-align": "center",
    "margin-top": "16px",
    "margin-bottom": "16px",
    "line-height": "60px",
    "cursor": "pointer",
}

app.layout = html.Div(
    [
        dcc.Upload(
            id="load",
            children=html.Div("Upload .csv file"),
            multiple=False,
            style=_UPLOAD_STYLE,
        ),
        # Holds the parsed csv rows in browser memory for this session.
        dcc.Store(id="storage", storage_type="memory"),
        dcc.Dropdown(id="xaxis-dropdown"),
        dcc.Dropdown(id="yaxis-dropdown"),
        dcc.Graph(id="plot"),
    ]
)
@app.callback(
    Output("plot", "figure"),
    State("storage", "data"),
    Input("xaxis-dropdown", "value"),
    Input("yaxis-dropdown", "value"),
)
def update_graph(data, xvalue, yvalue):
    """Scatter the two selected columns of the stored dataframe."""
    # Wait until the user has picked both axes.
    if None in (xvalue, yvalue):
        raise PreventUpdate
    # Rebuild the frame from the list-of-records kept in dcc.Store.
    frame = pd.DataFrame(data)
    return px.scatter(x=frame[xvalue], y=frame[yvalue])
@app.callback(Output("storage", "data"), Input("load", "contents"))
def store_data(contents):
    """Decode an uploaded csv file and cache its rows in the dcc.Store.

    `contents` is a data-URL string ("data:<mime>;base64,<payload>"),
    or None before any upload has happened.
    """
    if contents is None:
        raise PreventUpdate
    # Split only on the FIRST comma: everything after it is the base64
    # payload. maxsplit=1 makes that intent explicit and robust.
    _, payload = contents.split(",", 1)
    decoded = base64.b64decode(payload).decode("utf-8")
    df = pd.read_csv(StringIO(decoded))
    # NOTE(review): to_dict("records") on a 200k x 30 frame is the slow
    # part the author observed — every value becomes a Python object.
    return df.to_dict("records")
@app.callback(
    Output("xaxis-dropdown", "options"),
    Output("yaxis-dropdown", "options"),
    Output("xaxis-dropdown", "value"),
    Output("yaxis-dropdown", "value"),
    Input("storage", "data"),
)
def update_dropdown_options(data):
    """Refresh both dropdowns from the stored data and clear selections.

    Returns (x options, y options, None, None) so that stale column
    choices from a previous file never survive a new upload.
    """
    # On app start the store fires with data=None; keep the dropdowns
    # untouched instead of building options from an empty frame. This
    # matches the PreventUpdate guards used by the other callbacks.
    if data is None:
        raise PreventUpdate
    # Convert the pandas Index to a plain list so it serializes cleanly.
    columns = list(pd.DataFrame(data).columns)
    return columns, columns, None, None
# Run the development server only when invoked directly.
if __name__ == "__main__":
    app.run(debug=True)
I feed the data into a Store object, and every callback now uses the State("storage", "data")
argument to retrieve it. Since this data is in dictionary form rather than DataFrame form, I rebuild the DataFrame at the beginning of every callback. The logic seems to work fine, but when trying to upload a moderately large dataframe (200k rows, 30 columns) this gets very slow.
I guess the way to manage this properly would be to use memoization, but I’m struggling to use it properly given the two examples in the docs:
- The first example memoizes a callback Output, whereas I’m trying to memoize a dataframe resulting from a function call
- The second example memoizes a dataframe resulting from a function, but does not take a dcc.Store object Input
How could I combine the two to improve the performance of my app?
Cheers,
Nicolas