Hi! I have been trying to store a large dataset (around 900,000 — 9 lakh — data points) in dcc.Store in my multi-page app. I tried using ServersideOutput to store the data and then generate a data summary from it.
Here is what my app instantiation looks like:
import dash
import dash_bootstrap_components as dbc
import utils.path_config as path_config
from dash_extensions.enrich import Output, DashProxy, Input, MultiplexerTransform,State,Dash,ServersideOutputTransform
# Bootstrap DARKLY theme applied app-wide.
external_stylesheets = [dbc.themes.DARKLY]
# DashProxy (dash-extensions) instead of dash.Dash so the transforms work:
# - MultiplexerTransform: lets multiple callbacks target the same Output
# - ServersideOutputTransform: lets ServersideOutput cache data server-side
#   (presumably so only a reference, not the data, reaches the browser)
app = DashProxy(__name__,external_stylesheets=external_stylesheets,
suppress_callback_exceptions=True,assets_folder=path_config.ASSET_DIR,transforms=[MultiplexerTransform(),ServersideOutputTransform()],prevent_initial_callbacks=True)
# Underlying Flask/WSGI server handle (e.g. for gunicorn deployment).
server = app.server
I am trying to upload the dataset and store it using dcc.Store as follows:
# Top-level layout: the dcc.Store components live here so they persist
# across page navigation (dcc.Location swaps only 'page-content').
index_page = html.Div(children=
[
# storage_type="memory": data lives only for the current page session.
dcc.Store(id='stored-data',storage_type="memory"),
dcc.Store(id='training-data',storage_type='memory'),
dcc.Store(id='intermediate-data',storage_type='memory'),
dcc.Store(id='processed-data',storage_type='memory'),
dcc.Store(id='model-data',storage_type='memory'),
# URL tracker for multi-page routing; refresh=False keeps it client-side.
dcc.Location(id='url',refresh=False),
# A routing callback (not shown here) renders the active page into this Div.
html.Div(id='page-content',style={'margin':'0px'}),
])
app.layout = index_page
# validation_layout enumerates every page's components so callback
# validation does not reject ids that only exist on other pages.
app.validation_layout = html.Div([
index_page,
home_page(),
data_load_layout(),
upload_page()
])
# file upload callback
@app.callback(ServersideOutput('stored-data','data'),
              Input('upload-data', 'contents'),
              [State('upload-data', 'filename'),
               State('upload-data', 'last_modified')])
def update_output(list_of_contents, list_of_names, list_of_dates):
    """Parse the uploaded file and cache it server-side in 'stored-data'.

    With ServersideOutput the returned DataFrame is cached on the server,
    so the large frame is not JSON-serialized to the browser.

    Args:
        list_of_contents: upload contents from dcc.Upload (None before any upload).
        list_of_names: uploaded filename(s).
        list_of_dates: last-modified timestamp(s).

    Returns:
        pd.DataFrame: the parsed upload.

    Raises:
        PreventUpdate: when nothing has been uploaded yet. The original
            code fell through and implicitly returned None, which would
            overwrite the store with None and needlessly fire every
            downstream callback listening on 'stored-data'.
    """
    logging.info("upload contents")
    if list_of_contents is None:
        # No file yet -- leave the store untouched.
        raise PreventUpdate
    data = f_util.parse_contents(list_of_contents, list_of_names, list_of_dates)
    return pd.DataFrame(data)
##display data summary
@app.callback(Output('data-summary','children'),
              [Input('upload_data_btn','n_clicks'),Input('stored-data','data')])
def output_data_summary(n_clicks, data):
    """Render a summary of the dataset held in 'stored-data'.

    Because 'stored-data' is written through a ServersideOutput, `data`
    arrives here as the cached object itself (a pandas DataFrame), not as
    a JSON structure. The original `pd.DataFrame(data)` therefore copied
    all rows (~900k) on every trigger; skip that copy when the object is
    already a DataFrame.

    Raises:
        PreventUpdate: until both the button has been clicked and data exists.
    """
    if data is None or n_clicks is None:
        raise PreventUpdate
    # Only convert when a plain structure (e.g. list of records) was stored.
    df = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
    return f_util.uploaded_data_info(df=df)
In case you want to see the summary function used by uploaded_data_info(df), here it is:
def data_summary(df):
    """Build a per-column summary table for *df*.

    Args:
        df (pd.DataFrame): the dataset to summarize.

    Returns:
        pd.DataFrame: one row per column of *df*, with the column name,
        unique-value count, numerical/nominal flag, and min-max range.
    """
    columns = list(df.columns)
    return pd.DataFrame({
        'Column Name': columns,
        'Number of unique values': [df[c].nunique() for c in columns],
        "Numerical or nominal": ["Numerical" if is_numeric_dtype(df[c])
                                 else "Nominal" for c in columns],
        # Use the vectorized Series.min()/.max() instead of the Python
        # builtins min()/max(), which iterate a Series element-by-element
        # at Python speed -- a major hot spot on a ~900k-row frame.
        "Range of values": [f"{df[c].min()}-{df[c].max()}" for c in columns],
    })
On running the dash app through command prompt and trying to upload data, this is what happens:
and it goes on as this loop of strings, with the desired output showing up in the app only after around 15 minutes or more.
Can someone explain what is happening and if there is a solution to cut down on the time?
Thanks in advance!
