A call for a generic Pandas filtering component

Hi,

Building a dashboard often consists of loading the data, filtering/transforming the data and then visualizing the data.

The loading, transforming and visualizing is handled by the amazing combination of pandas and plotly.

However, I find the filtering experience needs to be setup everytime - and hence I got the idea if one could build a generic filtering component running of the information embedded in the dataframe. You see such a filter being built/provided on the fly if utilizing applications like e.g. Spotfire.

image

I’ve provided some dummy code below for basic generic filtering. However, I’m not in the know about how to encapsule the below into a component. All ideas and thoughts are welcome.

In particular I’d like input to how one can speed up the filtering if handling larger datasets? Caching server-side? Redis? which would be best for, say, 30-50 columns and 100K to 1M rows?

from dash import Dash, dcc, html, Input, Output,State,no_update, callback, dash_table,ALL
import pandas as pd
import dash_bootstrap_components as dbc
from dash.exceptions import PreventUpdate




def createFilteringComponet(df):

    ## check df for columns
    
    filterDf = df
    options = []
    dataTypeDict = dict(filterDf.dtypes)


    # collect options for dropdown
    for col in df:
        
        anOption =  {'label': col, 'value': col}
        options.append(anOption)

    colsLabel = dbc.Label('Filter columns:')
    colsDropDown =  dcc.Dropdown(
    id='colsToFilterDropdown',
    multi=True,
    options=options
    )
 
    return [html.Div(children=[colsLabel, colsDropDown])
        ]


# populate filters
@callback(
    Output("filterContainer", "children"),
    State("dfStore", "data"),
    State("filterContainer", "children"),
#    Input("colsToDisplayChecklist", "value")
    Input("colsToFilterDropdown", "value")
)
def populateFilters(dfJson, filterContainerChildren, listOfCols):
    print('populateFilters', listOfCols,filterContainerChildren)
    if (not listOfCols) or (not dfJson):
        return []
    filterItems = []
    filterDf = pd.read_json(dfJson)
    dataTypeDict = dict(filterDf.dtypes)
       
    for col in listOfCols:
        dType = dataTypeDict[col]
        print(dType)
        item = None
        
        # keep filtering values already chosen for columns
        if len(filterContainerChildren) > 0:
            print('child1',filterContainerChildren['props'])
            for child in filterContainerChildren['props']['children']:
                print('child',child['props']['children'])
                if col == child['props']['children'][0]['props']['children']:
                    item = child


        # only add filter if not added already
        if (dType in ['int64', 'float64']) & (not item) :
            item =       dbc.ListGroupItem(
                [dbc.Label(col),
                  dcc.RangeSlider(
                      min=filterDf[col].min(),
                      max=filterDf[col].max(),
                      value=[filterDf[col].min(), filterDf[col].max()],
                        id={
                        'type': 'filter',
                        'index': col
                        }, 
                  )] 
                  ,style = {"width":"300px"}
              )

        # only add filter if not added already
        if (dType in ['object']) & (not item) :
            item =       dbc.ListGroupItem([
                dbc.Label(col),
                  dcc.Input(
                      value='',
                      type='search',
                      placeholder="E.g A or A,B",
                        id={
                        'type': 'filter',
                        'index': col
        }
                  )]
              )
        filterItems.append(item)

    return dbc.ListGroup(children= filterItems,flush=True,horizontal=True, style={'width':'100%'} )
    #return None

@callback(
    Output('dfFilteredStore', 'data'),
    [State('dfStore', 'data'),
    State("colsToFilterDropdown", "value")
    ],
    Input({'type': 'filter', 'index': ALL}, 'value')
)
def display_output(dfJson,filterCols, values):
    if  (not dfJson):
        raise PreventUpdate

    filterDf = pd.read_json(dfJson)
    dataTypeDict = dict(filterDf.dtypes)

    print(filterCols)
    if filterCols:
        i = 0
        for col in filterCols:
            dType = dataTypeDict[col]
            print(col, values[i], dType)
            if dType in ['int64', 'float64']:
                minVal = values[i][0]
                maxVal = values[i][1]
                filterDf = filterDf[(filterDf[col] >= minVal) & (filterDf[col] <= maxVal)]
            if dType in ['object']:
                str = values[i]
                #print(str.split(','))
                if str != '':
                    filterDf = filterDf[filterDf[col].isin(str.split(','))]
            
            i = i+1

    return filterDf.to_json()


app =  dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP],suppress_callback_exceptions=True)




# initialize data of lists.
data = {'COL1': [1, 2, 3,4,5,6,7,8,9,10],
    'COL2': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C','C', 'C'],
        'COL3': [200, 210, 190, 180,100,110,140,155,333,444]}
  
# Create DataFrame
test = pd.DataFrame.from_dict(data)



app.layout = dbc.Container([

    dcc.Store(id='dfStore', storage_type='session',data=test.to_json())
    ,dcc.Store(id='dfFilteredStore', storage_type='session', data=test.to_json())

    ,html.Div(children=createFilteringComponet(test),style={'width':'100%'}), 
    html.Div(id='filterContainer', children=[], style={'width':'100%'}),
    html.Div(id='tableContainer',style={'width':'100%'})

   
])


# populate filters
@callback(
    Output("tableContainer", "children"),
    Input("dfFilteredStore", "data")
)
def populateFilters(dfJson):
    if  (not dfJson):
        raise PreventUpdate
    filterDf = pd.read_json(dfJson)
    dt = dash_table.DataTable(data=filterDf.to_dict('records'), columns=[{"name": i, "id": i} 
        for i in filterDf.columns])
    return dt


if __name__ == '__main__':
    app.run_server(debug=True)

Best regards,
Johannes