Upload 2 datasets, choose columns to merge on, get final merged output

Hi everyone! I’ve been stuck on this issue for some time now and would VERY MMUCH appreciate any help.

I am creating a Dash app where

  1. User can upload two datasets (csv, or excel)
  2. Two dropdowns will populate themselves with the columns from each dataset
  3. User will use the dropdowns to choose what the matching key will be from each dataset
  4. Merge the two datasets based on merging key
  5. Get the final merged dataframe

I’m having trouble with step 4 and 5. I’m not sure how to go about with creating a function and a callback by referring to the uploaded datasets. This has stumped me for soo long!

My current code:

import json
import io
import base64

!pip install numpy==1.21.6

import numpy as np
import pandas as pd
import dash
import dash_bootstrap_components as dbc
from dash import dash_table, dcc, html
from dash.dependencies import Input, Output, State

!pip install plotly==5.9.0

!pip install xarray==0.20.2

#define Dash app, reference Dash Bootstrap as external stylsheet
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

layout = html.Div([
html.H5(“Upload Files”),
dcc.Upload(
id=‘upload-data-1’,
children=html.Div([
'File 1: Drag and Drop or ',
html.A(‘Select Files’)
]),
style={
‘width’: ‘80%’,
‘height’: ‘60px’,
‘lineHeight’: ‘60px’,
‘borderWidth’: ‘1px’,
‘borderStyle’: ‘dashed’,
‘borderRadius’: ‘5px’,
‘textAlign’: ‘left’,
‘margin’: ‘10px’
},
multiple=False),

dcc.Upload(
    id='upload-data-2',
    children=html.Div([
        'File 2: Drag and Drop or ',
        html.A('Select Files')
    ]),
    style={
        'width': '80%',
        'height': '60px',
        'lineHeight': '60px',
        'borderWidth': '1px',
        'borderStyle': 'dashed',
        'borderRadius': '5px',
        'textAlign': 'left',
        'margin': '10px'
    },
    multiple=False),

html.Div([
    html.H5("First Dataset"),
    dcc.Dropdown(
        id = 'first_dropdown',
        options = [],
    )
]),
html.Div([
    html.H5("Second Dataset"),
    dcc.Dropdown(
        id = 'second_dropdown',
        options = [],
    )
]),
html.Br(),
html.Button('Merge datasets',
            id = 'merge-button',
            n_clicks = 0),
html.Br(), 
html.Div(id='container', children = 'Choose linking key from each dataset and merge datasets')

])

app.layout = html.Div(
[layout])

#Read dataset 1
def parse_contents_1(contents, filename):
content_type, content_string = contents.split(‘,’)

decoded = base64.b64decode(content_string)
try:
    if 'csv' in filename:
        # Assume that the user uploaded a CSV file
        df1 = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
    elif 'xls' in filename:
        # Assume that the user uploaded an excel file
        df1 = pd.read_excel(io.BytesIO(decoded))
except Exception as e:
    print(e)
    return None
return df1

#Read dataset 2
def parse_contents_2(contents, filename):
content_type, content_string = contents.split(‘,’)

decoded = base64.b64decode(content_string)
try:
    if 'csv' in filename:
        # Assume that the user uploaded a CSV file
        df2 = pd.read_csv(io.StringIO(decoded.decode('utf-8')))
    elif 'xls' in filename:
        # Assume that the user uploaded an excel file
        df2 = pd.read_excel(io.BytesIO(decoded))
except Exception as e:
    print(e)
    return None
return df2

#Dropdowns for Dataset1 and 2
@app.callback(Output(‘first_dropdown’, ‘options’),
Output(‘second_dropdown’, ‘options’),
[Input(‘upload-data-1’, ‘contents’),
Input(‘upload-data-1’, ‘filename’),
Input(‘upload-data-2’, ‘contents’),
Input(‘upload-data-2’, ‘filename’)])

def update_dropdown(contents1, filename1, contents2, filename2):
if contents1 is not None and contents2 is not None:
df1 = parse_contents_1(contents1, filename1)
columns1 = df1.columns.values.tolist()
columns1 = [ {‘label’: x, ‘value’: x} for x in columns1 ]

    df2 = parse_contents_2(contents2, filename2)
    columns2 = df2.columns.values.tolist()
    columns2 = [ {'label': x, 'value': x} for x in columns2 ]

    
    if df1 is not None and df2 is not None:
        return columns1, columns2
    else:
        return []
else:
    return []        

if name==“main”:
app.run(host=“0.0.0.0”,port=300,debug=False)

You could create a div on the main app and then update the children via a callback when the columns have been selected. Use the states of (‘upload-data-1’, ‘contents’) and (‘upload-data-2’, contents’). The trigger could be Input of either of the drop down values. Read them both as data frames and merge the values based on the selected drop down values. Then return a Datatable.