Allowing Duplicate comparisons: half way there

shrykull · December 18, 2025, 7:22pm

I have some spreadsheet files that I load into dataframes, and these files have some columns with the same name. I don’t want to remove the duplicates, In fact, I want to created a way to filter out only the duplicates for comparisons, but when it comes to the plotting part, only the first pair of duplicates show up.

import pandas as pd
import os
import glob
import dash
from dash import Dash, dcc, html, Output, Input, State, callback, ALL, MATCH, Patch
from dash.exceptions import PreventUpdate
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.io import to_html
import numpy as np
import os
from os import listdir
import glob
import json

path3 = './static/mod/'
cikfils = os.path.join(path3, '*.csv')
osfiles=[f for f in os.listdir(path3)]
files = glob.glob(cikfils)

modified_files = list()
for cik_file in files:
    modified_files.append(cik_file)
dfs = [ ]
for file_name in modified_files:
    d = pd.read_csv(file_name)
    dfs.append(d)
df = pd.concat(dfs, ignore_index=False, axis=1)

total = int(len(files))
min_number=0
max_number=total

traces = []
buttonpairs = []
updatemenu = []
dfnames_num = []


## NUMBERED DFS TO HAVE FOR DATAFRAME NAMES (df1, df2, ...)
for number in range(min_number, max_number):
    name = f'df{number}'
    dfnames_num.append(name)
zip_object = zip(dfnames_num, files)

### DUPE FUNCTION
def getDupes(df):
    dupeNames = list()
    for x in range(df.shape[1]):
        col = df.columns[x]
        for y in range(x + 1, df.shape[1]):
            otherCol = df.columns[y]
            if col == otherCol:
                dupeNames.append(df.columns.values[y])
    return list(dupeNames)
if __name__ == "__main__":
    modified_files = list()
    for cik_file in files:
        modified_files.append(cik_file)
    dfs = [ ]
    for file_name in modified_files:
        d0 = pd.read_csv(file_name)
        d0 = d0[d0.columns.difference(['cik', 'annual'])]
        dfs.append(d0)
    df0 = pd.concat(dfs, ignore_index=False, axis=1)
    dupes = getDupes(df0)
    print("Result from getDupes(df) function: THIS PRINTS CORRECTLY, ALL THE DUPLICATES THAT EXIST: ",dupes)
    
    ## This only plots the first group of duplicates...
    for dupecol in dupes:
        for name, file in zip_object:
            d = {name : pd.read_csv(file, low_memory=False)}
            dfp = pd.concat(d, ignore_index=False, axis=0)
            import polars as pl
            dfl = pl.DataFrame(dfp)
            df = pl.concat([dfl], how="align")
            traces.append(go.Scatter(x=df['filing_date'],y=df[dupecol],visible=True,name=dupecol))
            buttonpairs.append(dict(method='restyle',label=dupecol,visible=True,
                args=[{'visible':True},[i for i,x in enumerate(traces) if x.name == dupecol]],
                args2=[{'visible':'legendonly'},[i for i,x in enumerate(traces) if x.name == dupecol]]))
updatemenu.append(dict())
updatemenu[0]['buttons'] = buttonpairs
updatemenu[0]['showactive'] = True
updatemenu[0]['xanchor'] = 'left'
updatemenu[0]['direction'] = 'right'
updatemenu[0]['type'] = 'buttons'

fig = go.Figure(data=traces)
fig.update_layout(showlegend=True, updatemenus=updatemenu, height=900, width=2800)
fig.show()

Topic		Replies	Views
Duplicate plotly bar stack 📊 Plotly Python	0	369	June 12, 2023
Remove Duplicates from the result set (Each line Item) Dash Python	0	572	February 18, 2021
Dash App call back append new entry in data frame and replace if matches with any previous entry Dash Python	1	386	May 21, 2020
How do I make data display as soon as I finish uploading a file, currently if I have two uploads in my page. the data for both the uploads are displayed after i finish the second upload Dash Python	0	490	September 15, 2020
How to plot multiple .csv files in a single graph on a dashboard Dash Python question	15	5911	October 6, 2022

Allowing Duplicate comparisons: half way there

Related topics