I have some spreadsheet files that I load into dataframes, and these files have some columns with the same name. I don’t want to remove the duplicates, In fact, I want to created a way to filter out only the duplicates for comparisons, but when it comes to the plotting part, only the first pair of duplicates show up.
import pandas as pd
import os
import glob
import dash
from dash import Dash, dcc, html, Output, Input, State, callback, ALL, MATCH, Patch
from dash.exceptions import PreventUpdate
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.io import to_html
import numpy as np
import os
from os import listdir
import glob
import json
path3 = './static/mod/'
cikfils = os.path.join(path3, '*.csv')
osfiles=[f for f in os.listdir(path3)]
files = glob.glob(cikfils)
modified_files = list()
for cik_file in files:
modified_files.append(cik_file)
dfs = [ ]
for file_name in modified_files:
d = pd.read_csv(file_name)
dfs.append(d)
df = pd.concat(dfs, ignore_index=False, axis=1)
total = int(len(files))
min_number=0
max_number=total
traces = []
buttonpairs = []
updatemenu = []
dfnames_num = []
## NUMBERED DFS TO HAVE FOR DATAFRAME NAMES (df1, df2, ...)
for number in range(min_number, max_number):
name = f'df{number}'
dfnames_num.append(name)
zip_object = zip(dfnames_num, files)
### DUPE FUNCTION
def getDupes(df):
dupeNames = list()
for x in range(df.shape[1]):
col = df.columns[x]
for y in range(x + 1, df.shape[1]):
otherCol = df.columns[y]
if col == otherCol:
dupeNames.append(df.columns.values[y])
return list(dupeNames)
if __name__ == "__main__":
modified_files = list()
for cik_file in files:
modified_files.append(cik_file)
dfs = [ ]
for file_name in modified_files:
d0 = pd.read_csv(file_name)
d0 = d0[d0.columns.difference(['cik', 'annual'])]
dfs.append(d0)
df0 = pd.concat(dfs, ignore_index=False, axis=1)
dupes = getDupes(df0)
print("Result from getDupes(df) function: THIS PRINTS CORRECTLY, ALL THE DUPLICATES THAT EXIST: ",dupes)
## This only plots the first group of duplicates...
for dupecol in dupes:
for name, file in zip_object:
d = {name : pd.read_csv(file, low_memory=False)}
dfp = pd.concat(d, ignore_index=False, axis=0)
import polars as pl
dfl = pl.DataFrame(dfp)
df = pl.concat([dfl], how="align")
traces.append(go.Scatter(x=df['filing_date'],y=df[dupecol],visible=True,name=dupecol))
buttonpairs.append(dict(method='restyle',label=dupecol,visible=True,
args=[{'visible':True},[i for i,x in enumerate(traces) if x.name == dupecol]],
args2=[{'visible':'legendonly'},[i for i,x in enumerate(traces) if x.name == dupecol]]))
updatemenu.append(dict())
updatemenu[0]['buttons'] = buttonpairs
updatemenu[0]['showactive'] = True
updatemenu[0]['xanchor'] = 'left'
updatemenu[0]['direction'] = 'right'
updatemenu[0]['type'] = 'buttons'
fig = go.Figure(data=traces)
fig.update_layout(showlegend=True, updatemenus=updatemenu, height=900, width=2800)
fig.show()