How to plot the empirical cumulative distribution function for a given array?
I feel like there should be a function fig.add_ecdf()
for this and it would produce a plot which would look as follows:
How to plot the empirical cumulative distribution function for a given array?
I feel like there should be a function fig.add_ecdf()
for this and it would produce a plot which would look as follows:
OK, I see that I can use line_shape='hv'
to produce a similar plot:
# Draw the ECDF as a step line, evaluated once at every distinct sample value.
support = np.unique(data)
heights = ecdf(data)(support)

fig = go.Figure()
fig.add_scatter(x=support, y=heights, line_shape='hv')
fig.show()
Where:
# Example sample: 100 draws from a binomial distribution with 15 trials, p = 0.5.
data = np.random.binomial(n=15, p=0.5, size=100)
And the ECDF function is obtained as follows:
def ecdf(x):
    """Return the empirical CDF of the sample ``x`` as a callable.

    The returned function maps ``v`` (a scalar or an array-like) to the
    fraction of sample values that are less than or equal to ``v``.

    Args:
        x: Array-like sample; it is flattened before use.

    Returns:
        A function ``result(v)`` giving the ECDF evaluated at ``v``.

    Raises:
        ValueError: If ``x`` contains no values.
    """
    # Flatten first: np.searchsorted only accepts a 1-D sorted array.
    x = np.sort(np.ravel(x))
    if x.size == 0:
        raise ValueError("ecdf requires at least one observation")

    def result(v):
        # side='right' counts every sample <= v, so ties at v are included.
        return np.searchsorted(x, v, side='right') / x.size

    return result
or via:
from statsmodels.distributions.empirical_distribution import ECDF
But, still, I'd like to make it look more like the plot above.
UPDATE:
My forceful way:
# Build the ECDF plot "by hand": solid horizontal steps, dotted vertical
# jumps, and a filled/hollow marker pair at every jump.
xs = np.unique(data)
steps = ecdf(data)(xs)
# ECDF value just to the left of each jump (0 before the first jump).
prev_steps = np.concatenate([[0], steps[:-1]])

# Length of the flat tails drawn before the first and after the last jump.
# With a single distinct value there are no gaps to average, so fall back to 1.
pad = np.diff(xs).mean() if xs.size > 1 else 1.0

# Horizontal segments: row i is (left end, right end) of the i-th flat piece.
# np.concatenate is used instead of np.insert because np.insert casts the
# inserted value to the dtype of `xs`, truncating a fractional left endpoint
# when `xs` holds integers.
xs1 = np.column_stack([np.concatenate([[xs[0] - pad], xs]),
                       np.concatenate([xs, [xs[-1] + pad]])])
steps1 = np.concatenate([[[0, 0]], np.column_stack([steps, steps])])

# Vertical segments: at each x, from the previous height up to the new height.
xs2 = np.column_stack([xs, xs])
steps2 = np.column_stack([prev_steps, steps])

fig = go.Figure()
for seg_x, seg_y in zip(xs1, steps1):
    fig.add_scatter(x=seg_x, y=seg_y, mode='lines', line_color='#367588', name='')
for seg_x, seg_y in zip(xs2, steps2):
    fig.add_scatter(x=seg_x, y=seg_y, mode='lines', line_color='#367588', line_dash='dot', name='')
# Filled marker at the value the ECDF takes at each jump ...
fig.add_scatter(x=xs, y=steps, mode='markers', marker_color='#367588', name='')
# ... and a hollow marker at the value it leaves behind.
fig.add_scatter(x=xs, y=prev_steps, mode='markers', marker_color='white', marker=dict(line_color='#367588', line_width=1), name='')
fig.layout.update(title='Empirical CDF', showlegend=False)
fig.show(renderer='svg', height=600, width=1000)
Smoothed ECDF
see Malik and Pitt (2011)
def smooth_ecdf(data):
    """Return a continuous, piecewise-linear smoothing of the ECDF of ``data``.

    Between two neighbouring distinct sample values the returned function
    rises linearly; below the smallest value it equals half the mass of that
    value, and above the largest it equals one minus half the mass of the
    largest value. (The surrounding text attributes the construction to
    Malik and Pitt, 2011.)

    Args:
        data: Array-like sample; it is flattened before use.

    Returns:
        A function ``result(v)`` that accepts a scalar or an array-like ``v``
        and returns the smoothed ECDF with the same shape as ``v``.

    Raises:
        ValueError: If ``data`` contains no values.
    """
    data = np.sort(np.ravel(data))
    if data.size == 0:
        raise ValueError("smooth_ecdf requires at least one observation")

    x = np.unique(data)
    # Probability mass at each distinct value: successive differences of the
    # ECDF evaluated at those values.
    cdf_at_x = np.searchsorted(data, x, side='right') / data.size
    w = np.diff(cdf_at_x, prepend=0)
    # One weight per region: half of the first mass, the average of each pair
    # of neighbouring masses, and half of the last mass.
    lam = 0.5 * np.concatenate([[w[0]], w[:-1] + w[1:], [w[-1]]])
    gaps = np.diff(x)

    def result(v):
        v = np.asarray(v, dtype=float)
        # Fraction of each gap [x[i], x[i+1]] already passed by v, in [0, 1].
        ramps = np.clip((v[..., np.newaxis] - x[:-1]) / gaps, 0, 1)
        # The first region always counts in full (weight lam[0]); the last
        # region never counts, so lam[-1] does not appear.
        return lam[0] + ramps @ lam[1:-1]

    return result
# Redraw the hand-built step plot and overlay the smoothed ECDF on top of it.
fig = go.Figure()
for seg_x, seg_y in zip(xs1, steps1):
    fig.add_scatter(x=seg_x, y=seg_y, mode='lines', line_color='#367588')
for seg_x, seg_y in zip(xs2, steps2):
    fig.add_scatter(x=seg_x, y=seg_y, mode='lines', line_color='#367588', line_dash='dot')
fig.add_scatter(x=xs, y=steps, mode='markers', marker_color='#367588')
fig.add_scatter(x=xs, y=np.pad(steps, 1)[:-2], mode='markers', marker_color='white', marker=dict(line_color='#367588', line_width=1))
# Build the smoother and the evaluation grid once instead of once per point.
smooth = smooth_ecdf(data)
grid = np.arange(0, 15, 0.1)
fig.add_scatter(x=grid, y=[smooth(v) for v in grid])
fig.layout.update(title='Smoothed Empirical CDF', showlegend=False)
fig.show(renderer='svg', height=600, width=1000)