How to center a strip plot on a grouped bar chart

Hello. I am new to Plotly and find it a super helpful and intuitive tool for visualizing data. However, I’ve recently run into a seemingly trivial but tricky problem. I want to overlay the individual values on a bar chart, with an adjustable gap between the dots (a swarm plot?).

After some research, it seems that the plotly.express.strip() function could be a reasonable solution to my problem. Although combining plotly.express.strip() with plotly.graph_objects.Bar does produce a decent figure, the centerline of each strip plot isn’t aligned with the center of its bar. (Fig. 1)

Using plotly.graph_objects.Box to plot the scatter dots and manually adjusting the centerline with the pointpos property could be the “apparent solution” to my problem (centering the points by trial and error). But obviously, this isn’t a real solution.

Therefore, I was wondering whether there is a way to automatically place the swarm plots at the center of each bar. My code and sample CSV data are provided below for reference. I would appreciate it if anyone could offer a solution or some ideas.

import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
import numpy as np
from scipy import stats

## Export settings for the modebar "Download Plot" button
config = dict(
    toImageButtonOptions=dict(
        format="svg",  # one of png, svg, jpeg, webp
        filename="plot_result",
        scale=1,  # Multiply title/legend/axis/canvas sizes by this factor
    )
)

## Load the data and derive the lookups shared by the rest of the script
df = pd.read_csv("/path/to/csv")
# Sorted distinct cell_num values; their positions are used as x indices later.
cell_num_order = np.unique(df.loc[:, "cell_num"])
# Bar fill colour per sample type.
bar_colors = dict(A="#0966B2", B="#003365")

## Strip plot of the individual values, one group per sample type
fig = px.strip(
    data_frame=df,
    x="cell_num",
    y="value",
    color="sample_type",
    color_discrete_map={"A": "#D8D8CE", "B": "#B7B5AC"},
    stripmode="group",
)
# The points are hidden from the legend and spread out horizontally.
fig.update_traces(showlegend=False, jitter=0.5)

## Overlay one bar trace (with error bars) per sample type on the strip plot
for sample_type in df["sample_type"].dropna().unique():
    rows = df.loc[df["sample_type"] == sample_type]
    bar_trace = go.Bar(
        name=sample_type,
        x=rows["cell_num"],
        y=rows["group_avg."],
        error_y={"type": "data", "array": rows["std.d"], "visible": True},
        marker_color=bar_colors[sample_type],
    )
    fig.add_trace(bar_trace)

# Both axes share the same black 2px axis line and have no grid.
_axis_line_style = {
    "showline": True,
    "linecolor": "black",
    "linewidth": 2,
    "showgrid": False,
}
# y axis: two-decimal tick labels with ticks drawn inside the plot area.
fig.update_yaxes(tickformat=".2f", ticks="inside", **_axis_line_style)
# x axis: treat cell_num as categories rather than as numbers.
fig.update_xaxes(type="category", **_axis_line_style)
fig.update_layout(
    title={"text": "title", "y": 0.98, "x": 0.5, "xanchor": "center", "yanchor": "top"},
    barmode="group",
    # The strip points are drawn as grouped box traces, whose offsets within a
    # category are derived from `boxgap`, while the bars' offsets are derived
    # from `bargap`. Plotly's defaults differ (boxgap 0.3 vs bargap 0.2), which
    # shifts each strip away from the centre of its bar. Setting both to the
    # same value puts the strip centreline on the bar centre.
    bargap=0.2,
    boxgap=0.2,
    bargroupgap=0.1,
    xaxis_title="xtitleß",
    yaxis_title="ytitle",
    plot_bgcolor="rgba(0, 0, 0, 0)",
    paper_bgcolor="rgba(0, 0, 0, 0)",
)


## Define 'add_pvalue_annotation' function
def add_pvalue_annotation(
    fig,
    cell_num,
    y_range,
    ns="",
    pvalue_th0="",
    pvalue_th1="",
    _format=None,
):
    """
    Draw a significance bracket and a p-value symbol above one `cell_num` group.

    The p-value is the second element returned by `scipy.stats.ttest_ind` for
    the non-missing `value` entries of sample types "A" and "B" at `cell_num`.
    The data is read from the module-level `df`; the x position is the index
    of `cell_num` in the module-level `cell_num_order`.

    arguments:
    fig --- the figure to annotate; `add_shape` and `add_annotation` are called on it.
    cell_num --- the cell num to annotate. It must be present in `cell_num_order`.
    y_range --- [y_min, y_max] in paper units: the bottom and top of the bracket. (e.g. [1.01, 1.02])
    ns --- threshold as a number; a p-value >= ns is annotated "n.s.". (e.g. 0.05)
    pvalue_th0 --- threshold as a number; a p-value >= pvalue_th0 and < ns is annotated "*".
    pvalue_th1 --- threshold as a number; a p-value >= pvalue_th1 and < pvalue_th0 is annotated "**", anything lower is annotated "***".
    _format --- optional dict overriding any of the defaults `interline=0.07`, `text_height=1.05`, `color="black"`. `text_height` multiplies y_range[1] to place the symbol and `color` is the symbol's font colour; `interline` is not used by this function.

    A p-value that is NaN (the t-test could not be computed) is annotated
    "n/a" instead of being compared against the thresholds.

    raises:
    TypeError --- if `ns`, `pvalue_th0` or `pvalue_th1` is left at its "" default (or is None or another string).
    KeyError --- if `cell_num` is not in `cell_num_order`.
    """
    # Fail early with a readable message instead of a `float >= str` error
    # raised from the middle of the comparison chain below.
    thresholds = {"ns": ns, "pvalue_th0": pvalue_th0, "pvalue_th1": pvalue_th1}
    unset = [
        name
        for name, value in thresholds.items()
        if value is None or isinstance(value, str)
    ]
    if unset:
        raise TypeError(
            "add_pvalue_annotation requires numeric thresholds; got no number for: "
            + ", ".join(unset)
        )

    # Merge caller overrides over the defaults so a partial dict still works.
    fmt = {"interline": 0.07, "text_height": 1.05, "color": "black"}
    if _format:
        fmt.update(_format)

    in_cell = df["cell_num"] == cell_num
    values_a = df.loc[in_cell & (df["sample_type"] == "A"), "value"].dropna()
    values_b = df.loc[in_cell & (df["sample_type"] == "B"), "value"].dropna()
    pvalue = stats.ttest_ind(values_a, values_b)[1]

    if np.isnan(pvalue):
        # NaN compares False against every threshold, which would otherwise
        # fall through to the "***" branch.
        symbol = "n/a"
    elif pvalue >= ns:
        symbol = "n.s."
    elif pvalue >= pvalue_th0:
        symbol = "*"
    elif pvalue >= pvalue_th1:
        symbol = "**"
    else:
        symbol = "***"

    ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
    bar_xcoord_map = {x: idx for idx, x in enumerate(cell_num_order)}
    x_coordinate = bar_xcoord_map[cell_num]
    # NOTE(review): +/-0.2 presumably targets the centres of the two grouped
    # bars; confirm against the layout's bargap if that is changed.
    x_start, x_end = x_coordinate - 0.2, x_coordinate + 0.2
    y_bottom, y_top = y_range[0], y_range[1]

    # The bracket is three line segments: left leg, top bar, right leg.
    bracket_segments = (
        (x_start, y_bottom, x_start, y_top),
        (x_start, y_top, x_end, y_top),
        (x_end, y_top, x_end, y_bottom),
    )
    for x0, y0, x1, y1 in bracket_segments:
        fig.add_shape(
            type="line",
            xref="x",
            yref="paper",
            x0=x0,
            y0=y0,
            x1=x1,
            y1=y1,
            line=dict(
                color="black",
                width=2,
            ),
        )

    ## add text at the correct x, y coordinates
    fig.add_annotation(
        dict(
            font=dict(color=fmt["color"], size=14),
            x=(x_start + x_end) / 2,
            y=y_top * fmt["text_height"],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x",
            yref="y domain",
        )
    )


## Add a significance bracket above every cell_num group, then render
_pvalue_thresholds = {"ns": 0.05, "pvalue_th0": 0.01, "pvalue_th1": 0.005}
for cell_num in cell_num_order:
    add_pvalue_annotation(fig, cell_num, [1.01, 1.02], **_pvalue_thresholds)


fig.show(renderer="iframe", config=config)
sample,sample_type,cell_num,value,group_avg.,std.d,p-value
S1,A,0.15625,1.117938918,,,
S2,A,0.15625,1.468158665,,,
S3,A,0.15625,1.594299572,,,
S4,A,0.15625,1.1841963747,,,
S5,A,0.15625,1.371244508,,,
S6,A,0.15625,1.438600173,,,
,A,0.15625,,1.398048367,0.157641485,
S1,B,0.15625,1.4822850338,,,
S2,B,0.15625,1.473497457,,,
S3,B,0.15625,1.782631919,,,
S4,B,0.15625,1.915577222,,,
S5,B,0.15625,1.921744779,,,
S6,B,0.15625,1.718171305,,,
,B,0.15625,,1.762324536,0.164124452,
,,0.15625,,,,0.012584782

Related post