Butterfly/Tornado Chart

Gen · November 3, 2023, 11:20am

I was trying to find the best solution for making butterfly charts (find an example here: Butterfly Chart | Data Viz Project), and given there was no good built-in support amongst the usual graph libraries, I decided to give it a whirl with bokeh.

I just wanted to share my solution so if anyone decides to make something similar in the future, there is at least an example in bokeh (I couldn’t find examples when I was searching).

Though the below code is a bit long, the key points:

You need to make the data going to the left negative so it appears to head to the left.
Likewise, for the x axis ticks you need to make them abs values so they are correct
From what I’ve gathered, bokeh doesn’t allow “moving” the y-axis, so hide it and put custom labels in the “middle” of the graph

For the below example, I used a DataFrame that looks like this:

Gender Age_Group Count
M “10-14” 40
F “10-14” 20
M “15-19” 13
F “15-19” 25

import pandas as pd

from bokeh.plotting import figure
from bokeh.models import LabelSet
from bokeh.models.formatters import CustomJSTickFormatter
from bokeh.models.sources import ColumnDataSource

ABS_COL_POSTFIX='_Abs'

def create_butterfly_graph(
        df:pd.DataFrame,
        l_col_val='男', r_col_val='女',
        y_col_name='Age_Group', x_col_name='Count', group_col_name='Gender',
        width=250, height=250,
        l_color='lightblue', r_color='pink',
        ):
    '''
    Creates a butterfly graph
    DF is structured as so:
    x_col_name | y_col_name | group_col_name | ...and other columns
    group_col_name is either l_col_val or r_col_val
    '''
    pivot_df = _helper_create_pivot_df(df)
    source = ColumnDataSource(pivot_df)
    max_x = pivot_df[[r_col_val, l_col_val+ABS_COL_POSTFIX]].max().max() + 10 # Max x/y to fit labels
    
    p = figure(
        y_range=list(reversed(pivot_df[y_col_name])), # We do this to get descending (smallest up top)
        x_axis_location="below", 
        x_range = (-max_x, max_x),
        width=width, height=height,
        toolbar_location=None
    )
    
    _helper_create_hbars(p, source,l_col_val, r_col_val, y_col_name, l_color, r_color)
    _helper_create_labels(p, source)
    _helper_set_axes(p)
    
    return p
    
    
def _helper_create_hbars(
        p, source, 
        l_col_val, r_col_val, 
        y_col_name,
        l_color, r_color):
    p.hbar_stack([l_col_val], y=y_col_name, source=source, height=0.8, color=[l_color])
    p.hbar_stack([r_col_val], y=y_col_name, source=source, height=0.8, color=[r_color])

def _helper_create_labels(p, source, l_col_val, r_col_val, y_col_name):
    # Labels at the origin (y-axis replacement)
    labels_group = LabelSet(x=0, y = y_col_name, text = y_col_name, level='glyph',
                            x_offset=0, y_offset=-2,
                            source=source,
                            text_align='center', text_font_size='7pt', text_color='gray', text_alpha=0.9, 
                            background_fill_color='white', background_fill_alpha=0.6
                    )

    # Labels for the left side of the butterfly
    labels_m = LabelSet(x = l_col_val, y = y_col_name, 
                        text=l_col_val+ABS_COL_POSTFIX, level='glyph', text_align='center', text_color='gray', text_font_size='8pt',
                        x_offset=-10, y_offset=-2, source=source
                        )
    # Labels for the right side of the butterfly
    labels_f = LabelSet(x = r_col_val, y = y_col_name, 
                        text=r_col_val, level='glyph', text_align='center', text_color='gray', text_font_size='8pt',
                        x_offset=10, y_offset=-2, source=source
                        )
    
    p.add_layout(labels_group)
    p.add_layout(labels_m)
    p.add_layout(labels_f)

def _helper_create_pivot_df(
        df:pd.DataFrame,
        l_col_val, r_col_val,
        x_col_name, y_col_name,
        group_col_name
        )->pd.DataFrame:
    pivot_df = df.pivot(index=y_col_name, columns=group_col_name, values=x_col_name).reset_index()
    if l_col_val not in pivot_df:
        pivot_df[l_col_val]=0
    if r_col_val not in pivot_df:
        pivot_df[r_col_val]=0
    
    # This is used to label the bars
    pivot_df[l_col_val + ABS_COL_POSTFIX] = pivot_df[l_col_val]
    
    # We do this so the left bars go to the left
    pivot_df[l_col_val] = pivot_df[l_col_val]*-1
    return pivot_df

def _helper_set_axes(p):
    p.xaxis.formatter = CustomJSTickFormatter(code="""return Math.abs(tick);""")
    
    p.yaxis.visible=False
    
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None