How do I color code k clusters for plotting on a scatter plot with the Spectral10 palette?

Nut · April 27, 2022, 2:46pm

I’m working on a cluster scatter plot where each cluster has its own color. How do I color code each cluster on a scatter plot? I believe there is a trick with ColumnDataSource and/or factor_cmap, but I’m having trouble following along with the docs. Here’s the full code:

import numpy as np
import pandas as pd
import random

from typing import List, Tuple
from bokeh.models import ColumnDataSource, Slider, Div, Select
from bokeh.sampledata.iris import flowers
from bokeh.plotting import figure, curdoc
from bokeh.layouts import column, row
from bokeh.palettes import Spectral10
from bokeh.transform import factor_cmap

# Use these centroids in the first iteration of your algorithm if "Random Centroids" is set to False in the Dashboard.
# The array has ten rows of four values each (one row per candidate centroid).
DEFAULT_CENTROIDS = np.array([[5.664705882352942, 3.0352941176470587, 3.3352941176470585, 1.0176470588235293],
                              [5.446153846153847, 3.2538461538461543, 2.9538461538461536, 0.8846153846153846],
                              [5.906666666666667, 2.933333333333333, 4.1000000000000005, 1.3866666666666667],
                              [5.992307692307692, 3.0230769230769234, 4.076923076923077, 1.3461538461538463],
                              [5.747619047619048, 3.0714285714285716, 3.6238095238095243, 1.1380952380952383],
                              [6.161538461538462, 3.030769230769231, 4.484615384615385, 1.5307692307692309],
                              [6.294117647058823, 2.9764705882352938, 4.494117647058823, 1.4],
                              [5.853846153846154, 3.215384615384615, 3.730769230769231, 1.2076923076923078],
                              [5.52857142857143, 3.142857142857143, 3.107142857142857, 1.007142857142857],
                              [5.828571428571429, 2.9357142857142855, 3.664285714285714, 1.1]])

def get_closest(data_point: np.ndarray, centroids: np.ndarray):
    """
    Return the index of the centroid that lies nearest to data_point.

    The euclidean distance between data_point and every row of centroids is computed; the position
    of the smallest distance is returned (the first such position when several distances tie).
    """
    distances = [np.linalg.norm(centroid - data_point) for centroid in centroids]
    return np.argmin(distances)

def to_classes(clustering):
    """
    Turn a list of per-cluster index arrays into one label per sample.

    :param clustering: a sequence of integer index arrays; the i-th array lists the sample
                       indices that belong to cluster i
    :return: an integer numpy array whose length is the total number of listed indices, holding
             the cluster number of every sample. A position that no cluster refers to is set
             to -1 instead of being left uninitialized.
    """
    # Total number of samples (the sum of all cluster sizes)
    num_samples = sum(len(cluster) for cluster in clustering)
    # Integer array from the start; -1 marks "not assigned to any cluster"
    labels = np.full(num_samples, -1, dtype=int)
    for ith, cluster in enumerate(clustering):
        # the cluster's index array selects the positions that receive this cluster number
        labels[cluster] = ith
    return labels

def k_means(data_np: np.ndarray, k:int=3, n_iter:int=500, random_initialization=False) -> Tuple[np.ndarray, int]:
    """
    Cluster the rows of data_np into k groups by repeatedly assigning every row to its closest
    centroid and moving each centroid to the mean of the rows assigned to it.

    :param data_np: your data, a numpy array with shape (n_entries, n_features)
    :param k: The number of clusters to compute
    :param n_iter: The maximal number of iterations
    :param random_initialization: If falsy, the first k rows of DEFAULT_CENTROIDS are used as the
                                  centroids of the first iteration; otherwise k distinct rows of
                                  data_np are drawn at random.

    :return: A tuple (cluster_indices: A numpy array with one integer cluster index per entry,
                      n_iterations: the number of iterations it took until the algorithm terminated)
    :raises ValueError: if k is smaller than 1 or larger than the number of available start centroids
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Work on floats so that centroid means are never truncated for integer input
    data_np = np.asarray(data_np, dtype=float)

    # Pick the centroids of the first iteration
    if random_initialization:
        if k > len(data_np):
            raise ValueError("k must not exceed the number of data points")
        centroids = data_np[random.sample(range(len(data_np)), k)]
    else:
        if k > len(DEFAULT_CENTROIDS):
            raise ValueError("k must not exceed the number of DEFAULT_CENTROIDS")
        # only the k first entries are used; copy so the module-level array is never shared
        centroids = DEFAULT_CENTROIDS[:k].copy()

    # Loop until the centroids don't move anymore or the number of iterations reaches n_iter
    labels = np.zeros(len(data_np), dtype=int)
    counter = 0
    while counter < n_iter:
        # Update the cluster labels using get_closest
        labels = np.array([get_closest(el, centroids) for el in data_np], dtype=int)
        counter += 1

        # Move every centroid to the mean of its members; a centroid without members stays put
        new_centroids = centroids.copy()
        for i in range(k):
            members = data_np[labels == i]
            if len(members) > 0:
                new_centroids[i] = members.mean(axis=0)

        # if the centroids didn't move, exit the while loop
        if np.array_equal(new_centroids, centroids):
            break
        centroids = new_centroids

    # return the final cluster labels and the number of iterations it took
    return labels, counter

Note: The callback is still a work in progress and it will be the last thing I work on. Just ignore it for now.

def callback(attr, old, new):
    """
    Bokeh on_change handler: recompute the clustering for the current widget settings.

    :param attr: name of the property that changed (not used)
    :param old: previous value of that property (not used)
    :param new: new value of that property (not used, the widgets are read directly)
    """
    # recompute the clustering and update the colors of the data points based on the result
    k = slider_k.value_throttled
    # the Select widget holds the strings 'True'/'False', k_means needs a real boolean
    init = select_init.value == 'True'
    clustering_new, counter_new = k_means(data_np, k, 500, init)
    # TODO: still work in progress - clustering_new and counter_new are not shown anywhere yet

# read and store the dataset (a deep copy of the sample data, without its 'species' column)
data: pd.DataFrame = flowers.copy(deep=True).drop(columns=['species'])

This is where I create my ColumnDataSource

# The data as numpy array, which you can use for computing the clustering
data_np = np.asarray(data)

# Create the dashboard
# 1. A Select widget to choose between random initialization or using the DEFAULT_CENTROIDS on top
#    (its value is one of the strings 'True' / 'False', not a boolean)
select_init = Select(title='Random Centroids',value='False',options=['True','False'])

# 2. A Slider to choose a k between 2 and 10 (k being the number of clusters)
slider_k = Slider(start=2,end=10,value=3,step=1,title='k')

# 3. Connect both widgets to the callback
#    (the slider reports through 'value_throttled', the select through 'value')
select_init.on_change('value',callback)
slider_k.on_change('value_throttled',callback)

# 4. A ColumnDataSource to hold the data and the cluster of each point
# NOTE(review): the initial clustering is computed with k=4 although the slider starts at 3.
# NOTE(review): `clustering` goes into the source exactly as k_means returns it (integer labels);
# a categorical color mapper on this column needs string labels - see the replies in this thread.
clustering, counter = k_means(data_np,4,500,False)
source = ColumnDataSource(dict(petal_length=data['petal_length'],sepal_length=data['sepal_length'],petal_width=data['petal_width'],clustering=clustering))

This is where I create the plots and where I’m also stuck with factor_cmap()

# 4. Two plots displaying the dataset based on the following table, have a look at the images
# in the handout if this confuses you.
#
#       Axis/Plot	Plot1 	Plot2
#       X	Petal length 	Petal width
#       Y	Sepal length	Petal length
#
# Use a categorical color mapping, such as Spectral10, have a look at this section of the bokeh docs:
# https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#filling
#
# NOTE(review): both factor_cmap calls below receive the raw `clustering` array as `factors`.
# The ValueError reported for this code states that `factors` has to be a sequence of strings;
# per the replies it should list every cluster label once, while the 'clustering' column of
# `source` carries one string label per point.
plot1 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal length and sepal length')
plot1.yaxis.axis_label = 'Sepal length'
plot1.xaxis.axis_label = 'Petal length'
scatter1 = plot1.scatter(x='petal_length',y='sepal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))

plot2 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal width and petal length')
plot2.yaxis.axis_label = 'Petal length'
plot2.xaxis.axis_label = 'Petal width'
scatter2 = plot2.scatter(x='petal_width',y='petal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))

# 5. A Div displaying the number of iterations it took the algorithm to update the plot.
# The count is formatted into the text itself: a positional argument after text=... is a syntax error.
div = Div(text=f'Number of iterations: {counter}')

# column() is a function and has to be called with the widgets, it cannot be subscripted
lt = row(column(select_init, slider_k, div), plot1, plot2)

curdoc().add_root(lt)

And so the final plot should look like so

>cd file_path
>bokeh serve --show file_name.py

This is my current error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-4f7e5f657af0> in <module>
     24 plot1.yaxis.axis_label = 'Sepal length'
     25 plot1.xaxis.axis_label = 'Petal length'
---> 26 scatter1 = plot1.scatter(x='petal_length',y='sepal_length',source=source,fill_color=factor_cmap('clustering', palette=Spectral10, factors=clustering))
     27 
     28 plot2 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal width and petal length')

~\Anaconda3\lib\site-packages\bokeh\transform.py in factor_cmap(field_name, palette, factors, start, end, nan_color)
    119 
    120     '''
--> 121     return field(field_name, CategoricalColorMapper(palette=palette,
    122                                                     factors=factors,
    123                                                     start=start,

~\Anaconda3\lib\site-packages\bokeh\models\mappers.py in __init__(self, palette, **kwargs)
     93         if palette is not None:
     94             kwargs['palette'] = palette
---> 95         super().__init__(**kwargs)
     96 
     97 @abstract

~\Anaconda3\lib\site-packages\bokeh\model.py in __init__(self, **kwargs)
    234         kwargs.pop("id", None)
    235 
--> 236         super().__init__(**kwargs)
    237         default_theme.apply_to_model(self)
    238 

~\Anaconda3\lib\site-packages\bokeh\core\has_props.py in __init__(self, **properties)
    267 
    268         for name, value in properties.items():
--> 269             setattr(self, name, value)
    270 
    271         self._initialized = True

~\Anaconda3\lib\site-packages\bokeh\core\has_props.py in __setattr__(self, name, value)
    296 
    297         if name in props or (descriptor is not None and descriptor.fset is not None):
--> 298             super().__setattr__(name, value)
    299         else:
    300             matches, text = difflib.get_close_matches(name.lower(), props), "similar"

~\Anaconda3\lib\site-packages\bokeh\core\property\descriptors.py in __set__(self, obj, value, setter)
    550             raise RuntimeError(f"{class_name}.{self.name} is a readonly property")
    551 
--> 552         self._internal_set(obj, value, setter=setter)
    553 
    554     def __delete__(self, obj):

~\Anaconda3\lib\site-packages\bokeh\core\property\descriptors.py in _internal_set(self, obj, value, hint, setter)
    782 
    783         """
--> 784         value = self.property.prepare_value(obj, self.name, value)
    785         old = self._get(obj)
    786         self._real_set(obj, old, value, hint=hint, setter=setter)

~\Anaconda3\lib\site-packages\bokeh\core\property\bases.py in prepare_value(self, owner, name, value)
    348         else:
    349             obj_repr = owner if isinstance(owner, HasProps) else owner.__name__
--> 350             raise ValueError(f"failed to validate {obj_repr}.{name}: {error}")
    351 
    352         if isinstance(owner, HasProps):

ValueError: failed to validate CategoricalColorMapper(id='1038', ...).factors: expected an element of either Seq(String), Seq(Tuple(String, String)) or Seq(Tuple(String, String, String)), got array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0,
       0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 0, 0,
       2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0])

Bryan · April 27, 2022, 5:45pm

@Nut the immediate error is that you are attempting to use a categorical color mapper on integers. In Bokeh, categorical factors are always strings. If you only need these values for the colormapping, then the quickest solution is probably to convert all the integers to strings up front.

Alternatively, a LinearColorMapper can operate on numerical data.

Nut · April 27, 2022, 6:30pm

@Bryan
Ok, if you’re suggesting changing to strings, do you mean the following

factors=clustering

to

factors=np.unique(clustering).astype(str)

This produced no errors, but I get a blank page when I execute the script.

Bryan · April 27, 2022, 6:33pm

It’s not clear what exact change you made. I can say:

The list of factors in the CDS should not be uniqued (every point needs to have some factor associated, and many points can correspond to the same factor).
The list of factors passed to the colormapper should be uniqued (each color needs to have exactly one factor associated).

Nut · April 27, 2022, 6:59pm

@Bryan

clustering

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2])

# Load the sample data and drop the 'species' column, as in the full script.
data: pd.DataFrame = flowers.copy(deep=True)
data = data.drop(['species'], axis=1)
# NOTE(review): `clustering` is stored as printed above, i.e. as integers; the reply that
# follows points out that the values in the CDS have to be strings as well.
source = ColumnDataSource(dict(petal_length=data['petal_length'],sepal_length=data['sepal_length'],petal_width=data['petal_width'],clustering=clustering))

plot1 = figure(plot_width=100,plot_height=100,title='Scatterplot of flowers distribution by petal length and sepal length')
plot1.yaxis.axis_label = 'Sepal length'
plot1.xaxis.axis_label = 'Petal length'
# NOTE(review): `unique` is written without the `np.` prefix used in the earlier message
# (np.unique(clustering).astype(str)) - presumably a typo; confirm `unique` is defined.
scatter1 = plot1.scatter(x='petal_length',y='sepal_length',fill_alpha=0.4,source=source,color=factor_cmap('clustering', palette=Spectral10, factors=unique(clustering).astype(str)))

In the CDS, they are not uniqued, in the factor, they are uniqued

Bryan · April 27, 2022, 7:11pm

In the CDS, they still appear to be numbers. Categorical factors are always strings.

Edit: Here is a tiny but complete example for comparison (it maps both marker type and color):

Nut · April 28, 2022, 8:13am

@Bryan
I have a problem with my callback function and updating the CDS.

def callback(attr, old, new):
    # Bokeh on_change handler; attr/old/new are not used, the widgets are read directly.
    # recompute the clustering and update the colors of the data points based on the result
    k = slider_k.value_throttled
    # NOTE(review): the Select value is the string 'True'/'False', not a bool - confirm that
    # k_means treats it as intended.
    init = select_init.value
    clustering, counter = k_means(data_np,k,500,init)
    
    # Overwrite the columns of the shared source; the cluster labels are stored as strings.
    source.data.update(ColumnDataSource(dict(petal_length=data['petal_length'],
                                             sepal_length=data['sepal_length'],
                                             petal_width=data['petal_width'],
                                             clustering=clustering.astype(str))).data)
    
    # NOTE(review): the two assignments below write the factor_cmap result into the data dict of
    # the source under the key 'color'. A CDS .data dict may only map names to columns, and this
    # 'color' entry is what the ValueError quoted after this snippet rejects.
    scatter1.data_source.data['color'] = factor_cmap('clustering',
                                                     palette=Spectral10,
                                                     factors=np.unique(clustering).astype(str))
    scatter2.data_source.data['color'] = factor_cmap('clustering',
                                                     palette=Spectral10,
                                                     factors=np.unique(clustering).astype(str))
    pass

Throws the error after I select a new k value with the slider widget.

ValueError: failed to validate ColumnDataSource(id='1004', ...).data: expected an element of ColumnData(String, Seq(Any)), got {'petal_length': 0      1.4
1      1.4
2      1.3
3      1.5
4      1.4
      ...
145    5.2
146    5.0
147    5.2
148    5.4
149    5.1
Name: petal_length, Length: 150, dtype: float64, 'sepal_length': 0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ...
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64, 'petal_width': 0      0.2
1      0.2
2      0.2
3      0.2
4      0.2
      ...
145    2.3
146    1.9
147    2.0
148    2.3
149    1.8
Name: petal_width, Length: 150, dtype: float64, 'clustering': array(['3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3',
       '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3',
       '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3',
       '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '3', '1', '1',
       '1', '0', '1', '0', '1', '2', '1', '2', '2', '0', '0', '1', '2',
       '1', '0', '0', '1', '0', '1', '0', '1', '1', '1', '1', '1', '1',
       '1', '2', '2', '2', '0', '1', '0', '1', '1', '0', '0', '0', '0',
       '1', '0', '2', '0', '0', '0', '0', '2', '0', '4', '1', '4', '4',
       '4', '4', '0', '4', '4', '4', '1', '1', '4', '1', '1', '4', '4',
       '4', '4', '1', '4', '1', '4', '1', '4', '4', '1', '1', '4', '4',
       '4', '4', '4', '1', '1', '4', '4', '4', '1', '4', '4', '4', '1',
       '4', '4', '4', '1', '1', '4', '1'], dtype='<U11'), 'color': {'field': 'clustering', 'transform': CategoricalColorMapper(id='1158', ...)}}

I don’t understand why it doesn’t accept the new inputs for the CDS when it accepted the original inputs before the callback.

Bryan · April 28, 2022, 11:15am

This is not a valid value for a CDS:

'color': {'field': 'clustering', 'transform': CategoricalColorMapper(id='1158', ...)}}

Presumably you intend to put a column of untransformed data to be input for the color mapper on a glyph.

Nut · April 28, 2022, 12:26pm

@Bryan
I’m not sure why color is trying to be added to the updated CDS. All I want to do is update the color mapping on scatter1 and scatter2. Is this way incorrect?

scatter1.data_source.data['color'] = factor_cmap('clustering',palette=Spectral10,factors=np.unique(clustering).astype(str))

If I leave out those lines in the callback function, then the clusters get updated, but they all turn grey.

Bryan · April 28, 2022, 4:24pm

I’m not sure why color is trying to be added to the updated CDS.

Because scatter1.data_source.data['color'] = ... explicitly adds it to the CDS. It’s not clear what you are trying to accomplish:

If you want the existing colormapper to use new data (with the same set of cluster ids and palette), then you need to update the relevant CDS column with new untransformed data to map.
If you want to update the actual colormapper (e.g. with a new palette and new set of factors, etc) then you should pass the original colormapper object in the args dict of the CustomJS callback so that the callback can configure those properties to new values.

Regardless, a .data dict on a CDS maps (string) names to columns i.e. arrays, lists, or series, and nothing else.

Nut · April 28, 2022, 4:59pm

Because scatter1.data_source.data['color'] = ... explicitly adds it to the CDS.

Ok, makes sense.

It’s not clear what you are trying to accomplish:

I want to use the existing colormapper with new data calculated with the k_means() function. So if I select say k=5 with the slider widget, k_means() will return an array like [0,1,3,...,4,2,1] of length 150 which then color codes 5 clusters on the plot. Here is a stackoverflow link to the full copy and pastable code with images to display the current results

Bryan · April 28, 2022, 7:35pm

If the number of categories changes, then you would need to update both the data and the mapper. I don’t have the bandwidth to dig into your actual code. Here is a minimal example that demonstrates how to update both of those things:

from bokeh.layouts import column
from bokeh.models import Button, CategoricalColorMapper, ColumnDataSource
from bokeh.palettes import Spectral3, Spectral4
from bokeh.plotting import curdoc, figure
from bokeh.transform import transform

# Category labels before and after the update (the second list has the extra label "D")
CATS1 = ["A", "B", "C"]
CATS2 = ["A", "B", "C", "D"]

# Twelve points; their "cat" column cycles through CATS1
point_data = {"x": list(range(12)), "cat": CATS1 * 4}
source = ColumnDataSource(data=point_data)

# The mapper is kept in a variable so that it can be reconfigured later on
cmap = CategoricalColorMapper(palette=Spectral3, factors=CATS1)

p = figure()
p.circle(x='x', y=0, size=20, source=source, color=transform('cat', cmap))

def update():
    """Switch the plot from the CATS1 labels to the CATS2 labels (data column and mapper)."""
    # the categories for each point changed, need to update the data
    source.data["cat"] = CATS2 * 3

    # the number of categories changed, need to update the mapper
    cmap.palette = Spectral4
    cmap.factors = CATS2

# A button whose click runs update(); it is placed underneath the plot
b = Button()
b.on_click(update)

layout = column(p, b)
curdoc().add_root(layout)

Nut · April 29, 2022, 7:04am

@Bryan
This seemed to work in my callback function

def callback(attr, old, new):
    """
    Bokeh on_change handler: recompute the clustering and recolor both scatter plots.

    :param attr: name of the property that changed (not used)
    :param old: previous value of that property (not used)
    :param new: new value of that property (not used, the widgets are read directly)
    """
    # recompute the clustering and update the colors of the data points based on the result
    k = slider_k.value_throttled
    # the Select widget holds the strings 'True'/'False'; k_means needs a real boolean,
    # otherwise choosing 'False' never selects the DEFAULT_CENTROIDS
    init = select_init.value == 'True'
    clustering_new, counter_new = k_means(data_np, k, 500, init)

    # categorical factors are strings: one label per point goes into the source ...
    source.data['clustering'] = clustering_new.astype(str)
    # ... and every distinct label once (as a plain list of str) into the color mapping
    factors = np.unique(clustering_new).astype(str).tolist()
    mapper = factor_cmap('clustering', palette=Spectral10, factors=factors)
    for renderer in (scatter1, scatter2):
        renderer.glyph.fill_color = mapper
        renderer.glyph.line_color = mapper
    div.text = f'Number of iterations: {counter_new}'

And changed the arguments in my plots

# One color spec shared by both plots: every distinct cluster label (as a string) is a factor
mapper = factor_cmap('clustering',palette=Spectral10,factors=np.unique(clustering).astype(str))

# Glyph arguments that are the same for both scatter plots
glyph_style = dict(fill_alpha=0.4, source=source, fill_color=mapper, line_color=mapper)

# Plot 1: petal length (x) against sepal length (y)
plot1 = figure(title='Scatterplot of flowers distribution by petal length and sepal length')
plot1.xaxis.axis_label = 'Petal length'
plot1.yaxis.axis_label = 'Sepal length'
scatter1 = plot1.scatter(x='petal_length', y='sepal_length', **glyph_style)

# Plot 2: petal width (x) against petal length (y)
plot2 = figure(title='Scatterplot of flowers distribution by petal width and petal length')
plot2.xaxis.axis_label = 'Petal width'
plot2.yaxis.axis_label = 'Petal length'
scatter2 = plot2.scatter(x='petal_width', y='petal_length', **glyph_style)

system · July 28, 2022, 7:04am

This topic was automatically closed 90 days after the last reply. New replies are no longer allowed.