Updated Sept 5
- set color & size to the square of (1 + log10(views_sum) - the minimum log10(views_sum)), which normalizes the minimum value to 1. For example, Charles R. Gleason has a views_sum of 300 (the lowest value), so his color & size value is 1. By contrast, Barack Obama has a views_sum of 44,897,089, so his color & size value is 38.1.
- added color/size factor as a hover value, mostly for developer debug
- Added name count as a hover value as many names appear in the dataset more than once.
- changed the hover font to ‘courier’. Not my favorite font, but useful for right justification of numbers
Updated Sept 2
Inspired by always helpful suggestions from @li.nguyen and @adamschroeder that I am very grateful for, I made the following changes:
- map type is ‘streets’ with a light color background. Removed the iteration over all 15 map types.
- Removed the square root function that was put in to reduce the range between the smallest and largest view counts
- color and size are both referenced to views_sum (kind of redundant)
- used Magenta_r sequential color map. The _r reverses the scale, so that the darkest colors are used for the very hard to spot low view counts.
- Added average daily views (views_median??), and total views to the hover info.
Biggest challenge for me was arranging the colors so that the low view count markers are visible, without having the high view counts take up excessive area.
I really enjoy this data set, and by using it I solved an issue that has puzzled me for some time.
Columns with long text strings (see the extract column) are challenging to use as hover info because there is no capability I know of to automatically word wrap them. I solved this with a function that replaces a whitespace character with an HTML line break (<br>) once the desired maximum line length has been exceeded.
Here are a few screen shots:
Charles R. Gleason, with the lowest number for views_sum
Barack Obama, with very high value of views_sum
Here is the code
import polars as pl
import plotly.express as px
import plotly.io as pio
# Set plotly's default renderer to "notebook_connected"; fig.show() calls
# below that do not name a renderer will use it.
pio.renderers.default = "notebook_connected"
# us library is a tool for working with US State names and abbreviations
# make a list of valid US states, and filter data with it.
import us
# Names taken from the `us` package's abbreviation -> name mapping; used
# further down to drop rows whose 'state' value is not one of them.
state_list = list(us.states.mapping('abbr', 'name').values())
def wrap_hover(text, chars_per_line=28):
    '''
    Wrap long hover text by inserting html line breaks.

    Characters are counted from the start of the current line. The first
    whitespace character seen once the count has gone past chars_per_line
    is replaced with <br> and the count restarts; every other character
    (including earlier whitespace) is copied through unchanged.
    '''
    pieces = []
    # number of characters consumed since the last inserted <br>
    run = 0
    for ch in text:
        run += 1
        if ch.isspace() and run > chars_per_line:
            # line is long enough: break here instead of emitting the space
            pieces.append('<br>')
            run = 0
            continue
        pieces.append(ch)
    return ''.join(pieces)
def state_abbr(state):
    '''
    Look the state up with the `us` package and return its abbreviation,
    ie New York -> NY.

    NOTE(review): if the lookup finds nothing and yields None, the attribute
    access below raises AttributeError; the pipeline filters on state_list
    before calling this - confirm that is sufficient for the data.
    '''
    match = us.states.lookup(state)
    return match.abbr
#------------------------------------------------------------------------------#
#     Lazy polars query: read people-map.csv, clean it, add derived columns    #
#------------------------------------------------------------------------------#
data_set = (
    pl.scan_csv('people-map.csv')   # LazyFrame; nothing runs until .collect()
    .with_columns(
        pl.col('views_median').cast(pl.Int32),
        pl.col('views_sum').cast(pl.Int32),
    )
    .with_columns(
        # copy of 'extract' with <br> line breaks inserted by wrap_hover
        pl.col('extract')
        .map_elements(wrap_hover, return_dtype=pl.String)
        .alias('extract_wrap'),
        # number of rows sharing each name_clean, counted before any filtering
        pl.col('name_clean').count().over('name_clean').alias('NAME_COUNT'),
    )
    # keep only rows whose 'state' appears in state_list
    .filter(pl.col('state').is_in(state_list))
    # abbreviated form of each state's name, ie NY for New York
    .with_columns(
        pl.col('state')
        .map_elements(state_abbr, return_dtype=pl.String)
        .alias('STATE_ABBR')
    )
    # Washington DC tweak: both expressions test the incoming 'city' value;
    # the bare strings given to otherwise() are read by polars as column names
    .with_columns(
        pl.when(pl.col('city').str.ends_with('D.C.'))
        .then(pl.lit('D.C.'))
        .otherwise('state')
        .alias('state'),
        pl.when(pl.col('city').str.ends_with('D.C.'))
        .then(pl.lit('Washington'))
        .otherwise('city')
        .alias('city'),
    )
    # keep rows with views_median > 0 and a non-null views_sum
    .filter(
        (pl.col('views_median') > 0) & pl.col('views_sum').is_not_null()
    )
    # marker color/size: square of log10(views_sum) shifted so that the
    # smallest remaining views_sum maps to 1, rounded to one decimal
    .with_columns(
        (1 + pl.col('views_sum').log10() - pl.col('views_sum').log10().min())
        .pow(2)
        .round(1)
        .alias('color_size')
    )
    .collect()   # optimize and execute the query, returning a DataFrame
)
#------------------------------------------------------------------------------#
#  Bubble map on the 'streets' style; markers sized and colored by color_size  #
#------------------------------------------------------------------------------#
fig = px.scatter_map(
    data_set,
    # marker position
    lat='lat', lon='lng',
    # marker appearance: one derived column drives both size and color
    size='color_size', size_max=35,
    color='color_size',
    color_continuous_scale='Magenta_r',
    range_color=(0, 30),   # fixed limits for the color scale
    # map view
    map_style='streets',
    zoom=3,
    # columns exposed to the hovertemplate as customdata[i]
    custom_data=[
        'name_clean',       # customdata[0]
        'city',             # customdata[1]
        'STATE_ABBR',       # customdata[2]
        'views_median',     # customdata[3]
        'views_sum',        # customdata[4]
        'NAME_COUNT',       # customdata[5]
        'color_size',       # customdata[6]
        'extract_wrap',     # customdata[7]
    ],
)
#------------------------------------------------------------------------------#
#  Figure size, margins, hover label styling and hover text, then output       #
#------------------------------------------------------------------------------#
fig.update_layout(
    autosize=True,
    width=1300,
    height=600,
    # no outer margins around the map; padding of 4 is kept
    margin={'l': 0, 'r': 0, 't': 0, 'b': 0, 'pad': 4},
    hoverlabel=dict(
        bgcolor="white",
        font_size=16,
        font_family='courier',   # chosen so right-justified numbers line up
    ),
)
fig.update_traces(
    # customdata indices follow the custom_data list given to px.scatter_map
    hovertemplate=(
        '<b>%{customdata[0]}: %{customdata[1]}, %{customdata[2]}<br></b>'
        '<b>Daily Views:</b>%{customdata[3]:>20,}<br>'
        '<b>Total Views:</b>%{customdata[4]:>20,}<br>'
        '<b>Name Count: </b>%{customdata[5]:>20}<br>'
        '<b>Color/Size Factor: </b>%{customdata[6]:>13}<br><br>'
        '%{customdata[7]}<br>'
        '<extra></extra>'
    )
)
# save a standalone html copy, then display the figure
fig.write_html('Fig_Fri_Week_35_Map.html')
fig.show()
#------------------------------------------------------------------------------#
#  Data exploration: distribution of views_sum                                 #
#------------------------------------------------------------------------------#
# update_layout returns the figure, so the two steps can be chained
fig = px.histogram(data_set, x='views_sum', nbins=1000).update_layout(
    template='plotly_white',
    width=800,
    height=400,
)
fig.show()
#------------------------------------------------------------------------------#
#  Data exploration: distribution of NAME_COUNT                                #
#------------------------------------------------------------------------------#
# update_layout returns the figure, so the two steps can be chained
fig = px.histogram(data_set, x='NAME_COUNT').update_layout(
    template='plotly_white',
    width=800,
    height=400,
)
fig.show()
#------------------------------------------------------------------------------#
#  Spot check: views_sum and color_size for the two names quoted in the notes  #
#------------------------------------------------------------------------------#
print(
    data_set
    .select(pl.col('name_clean', 'views_sum', 'color_size'))
    .filter(pl.col('name_clean').is_in(['Barack Obama', 'Charles R. Gleason']))
    .sort('views_sum', descending=False)
    # unique() defaults to keep='any' with no order guarantee, which would
    # discard the sort above and pick an arbitrary row for names that occur
    # more than once; keep the first (lowest views_sum) row per name and
    # preserve the sorted order so the printout is reproducible.
    .unique('name_clean', keep='first', maintain_order=True)
)