"Hello everyone,
As always, great dashboards and ideas this week! This time, I focused on sentiment analysis and developed a Dash web app to explore this area. The app analyzes sentiment and trends in NY Times fiction bestsellers, providing insights into the emotional tone of bestselling fiction. Users can explore the overall sentiment distribution from book descriptions, compare author sentiment, and examine the top title bigrams and description trigrams, all filterable by year.
Here is the code:
import string
from collections import Counter

import dash
import dash_bootstrap_components as dbc
import nltk
import pandas as pd
import plotly.express as px
from dash import Input, Output, dcc, html
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
# Fetch the NLTK resources used below: tokenizer models, the English
# stop-word list, WordNet (for the lemmatizer) and the VADER lexicon.
# 'punkt_tab' is requested as well because recent NLTK releases load the
# tokenizer models from that package instead of 'punkt'.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(_resource)

# Prepare the data
# Read the sheet at position 1 of the workbook and reverse the row order.
df = pd.read_excel("NYT Fiction Bestsellers.xlsx", sheet_name=1)[::-1]

# Keep one row per title. The explicit copy makes df_subset an independent
# frame, so the column assignments below do not write into a view of `df`
# (which would raise pandas' chained-assignment warning).
df_subset = df.drop_duplicates(subset=['title']).copy()
df_subset['title'] = df_subset['title'].astype(str)
df_subset['desc'] = df_subset['desc'].astype(str)
# NOTE(review): relies on `bestsellers_date` being parsed as a datetime column.
df_subset['year'] = df_subset['bestsellers_date'].dt.year

# Shared text-processing helpers used by generate_ngrams().
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | set(string.punctuation)
def generate_ngrams(text, n):
    """Return the n-grams of *text* as space-joined strings.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that appear in ``stop_words`` are discarded, the
    remaining tokens are lemmatized, and consecutive runs of *n* lemmas
    are joined with a single space.
    """
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return [' '.join(parts) for parts in ngrams(lemmas, n)]
def plot_top_ngrams_plotly(freq_counter, top_k=10):
    """Build a bar chart of the most frequent n-grams.

    Args:
        freq_counter: ``collections.Counter`` mapping n-gram strings to
            their occurrence counts.
        top_k: Number of most common n-grams to show (defaults to 10).

    Returns:
        A Plotly figure with one bar per n-gram. When *freq_counter* is
        empty the figure simply has no bars.
    """
    top_items = freq_counter.most_common(top_k)
    if top_items:
        gram_labels, counts = zip(*top_items)
    else:
        # zip(*[]) yields nothing to unpack, so handle "no n-grams" explicitly
        # instead of raising ValueError inside the Dash callback.
        gram_labels, counts = (), ()

    fig = px.bar(
        x=list(gram_labels),
        y=list(counts),
        template="ggplot2",
        labels={'x': '', 'y': ''},
        text_auto=True,
    )
    fig.update_layout(
        paper_bgcolor='rgb(240, 240, 240)',
        plot_bgcolor='rgb(240, 240, 240)',
    )
    # Counts are printed on the bars (text_auto), so the y axis is hidden.
    fig.update_yaxes(visible=False)
    return fig
# Lazily-created analyzer shared by every get_sentiment() call.
_sentiment_analyzer = None


def get_sentiment(text):
    """Return the VADER compound sentiment score for *text*.

    The analyzer is created on first use and then reused: this function is
    applied once per row, and building a new SentimentIntensityAnalyzer on
    every call repeats its setup work each time.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)['compound']
# Inline CSS for the year selector. Dash forwards `style` dictionaries to
# React, whose documented key format is camelCase, so every key uses that
# form consistently (the original mixed kebab-case with `boxShadow`).
radio_style = {
    'display': 'flex',
    'flexDirection': 'row',
    'justifyContent': 'space-between',
    'padding': '5px',
    'border': '2px solid',
    'borderRadius': '5px',
    'boxShadow': '3px 3px 3px rgba(10, 10, 10, 0.3)',
    'fontFamily': 'Aharoni, sans-serif',
    'fontSize': '20px',
}

# Inline CSS shared by the section headings above each chart.
header_style = {'textAlign': 'center', 'margin': '10px', 'padding': '10px'}
# Create the Dash application with the JOURNAL Bootstrap theme; the title
# is the text shown in the browser tab.
app = dash.Dash(
    __name__,
    external_stylesheets=[dbc.themes.JOURNAL],
    title="NY Times Fiction Best-Sellers",
)
# Years offered by the selector: plain Python ints, ascending, with missing
# dates ignored so the option values are predictable and JSON-friendly.
_available_years = sorted(int(year) for year in df_subset['year'].dropna().unique())

# Keep 2018 as the initial selection when the data contains it; otherwise
# fall back to the most recent year so the first render never targets a
# year with no rows.
if 2018 in _available_years:
    _default_year = 2018
elif _available_years:
    _default_year = _available_years[-1]
else:
    _default_year = None

# Page structure: title, subtitle, year selector, two n-gram charts, then
# the sentiment histogram next to the per-author box plot and a legend note.
app.layout = dbc.Container([
    dbc.Row([
        # "title" is a CSS class for the page heading.
        dbc.Col(html.H1("NYT Bestseller Sentiment & Trends", className="title"), width=12)
    ]),
    dbc.Row([
        dbc.Col(html.H5("Sentiment analysis reveals trends in NYT fiction bestsellers: titles, descriptions, sentiment, bigrams/trigrams, and author sentiment."), width=12)
    ]),
    dbc.Row([
        dbc.Col(dbc.RadioItems(
            id='radio-buttons',
            options=[{'label': str(year), 'value': year} for year in _available_years],
            value=_default_year,
            inline=True,
            style=radio_style
        ), width=12)
    ]),
    dbc.Row([
        dbc.Col([
            html.H5("Top 10 Title Bigrams", style=header_style),
            html.Hr(),
            dcc.Graph(id='bigrams-chart', className="dash-graph"),
        ], width=6),
        dbc.Col([
            html.H5("Top 10 Description Trigrams", style=header_style),
            html.Hr(),
            dcc.Graph(id='trigrams-chart', className="dash-graph"),
        ], width=6)
    ]),
    html.Hr(),
    dbc.Row([
        dbc.Col([
            html.H5("Book Description Sentiment by Year", style=header_style),
            html.Hr(),
            dcc.Graph(id='sentiment-chart'),
        ], width=5),
        dbc.Col(html.Div([
            html.H5("Comparative Sentiment Distribution of Bestselling Authors", style=header_style),
            html.Hr(),
            # Clicking this button re-runs the chart callback, which draws a
            # fresh random sample of authors.
            html.Button('Update Authors', id='update-authors-button', className="update-button"),
            dcc.Graph(id='sentiment_author-boxchart')
        ]), width=7),
        dbc.Col(
            html.H5("Sentiment scores range from -1 (VERY NEGATIVE) to +1 (VERY POSITIVE). Scores close to 0 indicate neutral sentiment.", style=header_style), width=12)
    ]),
], fluid=True, style={'backgroundColor': '#f0f0f0'})
def _count_ngrams(texts, n):
    """Count the n-grams produced by generate_ngrams() over all *texts*."""
    counts = Counter()
    for text in texts:
        counts.update(generate_ngrams(text, n))
    return counts


@app.callback(
    Output('bigrams-chart', 'figure'),
    Output('trigrams-chart', 'figure'),
    Output('sentiment-chart', 'figure'),
    Output('sentiment_author-boxchart', 'figure'),
    Input('radio-buttons', 'value'),
    Input('update-authors-button', 'n_clicks')  # button input: only re-triggers the callback
)
def update_charts(year, n_clicks):
    """Rebuild all four charts for the selected year.

    Args:
        year: Year chosen in the radio selector; rows of ``df_subset`` are
            filtered to this year.
        n_clicks: Click count of the "Update Authors" button. Its value is
            not used; a click merely re-runs the callback so a new random
            set of authors is drawn.

    Returns:
        Tuple of figures in output order: title bigrams, description
        trigrams, description-sentiment histogram, per-author sentiment
        box plot.
    """
    # Copy the slice so adding the 'sentiment' column below does not write
    # into a view of df_subset (avoids pandas' chained-assignment warning).
    filtered_df = df_subset[df_subset['year'] == year].copy()

    bigrams_fig = plot_top_ngrams_plotly(_count_ngrams(filtered_df['title'], 2))
    trigrams_fig = plot_top_ngrams_plotly(_count_ngrams(filtered_df['desc'], 3))

    # Score every description once; the author subset below reuses the column.
    filtered_df['sentiment'] = filtered_df['desc'].apply(get_sentiment)
    sentiment_fig = px.histogram(
        filtered_df, x="sentiment", nbins=10, histnorm='percent', range_x=[-1, 1],
        template='ggplot2', text_auto='.2f', labels={'sentiment': ''}
    )
    sentiment_fig.update_layout(paper_bgcolor='rgb(240, 240, 240)', plot_bgcolor='rgb(240, 240, 240)')
    sentiment_fig.update_yaxes(visible=False)

    # Pick up to five distinct authors at random. Sampling from the unique
    # names (and capping at the number available) avoids both duplicate
    # picks and the ValueError that sample(5) raises on fewer than five rows.
    distinct_authors = filtered_df['author'].drop_duplicates()
    author_to_watch = distinct_authors.sample(min(5, len(distinct_authors))).tolist()
    author_df = filtered_df[filtered_df['author'].isin(author_to_watch)]

    sentiment_author_fig = px.box(
        author_df, x="sentiment", y='author',
        range_x=[-1, 1],
        color_discrete_sequence=px.colors.sequential.Bluered_r,
        template='ggplot2', labels={'sentiment': '', 'author': ''}
    )
    sentiment_author_fig.update_layout(paper_bgcolor='rgb(240, 240, 240)', plot_bgcolor='rgb(240, 240, 240)')
    return bigrams_fig, trigrams_fig, sentiment_fig, sentiment_author_fig
# Start the development server only when this file is executed directly.
if __name__ == '__main__':
    # `app.run` is the supported entry point; the older `run_server` alias
    # is deprecated and no longer available in Dash 3.
    app.run(debug=True)