Hi all,

Im working on a dataset predicting NBA salaries, I thought I had everything but the predicted number was WAY off. I went back and realized some points on my graphs are inaccurate. For the first graph, I plotted salary on the X axis and points per game on the Y. However on what is SUPPOSED to be James Hardens plot, (upper right because lots of money and points) it displays Jalen Brunson who is not even close to there (except his name). This also happens to Chris Paul and Chris Boucher. Any tips on how to fix this? Thanks in advance! Here is my code: The quoted section is the code with the graph. However I included the full code for convenience.

salary.table <- read.csv(‘NBA1920dataPER.csv’, fileEncoding=“UTF-8-BOM”)

ss <- read.csv(‘NBA1920dataPER.csv’, fileEncoding=“UTF-8-BOM”)

#Checking structure of data

str(salary.table)

str(ss)

#selecting Points, Min, TO, Reb, PER, Stl, Aast

salary.table <- salary.table[, c(1,2,3,4,7,23,24,35,36,37,38,39,68,69,77)]

ss <- ss[, c(1,2,3,4,7,23,24,35,36,37,38,39,68,69,77)]

#installing packages

#install.packages(‘data.table’)

#install.packages(‘corrplot’)

#install.packages(‘GGally’)

#install.packages(‘tidyverse’)

#install.packages(‘PerformanceAnalytics’)

#install.packages(‘plotly’)

library (data.table)

library(corrplot)

library(GGally)

library(tidyverse)

library(PerformanceAnalytics)

library(plotly)

#computing per/game stats of main stats

stats20 <-

ss %>% filter(Season >= 2020) %>%

select(Season:GP, MIN, PER, FGM:PTS) %>%

distinct(ss$PLAYER, .keep_all = TRUE) %>%

mutate(MPG = MIN, PPG = PTS, APG = AST,

RPG = REB, TOPG = TOV, BPG = BLK,

SPG = STL)

#Merge datasets

stats_salary <- merge(stats20, salary.table, by.x = ‘ss$PLAYER’, by.y = ‘PLAYER’)

names(stats_salary)[34]<-‘X2019.20.Salary.x’

stats_salary <- stats_salary[-305]

#Check correlation between salary and player’s performance

corrplot(cor(stats_salary %>%

select(X2019.20.Salary.x, MPG:SPG,

AGE, PER.y, contains("%")),

use = “complete.obs”),

method = “circle”,type = “upper”)

str(stats_salary)

# As it can be viewed in the following chart that Salary 19_20

# shows strong correlation with PPG(Point Per Game) and MPG(Minute Per Game)

# Besides that, the correlation between Salary and TOPG and PER is also high

stats_salary_cor <-

stats_salary %>%

select(X2019.20.Salary.x, PPG, MPG, TOPG, RPG, PER.y, SPG, APG)

ggpairs(stats_salary_cor)

# Below in the chart also shows the higher correlation between Salary and PPG,MPG

cor(stats_salary_cor)[,“X2019.20.Salary.x”]

# Below shows the rank of the coefficient of each factors

#Salary x PPG

names(stats_salary)[5] <- “TEAM”

plot_ly(data = stats_salary, x = ~X2019.20.Salary.x, y = ~PPG, color = ~TEAM,

hoverinfo = “text”,

text = ~paste("Player: ", ss$PLAYER,

“

Salary: “, format(X2019.20.Salary.x, big.mark = “,”),”$”,

"

PPG: ", round(PPG, digits = 3),

"

Team: ", TEAM)) %>%

layout(

title = “Salary vs Point Per Game”,

xaxis = list(title = “Salary USD”),

yaxis = list(title = “Point per Game”)

)

#Salary X MPG

names(stats_salary)[5] <- “TEAM”

plot_ly(data = stats_salary, x = ~X2019.20.Salary.x, y = ~MPG, color = ~TEAM,

hoverinfo = “text”,

text = ~paste("PLAYER: ", ss$PLAYER,

“

Salary: “, format(X2019.20.Salary.x, big.mark = “,”),”$”,

"

MPG: ", round(MPG, digits = 3),

"

Team: ", TEAM)) %>%

layout(

title = “Salary vs Minute Per Game”,

xaxis = list(title = “Salary USD”),

yaxis = list(title = “Minute per Game”)

)

#Salary x TO

names(stats_salary)[5] <- “TEAM”

plot_ly(data = stats_salary, x = ~X2019.20.Salary.x, y = ~TOPG, color = ~TEAM,

hoverinfo = “text”,

text = ~paste("Player: ", ss$PLAYER,

“

Salary: “, format(X2019.20.Salary.x, big.mark = “,”),”$”,

"

TOPG: ", round(TOPG, digits = 3),

"

Team: ", TEAM)) %>%

layout(

title = “Salary vs Turnover Per Game”,

xaxis = list(title = “Salary USD”),

yaxis = list(title = “Turnover Per Game”)

)

#Linear Regression Model

#clean data

stats_salary <- stats_salary[, c(1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32)]

stats_salary %>%

ggplot(aes(x = X2019.20.Salary.x, y = PPG)) +

geom_point() +

geom_smooth(method = “lm”)

stats_salary_regression <-

stats_salary %>% select(X2019.20.Salary.x, MPG:SPG)

lm(X2019.20.Salary.x~., data=stats_salary_regression)

#Salary prediction

salary_prediction <- function(m, point, minutes, turn_over){

pre_new <- predict(m, data.frame(PPG = point, MPG = minutes, TOPG = turn_over))

msg <- paste(“PPG:”, point, “,MPG:”, minutes, “,TOPG:”, turn_over, " ==> Expected Salary: $", format(round(pre_new), big.mark = “,”), sep = “”)

print(msg)

}

#Testing prediction

model <- lm(formula = X2019.20.Salary.x ~ PPG + MPG + TOPG, data = stats_salary_regression)

# Prediction on Salary of Anthony Davis

#Davis’s Stats (PPG=26.1,MPG=34.4, TOPG=2.5)

salary_prediction(model,26.1,34.4,2.5)

# Prediction on Salary of Wesley Matthew

salary_prediction(model,3,12,5)

I think R reads the first couple letters of the name, and assumes that that