Predicting the outcome of cage fights using R

Warning: violence

Background

Rules of MMA

Data

library(tidyverse) library(rvest)

# Function to scrape fights info for each fighter
#
# x: address of the fighter's wiki page, handed to read_html().
# y: value written into the `fighter` column of the result.
# Returns the row-bound fight table(s) found on the page, with `fighter`
# added and `Round` coerced to character.
scrape_fighters <- function(x, y) {

  fight_record <- x %>%
    read_html() %>%
    html_nodes("table.wikitable") %>%       # Extract each table on the wiki page
    map(html_table, fill = TRUE) %>%        # Convert each table to df
    keep(~ length(names(.x)) == 10) %>%     # Keep only the table containing fights,
    keep(~ colnames(.x)[1] == "Res.") %>%   # defined by having 10 columns and a first column named "Res."
    bind_rows() %>%
    mutate(fighter = y,                     # Add a new column with the fighter
           Round = as.character(Round))

  Sys.sleep(1) # So as not to bombard the website

  # Sys.sleep() returns NULL invisibly, so the scraped table must be the last
  # expression; otherwise every call yields NULL.
  fight_record
}

library(purrr)
library(dplyr)
library(rvest)

# NOTE(review): the file name ends in a bare "." - the extension looks
# truncated; confirm the intended path.
list_of_fighters <- read_csv("./Data/fighterlinksvect.")

# For each fighter wiki page, pull out the table with their mma record.
# safely() wraps every call so one failing page does not abort the whole run;
# each element of fighter_records is a list with `result` and `error`.
fighter_records <- map2(list_of_fighters$wiki_link,
                        list_of_fighters$name,
                        safely(scrape_fighters))

# There will be errors which need to be looked at.
# The outer list is unnamed, so the component has to be extracted from every
# element with map(); pluck("error") on the outer list would return NULL.
fighter_records %>%
  map("error") %>%
  compact()

# Save the result as a data frame (failed pages contribute a NULL result,
# which bind_rows() skips).
fighter_records_complete <- fighter_records %>%
  map("result") %>%
  bind_rows()

Hypothesis

The more a fighter gets knocked out, the easier they are to knock out in future

“That guy’s chin is shot” Mark Hunt in 2005…. Mark Hunt in 2013 (after having been KO’d a few times)

# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()

# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()

# shorthand for mapping and binding the results into one data frame
map_dfr(mtcars, mean)

# parallelised mapping
# plan(multiprocess) is deprecated in the future package; multisession is the
# replacement that works on every platform.
library(furrr)
plan(multisession)
future_map_dfr(mtcars, mean)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

Past losses Vs proportion of fights won

# Count the knockout losses a fighter had suffered before a given fight and
# pair that count with the result of the fight itself.
#
# date_input:    date of the fight of interest.
# fighter_input: fighter name; compared case-insensitively and trimmed.
# Returns a one-row data.frame with `n_past_kos` and `fight_result`
# (`fight_result` is NA when no fight is found for the fighter/date).
# Reads the `fights_full` data frame from the calling environment.
gen_past_kos <- function(date_input, fighter_input) {

  # The fighter column is normalised below, so the lookup key must be
  # normalised the same way or the filter never matches mixed-case names.
  fighter_key <- tolower(trimws(fighter_input))

  df <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    filter(fighter == fighter_key, date <= date_input) %>%
    arrange(desc(date))

  # Only fights strictly before date_input are "past"; the fight on
  # date_input is the one whose outcome is being reported.
  prior_ko_loss <- df$date < date_input &
    df$result == "Loss" &
    df$method == "TKO/KO"

  data.frame(
    n_past_kos = sum(prior_ko_loss, na.rm = TRUE),
    fight_result = if (nrow(df) > 0) {
      df %>% head(1) %>% select(result) %>% pull()
    } else {
      NA
    }
  )
}

library(furrr)
# plan(multiprocess) is deprecated in the future package; multisession is the
# replacement that works on every platform.
plan(multisession)

# Run gen_past_kos() for every fight in parallel; safely() keeps the run
# going when an individual fight fails.
past_kos <- future_map2(fights_full$date,
                        fights_full$fighter,
                        safely(gen_past_kos),
                        .progress = TRUE)

# past KO losses
# cumsum() alone would count the current fight's own KO loss; lag() shifts the
# running total so each row holds the losses suffered *going into* the fight.
fights_full %>%
  arrange(fighter, date) %>%
  group_by(fighter) %>%
  mutate(past_ko_losses = lag(cumsum(result == "Loss" & method == "TKO/KO"),
                              default = 0L)) %>%
  ungroup() %>%
  group_by(past_ko_losses) %>%
  summarise(win_prop = sum(result == "Win") / n()) %>%
  ggplot(aes(x = past_ko_losses, y = win_prop)) +
  geom_point() +
  geom_smooth() +
  ggtitle("Past knockout losses Vs Proportion of fights won") +
  xlab("Number of past KOs losses going into the fight") +
  ylab("Win Proportion") +
  xlim(0, 15)

Age

The older a fighter gets, the slower they are and the easier they are to get knocked out or submitted

“That guy is past his prime”

Age of MMA fighters Vs likelihood of winning their fight

# Win proportion per whole year of fighter age, plotted with a smoother.
# NOTE(review): rows with a missing win_loss are dropped from the numerator
# but still counted by n(), so they are treated as non-wins - confirm that is
# intended.
static_stats %>%
  group_by(fighter_age = round(fighter_age)) %>%
  summarise(win_proportion = sum(win_loss == 1, na.rm = TRUE) / n()) %>%
  ggplot(aes(x = fighter_age, y = win_proportion)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  ggtitle("Age of MMA fighters vs. likelihood of winning") +
  xlim(17, 40)

If we predicted just using age, we would be accurate 56% of the time.

Reach

The longer a fighter's arms, the better their chance of attacking from a safe distance

“He fights standing just outside the pocket” Mark Hunt in 2005…. Framing the Problem Regression problem?

Other candidate features: Win streak, Home town / country advantage, Past submission wins, Past submission losses, Number of rounds in fight, Title defence, Style (…, BJJ, …)

P(fighter 1 winning) = fighter 1 reach + fighter 1 age + fighter 1 past KO losses + fighter 2 reach + fighter 2 age + fighter 2 past KO losses +

…...

library(rsample)
library(ranger)

# 80/20 train/test split of the modelling data
x <- initial_split(model_data, prop = 0.8)
train <- training(x)
test <- testing(x)

# scaling attributes
# scale() stores the column means and standard deviations as the
# "scaled:center" / "scaled:scale" attributes; they are computed on the
# training set only and reused for both sets below.
train_scaled_attrs <- train %>%
  select(-win_loss) %>%
  keep(is.numeric) %>%
  scale()

# scale train and test set (after removing dep var and character vars)
train_scaled <- train %>%
  select(-win_loss) %>%
  select_if(is.numeric) %>%
  scale(
    attr(train_scaled_attrs, "scaled:center"),
    attr(train_scaled_attrs, "scaled:scale")
  )

test_scaled <- test %>%
  select(-win_loss) %>%
  select_if(is.numeric) %>%
  scale(
    attr(train_scaled_attrs, "scaled:center"),
    attr(train_scaled_attrs, "scaled:scale")
  )

# the IDs for use afterwards - includes dep var and non numeric
train_ids <- train %>%
  select_if(function(col) !is.numeric(col) | all(col == .$win_loss))

test_ids <- test %>%
  select_if(function(col) !is.numeric(col) | all(col == .$win_loss))

# variables to model with
# Columns containing any NA after scaling are dropped from the training data;
# the test data is then restricted to the columns that survived.
train_data <- train_scaled %>%
  data.frame() %>%
  select_if(function(x) all(!is.na(x))) %>%
  bind_cols(train_ids)

test_data <- test_scaled %>%
  data.frame() %>%
  bind_cols(test_ids) %>%
  select(one_of(names(train_data)))

# Random Forest model
# fighter, opponent and date are excluded from the predictors.
rf_mod <- ranger(win_loss ~ . - fighter - opponent - date,
                 data = train_data,
                 importance = "impurity")

# Predict with the model fitted above (the original referred to an undefined
# `rf_mod_bin`).
preds <- predict(rf_mod, test_data)

# NOTE(review): presumably caret::confusionMatrix - it expects the predictions
# as `data` and the observed classes as `reference`; confirm the package.
confusionMatrix(data = preds$predictions,
                reference = as.factor(test_data$win_loss))

Network problem? Time series?

library(tidyverse)
library(geomnet)

# Fighter-vs-opponent network: one edge per row of `df`, nodes coloured by
# weight class, laid out with the Fruchterman-Reingold algorithm.
network_graph <- ggplot(
  data = df,
  mapping = aes(from_id = fighter, to_id = opponent)
) +
  geom_net(
    mapping = aes(colour = weight_class),
    layout.alg = "fruchtermanreingold",
    directed = FALSE,
    labelon = TRUE,
    size = 1,
    fontsize = 1,
    vjust = -0.6,
    ecolour = "grey60",
    ealpha = 0.5
  ) +
  theme_net() +
  theme(legend.position = "bottom")

Time series problem?

library(PlayerRatings)
library(lubridate)
library(tidyverse)

# Fights on or before this date are excluded; time is measured from it.
start_date <- as.Date("2000-01-01")

# Build the four columns passed to elo(): time, fighter, opponent, result.
# The result is coded 1 for a win, 0 for a loss and 0.5 for anything else.
input <- fights_full %>%
  distinct() %>%
  filter(date > start_date) %>%
  select(date, result, fighter, opponent) %>%
  transmute(
    weeks_since_start = time_length(difftime(date, start_date), "weeks"),
    fighter,
    opponent,
    result = case_when(
      result == "Win" ~ 1,
      result == "Loss" ~ 0,
      TRUE ~ 0.5
    )
  )

elo(input, history = TRUE)

Time series problem?

P(fighter winning) = f(Body strikes blocked in last n fights, Opponent body strikes attempted in last n,

Attempted takedowns in last n,

Opponent takedowns defended in last n,

Average fight time of past n fights,

Win streak,

Days since last KO loss,

……)

# Build one row of pre-fight features for a fighter from their fight history
# before a given date.
#
# date_input:          date of the fight being described; only earlier fights
#                      feed the history features.
# fighter_input:       fighter name, compared against the lower-cased, trimmed
#                      `fighter` column of fights_full.
# fighter_or_opponent: prefix pasted onto every output column name.
#
# Returns NULL when the fighter has no earlier fight, otherwise a one-row
# data frame of features.
#
# Relies on objects that are not defined in this block: fights_full,
# fight_statistics, the lookback_* settings and the helpers prop_x_in_last(),
# n_x_in_last() and mean_x_in_last().
generate_past_statistics <- function(date_input, fighter_input, fighter_or_opponent = "fighter") {

  # filter the fights dataframe to show only the fighter in question and data
  # up to their last fight (most recent first)
  df_ <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    dplyr::filter(fighter == fighter_input, date < date_input) %>%
    arrange(desc(date))

  # check if the fighter or opponent names are in the df
  if (nrow(df_) == 0) {
    return(NULL)
  }

  # number of wins and losses
  # NOTE(review): takes the first matching row in fights_full's existing
  # order - confirm that order puts the intended fight first.
  wins_losses <- fights_full %>%
    mutate(fighter = tolower(trimws(fighter))) %>%
    dplyr::filter(fighter == fighter_input, date <= date_input) %>%
    dplyr::filter(row_number() == 1) %>%
    select(wins, losses)

  input_prop_win_loss <- list(
    lookback = lookback_prop_win_loss, # c(2, 4, 6, 8, 16, 32),
    metric = c("Loss", "Win")
  )

  # map function over each element - should I NA values that are higher than
  # the number of fights?
  prop_win_loss <- input_prop_win_loss %>%
    cross_df() %>%
    pmap_dfc(prop_x_in_last)

  # create list of all possible inputs
  input_win_loss_history <- list(
    lookback = lookback_n_x_in_last,
    metric = c("Loss", "Win"),
    method = c("TKO/KO", "Decision")
  )

  # map function over each element
  win_loss_history <- input_win_loss_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # strike history
  input_strike_history <- list(
    lookback = lookback_strike_history,
    method = c("TKO/KO"),
    metric = c("Loss", "Win") # sig strikes, td, etc
  )

  # map function over each element
  # NOTE(review): strike_history is computed but not included in the returned
  # row - confirm whether it should be.
  strike_history <- input_strike_history %>%
    cross_df() %>%
    pmap_dfc(n_x_in_last)

  # calculate the average strikes of different types in the last n matches -
  # n can be tuned
  input_fighter_round_vars <- select(fight_statistics, kd:takedown_accuracy) %>%
    names()

  mean_strikes_dealt <- fight_statistics %>%
    filter(fighter == fighter_input, date < date_input) %>%
    arrange(desc(date)) %>%
    filter(row_number() <= lookback_strike_history) %>%
    summarise_at(input_fighter_round_vars, mean, na.rm = TRUE) %>%
    set_names(paste0("mean_", colnames(.), "_of_last_", lookback_strike_history))

  # calculate the strikes of different types absorbed through the career
  # (30 fights)
  ## select(df, opponent_kd:opponent_takedown_accuracy) %>% names()
  input_opponent_round_vars <- df_ %>%
    select(contains("opponent_")) %>%
    select(contains("_lnd")) %>%
    names()
  new_names <- str_replace_all(input_opponent_round_vars,
                               c("opponent_" = "", "_lnd" = ""))
  strikes_absorbed <- mean_x_in_last(30, input_opponent_round_vars) %>%
    set_names(paste0(new_names, "_absorbed"))

  ### CURRENT WIN STREAK
  # rle lists the runs of equal values in order; df_ is sorted most recent
  # first, so the first run is the current streak
  ws_ls <- rle(as.character(df_$result))

  loss_streak <- 0
  win_streak <- 0
  other_streak <- 0
  if (!is.na(ws_ls$values[1])) {
    if (ws_ls$values[1] == "Loss") {
      loss_streak <- ws_ls$lengths[1]
    } else if (ws_ls$values[1] == "Win") {
      win_streak <- ws_ls$lengths[1]
    } else {
      # a run of any other result is its own streak, not a losing streak
      other_streak <- ws_ls$lengths[1]
    }
  }

  # combine win or loss or other into data frame
  win_loss_streak <- data.frame(
    loss_streak = loss_streak,
    win_streak = win_streak,
    other_streak = other_streak
  )

  ### DAYS SINCE LAST KO LOSS
  # Keep the KO losses first and only then take the top (most recent) row;
  # testing row_number() == 1 in the same filter would only match when the
  # latest fight itself was a KO loss.
  last_ko_loss <- df_ %>%
    filter(result == "Loss", method == "TKO/KO") %>%
    filter(dplyr::row_number() == 1)

  # if the fighter has never been KO'd, put the days in as 1000
  if (nrow(last_ko_loss) == 0) {
    days_since_last_ko <- data.frame(days_since_last_ko_loss = 1000)
  } else {
    # capped at 1000 so it lines up with the never-KO'd value
    days_since_last_ko <- last_ko_loss %>%
      transmute(
        days_since_last_ko_loss =
          as.Date(format(as.Date(date_input, origin = "1970-01-01"))) - date
      ) %>%
      mutate(
        days_since_last_ko_loss = as.numeric(days_since_last_ko_loss),
        days_since_last_ko_loss = ifelse(days_since_last_ko_loss > 1000,
                                         1000,
                                         days_since_last_ko_loss)
      )
  }

  # LAST FIGHT RESULT
  last_fight_result <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(last_fight_result = ifelse(result == "Win", 1, 0))

  # AVERAGE FIGHT TIME
  # NOTE(review): this also yields a total_fights column, which duplicates the
  # one computed under TOTAL FIGHTS below.
  average_fight_time <- df_ %>%
    summarise(
      average_fight_time = mean(total_fight_time_seconds),
      total_fight_time = sum(total_fight_time_seconds),
      total_fights = n()
    )

  # AGE
  # NOTE(review): stores date_input, not an age, and is not included in the
  # returned row.
  Age <- df_ %>%
    filter(row_number() == 1) %>%
    transmute(fighter_age = date_input)

  # TOTAL FIGHTS
  total_fights <- df_ %>%
    summarise(total_fights = n())

  # Performance Bonuses
  n_perf_bonuses <- df_ %>%
    summarise(n_perf_bonuses = sum(performance_bonus))

  # the variables will sometimes be blank if there is no fight history -
  # replace with NAs if so
  first_fight_NA <- function(variable) {
    if (nrow(variable) == 0) {
      variable[1, ] <- NA
    }
    variable
  }

  # apply clean up function and bind everything together
  map_dfc(
    list(
      data.frame(fighter = fighter_input),
      data.frame(date = date_input),
      wins_losses,
      total_fights,
      prop_win_loss,
      win_loss_history,
      win_loss_streak,
      days_since_last_ko,
      last_fight_result,
      n_perf_bonuses,
      average_fight_time,
      mean_strikes_dealt,
      strikes_absorbed
    ),
    .f = first_fight_NA
  ) %>%
    set_names(paste(fighter_or_opponent, colnames(.), sep = "_"))

}

library(furrr)
# plan(multiprocess) is deprecated in the future package; multisession is the
# replacement that works on every platform.
plan(multisession)

# Generate the history features for every fight in parallel, once from the
# fighter's side and once from the opponent's side.
fighter_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$fighter,
    fighter_or_opponent = "fighter"
  ),
  generate_past_statistics,
  .progress = TRUE
)

opponent_stats <- future_pmap_dfr(
  list(
    date_input = as.Date(fights_to_lookback$date),
    fighter_input = fights_to_lookback$opponent,
    fighter_or_opponent = "opponent"
  ),
  generate_past_statistics,
  .progress = TRUE
)

# long winded for loop to extract column means
out <- vector("list", ncol(mtcars))
for (i in seq_len(ncol(mtcars))) {
  out[[i]] <- mean(mtcars[, i])
}
names(out) <- colnames(mtcars)
bind_rows(out)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

# a simpler method using base lapply
lapply(mtcars, mean) %>% bind_cols()

# the purrr version of lapply
map(mtcars, mean) %>% bind_cols()

# shorthand for mapping and binding the results into one data frame
map_dfr(mtcars, mean)

# parallelised mapping
# plan(multiprocess) is deprecated in the future package; multisession is the
# replacement that works on every platform.
library(furrr)
plan(multisession)
future_map_dfr(mtcars, mean)

## # A tibble: 1 x 11
##     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
##
## 1  20.1  6.19  231.  147.  3.60  3.22  17.8 0.438 0.406  3.69  2.81

library(keras)

Time series?

install_keras()

# Feed-forward binary classifier: two ReLU hidden layers (256 and 128 units),
# each followed by 40% dropout, and a single sigmoid output unit.
model <- keras_model_sequential() %>%
  layer_dense(
    units = 256,
    activation = "relu",
    input_shape = ncol(train_x_matrix)
  ) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 1, activation = "sigmoid")

# Binary cross-entropy loss with the Adam optimiser, tracking accuracy.
compile(
  model,
  loss = "binary_crossentropy",
  optimizer = "adam",
  metrics = list("accuracy")
)

# Fit on the training matrix against the win_loss column of train_data,
# holding back 30% of the rows for validation.
fit(
  model,
  train_x_matrix,
  train_data %>% select(win_loss) %>% pull(),
  epochs = 15,
  batch_size = 3,
  validation_split = 0.3
)

Using the model to predict future outcomes

(and make money??)