You can read my notes on (initial attempt at) creating speed ratings here. I haven't tested if they have any predictive power; however I'm happy to provide the SQL and R code below.
# sample races at four all-weather UK race courses between 2006 and 2009 with horses aged at least 3 years
USE smartform;
DROP TABLE IF EXISTS sample_races;
CREATE TABLE sample_races AS (SELECT a.race_id, MIN(COALESCE(age, 0)) AS min_age FROM
(SELECT
race_id
FROM
historic_races
WHERE
course IN ('Kempton' , 'Lingfield', 'Southwell', 'Wolverhampton')
AND race_type_id = 15
AND YEAR(meeting_date) between 2006 and 2009
AND class IS NOT NULL
AND going = 'Standard') a
INNER JOIN
historic_runners b ON a.race_id = b.race_id
GROUP BY race_id
ORDER BY race_id);
DELETE FROM sample_races
WHERE
min_age < 3;
#
ALTER TABLE sample_races
DROP min_age;
setwd("~/smartform data/speed ratings")
source("load.R")
source("clean.R")
source("mordin_stdtimes.R")
source("mordin_speedratings.R")
# load.R
# Sample races at four all-weather UK race courses between 2006 and 2009 with horses aged at least 3 years.
library("RODBC")
ch <- odbcConnect("smartform")
races <- sqlQuery(ch, "SELECT a.race_id, meeting_date, course, class, distance_yards, winning_time_secs FROM sample_races a INNER JOIN historic_races b ON a.race_id = b.race_id order by course, class, distance_yards", as.is = T)
races <- transform(races, class = as.numeric(class))
runners <- sqlQuery(ch, "SELECT a.race_id, name, sire_name, finish_position, distance_behind_winner FROM sample_races a INNER JOIN historic_runners b ON a.race_id = b.race_id", as.is = T)
odbcClose(ch)
rm(ch)
# clean.R
# The IQR is the length of the box in the box-and-whisker plot for a given course and distance. An outlier is any value that lies more than one and a half times the length of the box from either end of the box. That is, if a data point is below Q1 – 1.5×IQR or above Q3 + 1.5×IQR, it is viewed as being too far from the central values to be reasonable.
whisker.lo <- aggregate(races$winning_time_secs, list(course = races$course, distance_yards = races$distance_yards), FUN = function(x) { boxplot.stats(x)$stats[1] })
races <- merge(races, whisker.lo, by = c("course", "distance_yards"), all.x = T)
rm(whisker.lo)
races <- races[races$winning_time_secs >= races$x, c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]
whisker.hi <- aggregate(races$winning_time_secs, list(course = races$course, distance_yards = races$distance_yards), FUN = function(x) { boxplot.stats(x)$stats[5] })
races <- merge(races, whisker.hi, by = c("course", "distance_yards"), all.x = T)
rm(whisker.hi)
races <- races[races$winning_time_secs <= races$x, c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]
# mording_stdtimes.R
# Implement procedure described in "Mordin on Time" for calculating standard times. Exclude courses and distances with fewer than 20 races races.
combos <- as.data.frame(table(races[, c("course", "distance_yards")]))
combos <- combos[combos$Freq >= 20,]
races <- merge(races, combos, by = c("course", "distance_yards"), all.x = T)
rm(combos)
races <- races[!is.na(races$Freq), c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]
rownames(races) <- NULL
# Calculate standard times, deducting 2.50 seconds per mile from races in slowest class and linearly-proportionate numbers of seconds from races between slowest and fastest classes
stdtimes <- aggregate(races$winning_time_secs - (races$class - 1) * 2.5 / 6.0 / races$distance_yards * 1760.0, by = list(races$course, races$distance), FUN = median)
cnts <- aggregate(races$winning_time_secs, by = list(races$course, races$distance), FUN = length)
stdtimes <- cbind(stdtimes, cnts[, 3])
rm(cnts)
names(stdtimes) <- c("course", "distance_yards", "stdtime", "cnt")
# mordin_speedratings.R
# Implement procedure described in "Mordin on Time" for calculating speed ratings.
runners <- merge(runners, races, by = c("race_id"))
runners <- merge(runners, stdtimes[, c("course", "distance_yards", "stdtime")], by = c("course", "distance_yards"))
wt.norm <- ifelse(runners$finish_position == 1, runners$winning_time_secs - (runners$class - 1) * 2.5 / 6.0 / runners$distance_yards * 1760.0 - runners$stdtime, NA)
wt.norm <- round(wt.norm / runners$distance_yards * 1760.0, 2)
runners <- cbind(runners, wt.norm)
rm(wt.norm)
going <- runners[!is.na(runners$wt.norm), c("meeting_date", "course", "wt.norm")]
going <- going[order(going$meeting_date, going$course), ]
going <- aggregate(going$wt.norm, list(meeting_date = going$meeting_date, course = going$course), FUN = mean, trim = 23/42)
names(going) <- c("meeting_date", "course", "going")
runners <- merge(runners, going, by = c("meeting_date", "course"))
rm(going)
runners$winner.speed.rating <- round(100 - 5 * (runners$wt.norm - runners$going))
runners <- merge(runners, runners[!is.na(runners$winner.speed.rating), c("race_id", "winner.speed.rating")], by = c("race_id"))
names(runners)[seq(from = length(runners), by = -1, length = 2)] <- c("winning.speed.rating", "speed.rating")
runners$speed.rating <- round(runners$winning.speed.rating - runners$distance_behind_winner / (runners$distance_yards / 1760))
runners$speed.rating[is.na(runners$speed.rating)] <- runners$winning.speed.rating[is.na(runners$speed.rating)]