You can read my notes on (initial attempt at) creating speed ratings here. I haven't tested if they have any predictive power; however I'm happy to provide the SQL and R code below.

# sample races at four all-weather UK race courses between 2006 and 2009 with horses aged at least 3 years

USE smartform;

DROP TABLE IF EXISTS sample_races;

CREATE TABLE sample_races AS (SELECT a.race_id, MIN(COALESCE(age, 0)) AS min_age FROM

(SELECT

race_id

FROM

historic_races

WHERE

course IN ('Kempton' , 'Lingfield', 'Southwell', 'Wolverhampton')

AND race_type_id = 15

AND YEAR(meeting_date) between 2006 and 2009

AND class IS NOT NULL

AND going = 'Standard') a

INNER JOIN

historic_runners b ON a.race_id = b.race_id

GROUP BY race_id

ORDER BY race_id);

DELETE FROM sample_races

WHERE

min_age < 3;

#

ALTER TABLE sample_races

DROP min_age;

setwd("~/smartform data/speed ratings")

source("load.R")

source("clean.R")

source("mordin_stdtimes.R")

source("mordin_speedratings.R")

# load.R

# Sample races at four all-weather UK race courses between 2006 and 2009 with horses aged at least 3 years.

library("RODBC")

ch <- odbcConnect("smartform")

races <- sqlQuery(ch, "SELECT a.race_id, meeting_date, course, class, distance_yards, winning_time_secs FROM sample_races a INNER JOIN historic_races b ON a.race_id = b.race_id order by course, class, distance_yards", as.is = T)

races <- transform(races, class = as.numeric(class))

runners <- sqlQuery(ch, "SELECT a.race_id, name, sire_name, finish_position, distance_behind_winner FROM sample_races a INNER JOIN historic_runners b ON a.race_id = b.race_id", as.is = T)

odbcClose(ch)

rm(ch)

# clean.R

# The IQR is the length of the box in the box-and-whisker plot for a given course and distance. An outlier is any value that lies more than one and a half times the length of the box from either end of the box. That is, if a data point is below Q1 – 1.5×IQR or above Q3 + 1.5×IQR, it is viewed as being too far from the central values to be reasonable.

whisker.lo <- aggregate(races$winning_time_secs, list(course = races$course, distance_yards = races$distance_yards), FUN = function(x) { boxplot.stats(x)$stats[1] })

races <- merge(races, whisker.lo, by = c("course", "distance_yards"), all.x = T)

rm(whisker.lo)

races <- races[races$winning_time_secs >= races$x, c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]

whisker.hi <- aggregate(races$winning_time_secs, list(course = races$course, distance_yards = races$distance_yards), FUN = function(x) { boxplot.stats(x)$stats[5] })

races <- merge(races, whisker.hi, by = c("course", "distance_yards"), all.x = T)

rm(whisker.hi)

races <- races[races$winning_time_secs <= races$x, c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]

# mording_stdtimes.R

# Implement procedure described in "Mordin on Time" for calculating standard times. Exclude courses and distances with fewer than 20 races races.

combos <- as.data.frame(table(races[, c("course", "distance_yards")]))

combos <- combos[combos$Freq >= 20,]

races <- merge(races, combos, by = c("course", "distance_yards"), all.x = T)

rm(combos)

races <- races[!is.na(races$Freq), c("race_id", "meeting_date", "course", "class", "distance_yards", "winning_time_secs")]

rownames(races) <- NULL

# Calculate standard times, deducting 2.50 seconds per mile from races in slowest class and linearly-proportionate numbers of seconds from races between slowest and fastest classes

stdtimes <- aggregate(races$winning_time_secs - (races$class - 1) * 2.5 / 6.0 / races$distance_yards * 1760.0, by = list(races$course, races$distance), FUN = median)

cnts <- aggregate(races$winning_time_secs, by = list(races$course, races$distance), FUN = length)

stdtimes <- cbind(stdtimes, cnts[, 3])

rm(cnts)

names(stdtimes) <- c("course", "distance_yards", "stdtime", "cnt")

# mordin_speedratings.R

# Implement procedure described in "Mordin on Time" for calculating speed ratings.

runners <- merge(runners, races, by = c("race_id"))

runners <- merge(runners, stdtimes[, c("course", "distance_yards", "stdtime")], by = c("course", "distance_yards"))

wt.norm <- ifelse(runners$finish_position == 1, runners$winning_time_secs - (runners$class - 1) * 2.5 / 6.0 / runners$distance_yards * 1760.0 - runners$stdtime, NA)

wt.norm <- round(wt.norm / runners$distance_yards * 1760.0, 2)

runners <- cbind(runners, wt.norm)

rm(wt.norm)

going <- runners[!is.na(runners$wt.norm), c("meeting_date", "course", "wt.norm")]

going <- going[order(going$meeting_date, going$course), ]

going <- aggregate(going$wt.norm, list(meeting_date = going$meeting_date, course = going$course), FUN = mean, trim = 23/42)

names(going) <- c("meeting_date", "course", "going")

runners <- merge(runners, going, by = c("meeting_date", "course"))

rm(going)

runners$winner.speed.rating <- round(100 - 5 * (runners$wt.norm - runners$going))

runners <- merge(runners, runners[!is.na(runners$winner.speed.rating), c("race_id", "winner.speed.rating")], by = c("race_id"))

names(runners)[seq(from = length(runners), by = -1, length = 2)] <- c("winning.speed.rating", "speed.rating")

runners$speed.rating <- round(runners$winning.speed.rating - runners$distance_behind_winner / (runners$distance_yards / 1760))

runners$speed.rating[is.na(runners$speed.rating)] <- runners$winning.speed.rating[is.na(runners$speed.rating)]