I’m writing this blog post to briefly discuss the improvements I made to my data.table.threads code by following this rubric that Toby created for a course at NAU.
Before diving into lessons learned, I think it would be good to share the before/after versions of the codebase. Here is what the code looks like: A) before following points from the rubric:
runBenchmarks <- function(rowCount, colCount, threadCount)
{
setDTthreads(threadCount)
dt <- data.table(matrix(runif(rowCount * colCount), nrow = rowCount, ncol = colCount))
threadLabel <- ifelse(threadCount == 1, "thread", "threads")
cat(sprintf("Running benchmarks with %d %s, %d rows, and %d columns.\n", getDTthreads(), threadLabel, rowCount, colCount))
benchmarks <- microbenchmark(
forder = setorder(dt, V1),
GForce_sum = dt[, .(sum(V1))],
subsetting = dt[dt[[1]] > 0.5, ],
frollmean = frollmean(dt[[1]], 10),
fcoalesce = fcoalesce(dt[[1]], dt[[2]]),
between = dt[dt[[1]] %between% c(0.4, 0.6)],
fifelse = fifelse(dt[[1]] > 0.5, dt[[1]], 0),
nafill = nafill(dt[[1]], type = "const", fill = 0),
CJ = CJ(sample(rowCount, size = min(rowCount, 5)), sample(colCount, size = min(colCount, 5))),
times = 100
)
benchmark_summary <- summary(benchmarks)
medianTime <- benchmark_summary$median
names(medianTime) <- benchmark_summary$expr
return(data.frame(threadCount = threadCount, expr = names(medianTime), medianTime = medianTime))
}
findOptimalThreadCount <- function(rowCount, colCount) {
setDTthreads(0)
maxThreads <- getDTthreads()
results <- list()
for (threadCount in 1:maxThreads) {
results[[threadCount]] <- runBenchmarks(rowCount, colCount, threadCount)
}
result.list <- do.call(rbind, results)
class(result.list) <- "data_table_threads_benchmark"
result.list
}
print.data_table_threads_benchmark <- function(x, ...)
{
df <- data.frame(expr = x$expr, threadCount = x$threadCount, meanTime = x$medianTime)
fastestMedianTime <- aggregate(medianTime ~ expr, data = df, FUN = min)
bestPerformingThreadCount <- df[df$expr %in% fastestMedianTime$expr & df$medianTime %in% fastestMedianTime$medianTime, "threadCount"]
results <- data.frame(expr = fastestMedianTime$expr, medianTime = fastestMedianTime$medianTime, threadCount = bestPerformingThreadCount)
cat(sprintf("%-20s %-23s %-12s\n", "data.table function", "Fastest runtime (median)", "Thread count"))
cat(rep("-", 29), "\n")
for(i in seq_len(nrow(results)))
{
cat(sprintf("%-20s %-23f %-12d\n", results$expr[i], results$medianTime[i], results$threadCount[i]))
}
}
plot.data_table_threads_benchmark <- function(x, ...)
{
df <- x
rownames(df) <- NULL
df$speedup <- df$medianTime[df$threadCount == 1] / df$medianTime
setDT(df)
maxSpeedup <- df[, .(threadCount = threadCount[which.max(speedup)], speedup = max(speedup)), by = expr]
idealSpeedup <- seq(1, getDTthreads())
idealSpeedupData <- data.frame(threadCount = 1:getDTthreads(), speedup = idealSpeedup)
subOptimalSpeedupData <- data.frame(threadCount = seq(1, getDTthreads(), length.out = getDTthreads()), speedup = seq(1, getDTthreads()/2, length.out = getDTthreads()))
closestPoints <- data.frame()
for(i in unique(df$expr)) {
dfSubset <- df[df$expr == i, ]
suboptimalSubset <- subOptimalSpeedupData[subOptimalSpeedupData$threadCount %in% dfSubset$threadCount, ]
closestPoint <- dfSubset[which.max(dfSubset$speedup - suboptimalSubset$speedup), ]
closestPoints <- rbind(closestPoints, closestPoint)
}
ggplot(df, aes(x = threadCount, y = speedup)) +
geom_line(aes(linetype = "Measured")) +
geom_line(data = idealSpeedupData, aes(x = threadCount, y = speedup, linetype = "Ideal"), color = "black") +
geom_line(data = subOptimalSpeedupData, aes(x = threadCount, y = speedup, linetype = "Sub-optimal"), color = "black") +
geom_point(data = closestPoints, aes(x = threadCount, y = speedup, shape = "Recommended"), color = "black", size = 2) +
geom_point(data = maxSpeedup, aes(x = threadCount, y = speedup, shape = "Best performing"), color = "red", size = 2) +
geom_text(data = closestPoints, aes(label = threadCount), vjust = -0.5, size = 4, na.rm = TRUE) +
geom_text(data = maxSpeedup, aes(label = threadCount), vjust = -0.5, size = 4, na.rm = TRUE) +
geom_ribbon(aes(ymin = speedup - 0.3, ymax = speedup + 0.3), alpha = 0.5) +
facet_wrap(. ~ expr) +
coord_equal() +
labs(x = "Threads", y = "Speedup", title = "data.table functions", linetype = "Legend") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = 1:getDTthreads(), labels = 1:getDTthreads()) +
scale_linetype_manual(values = c("Measured" = "solid", "Ideal" = "dashed", "Sub-optimal" = "dotted"), guide = "legend") +
scale_shape_manual(values = c("Recommended" = 16, "Best performing" = 19)) +
guides(linetype = guide_legend(override.aes = list(fill = NA), title = "Speedup"), shape = guide_legend(override.aes = list(fill = NA), title = "Thread count"))
}
B) and finally, after following points in the rubric:
runBenchmarks <- function(rowCount, colCount, threadCount)
{
setDTthreads(threadCount)
dt <- data.table(matrix(runif(rowCount * colCount), nrow = rowCount, ncol = colCount))
threadLabel <- ifelse(threadCount == 1, "thread", "threads")
cat(sprintf("Running benchmarks with %d %s, %d rows, and %d columns.\n", getDTthreads(), threadLabel, rowCount, colCount))
benchmarks <- microbenchmark(
forder = setorder(dt, V1),
GForce_sum = dt[, .(sum(V1))],
subsetting = dt[dt[[1]] > 0.5, ],
frollmean = frollmean(dt[[1]], 10),
fcoalesce = fcoalesce(dt[[1]], dt[[2]]),
between = dt[dt[[1]] %between% c(0.4, 0.6)],
fifelse = fifelse(dt[[1]] > 0.5, dt[[1]], 0),
nafill = nafill(dt[[1]], type = "const", fill = 0),
CJ = CJ(sample(rowCount, size = min(rowCount, 5)), sample(colCount, size = min(colCount, 5))),
times = 100
)
benchmarkSummary <- summary(benchmarks)
medianTime <- benchmarkSummary$median
exprNames <- benchmarkSummary$expr
data.table(threadCount = threadCount, expr = exprNames, medianTime = medianTime)
}
findOptimalThreadCount <- function(rowCount, colCount)
{
setDTthreads(0)
maxThreads <- getDTthreads()
results <- list()
for (threadCount in 1:maxThreads) {
results[[threadCount]] <- runBenchmarks(rowCount, colCount, threadCount)
}
results.dt <- rbindlist(results)
setattr(results.dt, "class", c("data_table_threads_benchmark", class(results.dt)))
results.dt
}
print.data_table_threads_benchmark <- function(x, ...)
{
fastestMedianTime <- x[, .(medianTime = min(medianTime)), by = expr]
bestPerformingThreadCount <- x[fastestMedianTime, on = .(expr, medianTime), .(expr, threadCount, medianTime)]
results <- bestPerformingThreadCount
cat(sprintf("%-20s %-23s %-12s\n", "data.table function", "Fastest runtime (median)", "Thread count"))
cat(rep("-", 29), "\n")
for (i in seq_len(nrow(results)))
{
cat(sprintf("%-20s %-23f %-12d\n", results$expr[i], results$medianTime[i], results$threadCount[i]))
}
}
plot.data_table_threads_benchmark <- function(x, ...)
{
x[, `:=`(
speedup = medianTime[threadCount == 1] / medianTime,
type = "Measured"
), by = expr]
maxSpeedup <- x[, .(threadCount = threadCount[which.max(speedup)],
speedup = max(speedup)), by = expr]
idealSpeedup <- x[, .(threadCount = 1:getDTthreads(),
speedup = seq(1, getDTthreads()),
type = "Ideal",
medianTime = NA), by = expr]
subOptimalSpeedup <- x[, .(threadCount = seq(1, getDTthreads(), length.out = getDTthreads()),
speedup = seq(1, getDTthreads() / 2, length.out = getDTthreads()),
type = "Sub-optimal",
medianTime = NA), by = expr]
combinedLineData <- rbindlist(list(idealSpeedup, subOptimalSpeedup, x), use.names = TRUE, fill = TRUE)
closestPoints <- x[, {
suboptimalSpeedupSubset <- subOptimalSpeedup[expr == .BY$expr]
.SD[.(threadCount = suboptimalSpeedupSubset$threadCount), on = .(threadCount), nomatch = 0L]
.SD[which.max(speedup - suboptimalSpeedupSubset$speedup)]
}, by = expr]
closestPoints[, `:=`(
medianTime = NULL,
type = "Recommended"
)]
maxSpeedup[, type := "Best performing"]
combinedPointData <- rbind(maxSpeedup, closestPoints)
x[, `:=`(
minSpeedup = min(speedup),
maxSpeedup = max(speedup)
), by = expr]
ggplot(x, aes(x = threadCount, y = speedup)) +
geom_line(data = combinedLineData, aes(color = type), size = 1) +
geom_point(data = combinedPointData, aes(color = type), size = 3) +
geom_text(data = combinedPointData, aes(label = threadCount), vjust = -0.5, size = 4, na.rm = TRUE) +
geom_ribbon(aes(ymin = minSpeedup, ymax = maxSpeedup), alpha = 0.5) +
facet_wrap(. ~ expr) +
coord_equal() +
labs(x = "Threads", y = "Speedup", title = "data.table functions") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(breaks = 1:getDTthreads(), labels = 1:getDTthreads()) +
scale_color_manual(values = c("Measured" = "black", "Ideal" = "#f79494", "Sub-optimal" = "#93c4e0", "Recommended" = "#93c4e0", "Best performing" = "#f79494")) +
guides(color = guide_legend(override.aes = list(fill = NA), title = "Type"))
}
There were different takeaways from the rubric for me, so I compiled a list of actions I took as a part of following them:
- Removed distinction of lines and points in the plot and legend based on type (like solid/dashed/dotted for linetype) and shapes (16/19 for points) and used color only. (Rubric point: Using multiple visual properties for the same data variable can be potentially confusing)
- Refactored the plot method to use perform calls to
geom_line
,geom_point
, andgeom_text
only once each (apart from other changes to the scales to accommodate them). (Rubric point: Multiple geoms should be avoided in a ggplot when you could use a single geom with a bigger data set) - Used more informative names and followed consistent casing. (Rubric point: Use more informative names that are specific to your application)
- Removed repitition by having the terms ‘Speedup’ and ‘Thread Count’ in the plot legend’s titles instead of the values.
- Using
threadCount
andspeedup
instead ofx
andy
for column names in thedata.table
that I’m plotting, towards more informative names.
Exceptions:
- The rubric suggests to avoid single-letter variable names, but given that
x
is the default parameter name for the S3 plot method (signatureplot(x, y, ...)
, and likewise,print(x, ...)
for print), and in order to not avoid adding an unnecessary step/variable likedt <- x
, I’m continuining with use ofx
in my plot method. Toby said this was okay when I brought it up.
Todo:
- Using constants on the right-hand side (value) of
aes
that will result in a legend.