Original article:
Evaluation of Sampling Methods for Scatterplots
Jun Yuan, Shouxing Xiang, Jiazhi Xia, Lingyun Yu, and Shixia Liu
https://arxiv.org/pdf/2007.14666.pdf
Repository with this analysis, the writeup, and the original data:
https://osf.io/hsujr/
# The first time you knit this markdown, you'll need to call install.packages("PACKAGE") for each library call
library(tidyverse) # lots of basic functionality
library(patchwork) # combine plots
library(ggtext) # for colored text
library(glue) # for colored text
library(ragg) # nicer figure output
library(ggdist) # stat_histinterval used in final figure
library(broom) # broom::tidy() puts the anova in a manageable dataframe
library(MBESS) # dependency for apaTables
library(apaTables) # get.ci.partial.eta.squared() gives confidence intervals for the effect sizes
# alpha value of the original article
ALPHA = 0.05
# How many bootstrap iterations to do
BOOT_COUNT = 10 * 1000
# simplify everything when debugging
DEBUG = FALSE
# trial data folder
DATA_FOLDER_TRIALS = "libsampling/trial data/publish"
DATA_FOLDER_ANSWERS = "libsampling/trial data/answer"
# folder for hi res figures and other things
OUTPUT_FOLDER = "paper/"
# for reproducibility
set.seed(1234)
Load the response data from experiment 1.
filenames = list.files(DATA_FOLDER_TRIALS, pattern="*.txt", full.names = T)
# all the experiments are stored together. Starting with experiment 1
EXPERIMENT_1_RANGE = 4:115
# function to read in all the trials
read_trial_data = function(filename) {
text = read_lines(filename)
# read the json line by line
json = lapply(text[EXPERIMENT_1_RANGE], function(line) {
# print(line)
trial = jsonlite::fromJSON(line)
# flatten the answers info
answers = trial$problem_answer
for (a in names(answers)) {
trial[a] = answers[a]
}
trial$problem_answer = NULL
# flatten multi-facetted answers
for (name in names(trial)) {
# drop empties
if (length(trial[name][[1]]) == 0)
trial[name] = NA
# flatten vectors
if (length(trial[name][[1]]) > 1) {
trial[name] = paste(trial[[name]], collapse = ",")
}
}
# add subject name
trial$subjectID = basename(filename) %>% str_split(fixed("."), simplify = TRUE) %>% `[`(1)
# return
trial
})
data = bind_rows(json)
return(data)
}
# apply the function to each filename and merge them together
data = lapply(filenames, read_trial_data) %>%
bind_rows()
# cleanup
rm(EXPERIMENT_1_RANGE, filenames, read_trial_data)
Load the correct answers. This is slow because of all the string manipulation.
filenames = list.files(DATA_FOLDER_ANSWERS, pattern="*.txt", full.names = T)
# function to read the correct answers for each subject
read_correct_answers = function(filename) {
lines = read_lines(filename)
lines = lapply(lines, function(line) {
line %>%
str_replace(fixed("random sampling"), fixed("random_sampling")) %>%
str_replace(fixed("blue noise sampling"), fixed("blue_noise_sampling")) %>%
str_replace(fixed("density biased sampling"), fixed("density_biased_sampling")) %>%
str_replace(fixed("multi-view Z order sampling"), fixed("multi-view_Z_order_sampling")) %>%
str_replace(fixed("multi-class blue_noise_sampling"), fixed("multi-class_blue_noise_sampling")) %>%
str_replace(fixed("outlier biased density based sampling"),fixed("outlier_biased_density_based_sampling"))%>%
str_replace(fixed("recursive subdivision based sampling"), fixed("recursive_subdivision_based_sampling")) %>%
str_split(fixed(" "), simplify = T)
})
# drop the experiment description line
lines = lines[-1]
# drop experiment 2 onward
start_of_experiment_2 = which(sapply(lines, length) != 3)[1]
lines = lines[1:(start_of_experiment_2 - 1)]
# make into tibble
answers = lapply(
lines,
function(line) {
colnames(line) = c("problem_dataset", "problem_sampling", "actual_sort_index");
as_tibble(line)
}) %>%
bind_rows()
answers = answers %>% separate(problem_dataset, c("problem_dataset", "level"), sep = "-")
# drop synthetic condition
answers = answers %>% filter(!str_detect(problem_dataset, fixed("synthetic")))
# add subject name
answers$subjectID = basename(filename) %>% str_split(fixed("."), simplify = TRUE) %>% `[`(1)
return(answers)
}
# apply the loader to each filename and merge
correct_answers = lapply(filenames, read_correct_answers) %>% bind_rows()
# cleanup
rm(filenames, read_correct_answers)
Join the responses to the correct answers and compare.
# filter out answers for missing subjects (why missing???)
correct_answers = correct_answers %>% filter(subjectID %in% unique(data$subjectID))
# merge correct answers and data
correct_answers = correct_answers %>% mutate(problem_sampling = problem_sampling %>%
str_replace(fixed("random_sampling"), fixed("random sampling")) %>%
str_replace(fixed("blue_noise_sampling"), fixed("blue noise sampling")) %>%
str_replace(fixed("density_biased_sampling"), fixed("density biased sampling")) %>%
str_replace(fixed("multi-view_Z_order_sampling"), fixed("multi-view Z order sampling")) %>%
str_replace(fixed("multi-class_blue noise sampling"), fixed("multi-class blue noise sampling")) %>%
str_replace(fixed("outlier_biased_density_based_sampling"), fixed("outlier biased density based sampling")) %>%
str_replace(fixed("recursive_subdivision_based_sampling"), fixed("recursive subdivision based sampling"))
)
correct_answers = correct_answers %>% mutate(level = as.numeric(level))
data = left_join(data, correct_answers)
# check if answer is correct
data = data %>% mutate(
isCorrect = sort_index == actual_sort_index,
isCorrect2 = sort_index != actual_sort_index) # don't know why, but the reverse seems to match the paper's results
# output processed data
# you can load this file to start from here: data = read_csv("Experiment 1 processed.csv")
data %>% write_csv(paste0(OUTPUT_FOLDER, "Experiment 1 processed.csv"))
# cleanup
rm(correct_answers)
Make some variables more presentable.
data = data %>% mutate(level = paste("stimulus", level))
data = data %>% mutate(subjectID = paste0("S", subjectID))
Make the labels, colors, and ordering match the original article. It’s easiest to take care of this at the beginning, so we don’t have to redo it for each graph.
# make initials and ordering match the original article
data = data %>%
# get initials
rowwise() %>%
mutate(problem_sampling_initials = paste(str_sub(str_split(problem_sampling, " ", simplify = T), 1, 1), collapse="")) %>%
mutate(problem_sampling_initials = str_to_upper(problem_sampling_initials)) %>%
ungroup() %>%
# tweak to match the original
mutate(
problem_sampling_initials = ifelse(problem_sampling_initials == "MZOS", "MVZS", problem_sampling_initials),
problem_sampling_initials = ifelse(problem_sampling_initials == "MBNS", "MCBNS", problem_sampling_initials),
)
# match colors of original graphs
label_colors = tibble(
problem_sampling_initials = c("RS","BNS","DBS","MVZS","MCBNS","OBDBS","RSBS"),
color = c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2")
)
# match order of original graphs
label_order = c("RS","BNS","DBS","MVZS","MCBNS","OBDBS","RSBS")
label_colors = label_colors %>%
mutate(problem_sampling_initials = factor(problem_sampling_initials, levels = label_order))
data = data %>%
mutate(problem_sampling_initials = factor(problem_sampling_initials, levels = label_order))
# pretty labels and colors for graphs
data = data %>%
left_join(label_colors) %>%
mutate(label = glue("<span style='color:{color}'>{problem_sampling_initials}</span>") ) %>%
# match label's factor order to the original's order
arrange(problem_sampling_initials) %>%
mutate(label = fct_inorder(factor(label)))
Recreate figure 7 from “Evaluation of Sampling Methods for Scatterplots”. Importantly, this original approach ignores that the experiment was run within-subjects and with multiple trials per condition. It visualizes the results without regard to which subject ran which trial. Therefore, these results should be very cautiously interpreted.
Some helper functions
# R doesn't have a built-in standard error function (note: this masks base::stderr)
stderr = function(x) sqrt(var(x, na.rm = TRUE) / length(na.omit(x)))
# half-width of a normal-approximation confidence interval
ci = function(x, confidence.interval = 1 - ALPHA) stderr(x) * qnorm(1 - (1 - confidence.interval) / 2)
Aggregate the trial data by condition into means and 95% confidence intervals, ignoring subjects and other variables.
data_ignore_subjects = data %>%
group_by(problem_sampling_initials, color) %>%
summarise(
accuracy2_lo = mean(isCorrect2) - ci(isCorrect2),
accuracy2_hi = mean(isCorrect2) + ci(isCorrect2),
accuracy2 = mean(isCorrect2),
problem_timespend_lo = mean(problem_timespend) - ci(problem_timespend),
problem_timespend_hi = mean(problem_timespend) + ci(problem_timespend),
problem_timespend = mean(problem_timespend),
)
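As an aside (my own addition, not part of recreating the original figure), here is a minimal sketch of what subject-level aggregation could look like: average each subject’s trials within a condition first, then summarize across those subject means. This still isn’t a full within-subjects analysis (the baseline approach later in this document handles that), but it avoids treating repeated trials from the same subject as independent.
# Sketch only: one mean per subject and condition first, then CIs across the subject-level means
data_by_subject = data %>%
group_by(problem_sampling_initials, color, subjectID) %>%
summarise(
accuracy2 = mean(isCorrect2),
problem_timespend = mean(problem_timespend),
.groups = "drop_last"
) %>%
summarise(
accuracy2_lo = mean(accuracy2) - ci(accuracy2),
accuracy2_hi = mean(accuracy2) + ci(accuracy2),
accuracy2 = mean(accuracy2),
problem_timespend_lo = mean(problem_timespend) - ci(problem_timespend),
problem_timespend_hi = mean(problem_timespend) + ci(problem_timespend),
problem_timespend = mean(problem_timespend),
.groups = "drop"
)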
Recreate original’s figure 7
# Accuracy
ggplot(data_ignore_subjects) +
aes(x = problem_sampling_initials, y = accuracy2, color=color) +
geom_pointrange(size=4, aes(ymin=accuracy2_lo, ymax=accuracy2_hi)) +
scale_color_identity() +
theme_classic(42) + theme(plot.margin = margin(0,0,20,0)) +
guides(color = "none") +
labs(title = "Accuracy", x = NULL, y = NULL) +
# point out a pair of similar CIs
annotate("curve", xend="RS", yend=.979, x=1.5, y=.96, curvature=-0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
annotate("curve", xend="BNS", yend=.977, x=1.5, y=.96, curvature=0.20, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
annotate("text", x=1.5, y=.957, color="red", size=12, hjust=0.5, label="different?") +
# Speed
ggplot(data_ignore_subjects) +
aes(x = problem_sampling_initials, y = problem_timespend, color=color) +
geom_pointrange(size=4, aes(ymin=problem_timespend_lo, ymax=problem_timespend_hi)) +
scale_color_identity() +
theme_classic(42) + theme(plot.margin = margin(0,0,20,0)) +
guides(color = "none") +
labs(title = "Response time (ms)", x = NULL, y = NULL) +
# point out a pair of similar CIs
annotate("curve", xend="RS", yend=3050, x=1.9, y=3350, curvature=0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
#annotate("curve", xend="BNS", yend=3220, x=1.9, y=3350, curvature=0, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round", linejoin = "mitre") +
annotate("curve", xend="DBS", yend=3150, x=1.9, y=3350, curvature=-0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
annotate("text", x=2, y=3385, color="red", size=12, hjust=0.5, label="different?") +
# Caption
plot_annotation(
caption = "Interpret with caution: This visualization aggregates the data in a questionable way.",
theme = theme(plot.caption = element_text(size=42, color = "#C03333")) )
# Get mean accuracy per subject
accuracy = data %>%
group_by(subjectID) %>%
summarize(accuracy = mean(isCorrect2))
glue::glue("Average accuracy: {mean(accuracy$accuracy) * 100}%")
## Average accuracy: 96.625%
# Check if accuracy varied substantially per condition
data %>%
group_by(problem_sampling_initials, problem_dataset, level) %>%
summarize(accuracy = mean(isCorrect2), .groups = 'drop') %>%
ggplot() +
aes(y = problem_sampling_initials, x = accuracy) +
geom_col() +
geom_vline(xintercept = 0.9, size=1) + # 90% accuracy
facet_grid(level ~ problem_dataset) +
scale_x_continuous(labels = scales::percent, breaks = c(0,.5,.9), limits = 0:1, expand = c(0,0)) +
labs(y=NULL, title="Does accuracy vary much by condition?") +
theme_light(18)
# where do errors occur
data %>% mutate(dataset_is_clothes = problem_dataset == "clothes") %>%
group_by(dataset_is_clothes, level) %>%
summarize(errors = sum(1-isCorrect2), .groups = 'drop') %>%
mutate(errors_proportion = errors/sum(errors))
| dataset_is_clothes | level | errors | errors_proportion |
|---|---|---|---|
| FALSE | stimulus 0 | 100 | 0.2645503 |
| FALSE | stimulus 1 | 0 | 0.0000000 |
| TRUE | stimulus 0 | 166 | 0.4391534 |
| TRUE | stimulus 1 | 112 | 0.2962963 |
Accuracy is quite high overall, suggesting a likely “ceiling effect”: the manipulations impact response time much more than accuracy. However, there is substantial variation in accuracy across datasets and stimulus numbers, suggesting we shouldn’t collapse across them.
We will compare each subject’s performance with each sampling technique to that subject’s Random Sampling (RS) performance. First, find each subject’s average RS performance for each measure (response time and accuracy). Then subtract that RS baseline from each of the subject’s trials.
The result is one row per subject * condition. All the RS values will be 0, because RS - RS == 0.
# use "random sampling" as the baseline
baseline = data %>%
filter(problem_sampling_initials == "RS") %>%
group_by(subjectID, problem_dataset, level) %>%
summarise(accuracy2_baseline = mean(isCorrect2), problem_timespend_baseline = mean(problem_timespend)) %>%
ungroup()
# subtract baseline from others
data = data %>%
left_join(baseline) %>%
mutate(relative_accuracy2 = isCorrect2 - accuracy2_baseline) %>%
mutate(relative_timespend = problem_timespend - problem_timespend_baseline)
Normality tests should be applied to residuals, not the raw data. So we can take the data from each condition (sampling * dataset * level) and standardize it independently. Because we have already subtracted each subject’s RS baseline from the response times, the subject variable is already incorporated.
# for each condition, scale it to mean=0, sd=1
scaled_timespend = data %>%
filter(problem_sampling_initials != "RS") %>%
group_by(problem_sampling_initials, problem_dataset, level) %>%
mutate(relative_timespend = scale(relative_timespend)[,1]) %>%
ungroup()
ggplot(scaled_timespend) +
aes(x = relative_timespend) +
geom_density(size=1) +
scale_x_continuous(limits = c(-2,2)) + # with 11k values, the tails are long
labs(title="Residual distribution") +
theme_light(18)
The distribution appears leptokurtic, but it’s fairly symmetrical, having only a small amount of skew. Jobson (1991) demonstrated that skew is much more of a concern than kurtosis:
In comparison to the skewness measure, the kurtosis measure therefore seems to have little influence on the first two moments of the t-ratio. Thus non-normality resulting in skewness seems to be more important than non-normality resulting in kurtosis.
- Jobson, J.D. (1991). Applied multivariate data analysis: Vol. 1. Regression and experimental design. (p. 55)
As these residuals have minimal skew, aren’t bimodal, and don’t resemble other problematic distributions, they are unlikely to impact false positive rates.
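To put rough numbers on that impression (my own addition, not from the original analysis), we can compute the sample skewness and excess kurtosis of the scaled residuals. Skewness near 0 with positive excess kurtosis would match the “leptokurtic but symmetric” description above.
# sketch: sample skewness and excess kurtosis of the scaled residuals
x = scaled_timespend$relative_timespend
z = (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
c(
skewness = mean(z^3, na.rm = TRUE), # ~0 indicates symmetry
excess_kurtosis = mean(z^4, na.rm = TRUE) - 3 # >0 indicates heavier tails than normal
)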
A more formal test could still be helpful, though. Unfortunately, tests like Shapiro-Wilk will almost always report a significant deviation from normality with over 11k trials. So let’s simulate the false positive rate by sampling from this distribution. If we run a t-test on samples drawn from this distribution (a null effect), we expect a p-value below 0.05 to occur 5% of the time. Deviation from 5% would suggest that standard ANOVAs are not well suited and non-parametric analyses are needed.
Let’s run many simulations, and see if the result is substantially different from 0.05:
replicate(
BOOT_COUNT, # simulation count
t.test(sample(scaled_timespend$relative_timespend, replace = TRUE))$p.value < 0.05
) %>% mean()
## [1] 0.0485
Since the result is close to 0.05, we don’t need to worry about a substantially inflated false-positive rate. So let’s run an ANOVA. Note that we can use the raw response times rather than the relative response times because the ANOVA computes relative differences as part of the process.
This is slow due to the many conditions (112) and subjects (100); it takes ~10 minutes.
# compute the ANOVA
# takes 10+ minutes on my computer :(
anova_table = aov(
problem_timespend ~ problem_sampling*problem_dataset*level + Error(subjectID/(problem_sampling*problem_dataset*level)),
data
)
# tidy it up
anova_table = anova_table %>% broom::tidy()
# split up error component from variables
residuals = anova_table %>%
filter(term == "Residuals") %>%
select(-term, -statistic, -p.value, -meansq) %>%
rename(df2 = df, error_ss = sumsq)
anova_table = anova_table %>%
filter(term != "Residuals") %>%
select(-term, -meansq) %>%
rename(df1 = df)
# merge them back
anova_table = anova_table %>% left_join(residuals)
# compute partial eta^2
anova_table = anova_table %>% mutate(pes = sumsq / (sumsq + error_ss))
# add confidence intervals to the effect sizes
anova_table %>%
rowwise() %>%
mutate(pes_lo = get.ci.partial.eta.squared(statistic, df1, df2)$LL) %>%
mutate(pes_hi = get.ci.partial.eta.squared(statistic, df1, df2)$UL) %>%
mutate(across(c(statistic, p.value, pes, pes_lo, pes_hi, sumsq, error_ss), ~format(.x, digits=3)))
| stratum | df1 | sumsq | statistic | p.value | df2 | error_ss | pes | pes_lo | pes_hi |
|---|---|---|---|---|---|---|---|---|---|
| subjectID:problem_sampling | 6 | 2.13e+08 | 8.52 | 6.75e-09 | 594 | 2.48e+09 | 0.0793 | 0.0407 | 0.107 |
| subjectID:problem_dataset | 7 | 4.18e+09 | 84.2 | 2.6e-88 | 693 | 4.92e+09 | 0.459 | 0.412 | 0.493 |
| subjectID:level | 1 | 2.87e+08 | 26.4 | 1.38e-06 | 99 | 1.08e+09 | 0.211 | 0.101 | 0.319 |
| subjectID:problem_sampling:problem_dataset | 42 | 5.24e+08 | 4.39 | 2.96e-19 | 4158 | 1.18e+10 | 0.0425 | 0.0241 | 0.0431 |
| subjectID:problem_sampling:level | 6 | 83869772 | 3.72 | 0.00121 | 594 | 2.23e+09 | 0.0362 | 0.00884 | 0.0549 |
| subjectID:problem_dataset:level | 7 | 9.34e+08 | 27.6 | 1.4e-33 | 693 | 3.35e+09 | 0.218 | 0.168 | 0.254 |
| subjectID:problem_sampling:problem_dataset:level | 42 | 5.98e+08 | 4.38 | 3.47e-19 | 4158 | 1.35e+10 | 0.0424 | 0.024 | 0.043 |
What’s important to note here is the number of interaction effects. They tell us that the main effect of the sampling technique is modulated by the other variables. To understand them better, we can graph the results.
p_values = data %>%
filter(problem_sampling_initials != "RS") %>%
group_by(problem_sampling_initials, problem_dataset, level) %>%
summarize(p_value = t.test(relative_timespend)$p.value) %>%
ungroup() %>%
# for Holm-Bonferroni, rank the p-values and use the rank to adjust the alpha
arrange(desc(p_value)) %>%
mutate(p_rank = 1:n(), adjusted_alpha = ALPHA / p_rank)
# merge it into the data
data = data %>% left_join(p_values)
For each condition, sample with replacement and compute the mean. Reduce BOOT_COUNT to speed up.
# put into purrr's nested table format and make bootstrap copies
boots = data %>%
# get one tibble per condition and DV
group_by(problem_sampling_initials, label, color, problem_dataset, level, adjusted_alpha) %>%
nest() %>%
# duplicate each based on number of bootstrap iterations
expand_grid(boot_index = 1:BOOT_COUNT)
# bootstrapped means: randomly sample each set with replacement and compute the mean
boots = boots %>%
#randomly sample with replacement
mutate(data = map(data, ~slice_sample(., prop = 1, replace = TRUE))) %>%
# get the mean of the relative response time
mutate(data = map(data, ~mean(.$relative_timespend))) %>%
unnest(data) %>%
rename(relative_timespend = data)
Confidence interval range with multiple comparison adjustment
# simple Bonferroni (Holm-Bonferroni is more appropriate)
# adjusted_alpha = ALPHA / # initial alpha
# (length(unique(data$problem_sampling_initials)) - 1) / # unique sampling technique conditions (excluding RS)
# length(unique(data$problem_dataset)) / # unique dataset conditions
# length(unique(data$level)) # unique question conditions
# Holm-Bonferroni
CIs = boots %>%
filter(problem_sampling_initials != "RS") %>%
group_by(problem_sampling_initials, problem_dataset, level, adjusted_alpha, label) %>%
summarise(
lo = quantile(relative_timespend, ALPHA/2),
hi = quantile(relative_timespend, 1-ALPHA/2),
lo_adjusted = quantile(relative_timespend, first(adjusted_alpha)/2),
hi_adjusted = quantile(relative_timespend, 1-first(adjusted_alpha)/2)
) %>%
ungroup()
# check the smallest and largest adjustments
CIs %>% arrange(desc(adjusted_alpha)) %>% select(adjusted_alpha, label, everything(), -problem_sampling_initials) %>% head(5)
| adjusted_alpha | label | problem_dataset | level | lo | hi | lo_adjusted | hi_adjusted |
|---|---|---|---|---|---|---|---|
| 0.0500000 | DBS | epileptic_seizure | stimulus 0 | -354.1755 | 339.7992 | -354.1755 | 339.7992 |
| 0.0250000 | RSBS | abalone | stimulus 0 | -327.7552 | 337.2892 | -375.5385 | 379.8015 |
| 0.0166667 | OBDBS | swiss_roll_3d | stimulus 1 | -191.7658 | 213.1808 | -236.0186 | 266.1707 |
| 0.0125000 | MCBNS | swiss_roll_3d | stimulus 0 | -237.9269 | 350.1339 | -277.7009 | 476.0591 |
| 0.0100000 | DBS | crowdsourced_mapping | stimulus 1 | -281.3541 | 280.6519 | -347.0757 | 376.9461 |
CIs %>% arrange(desc(adjusted_alpha)) %>% select(adjusted_alpha, label, everything(), -problem_sampling_initials) %>% tail(5)
| adjusted_alpha | label | problem_dataset | level | lo | hi | lo_adjusted | hi_adjusted |
|---|---|---|---|---|---|---|---|
| 0.0005435 | BNS | clothes | stimulus 1 | 831.4457 | 2269.626 | 368.9494 | 2994.765 |
| 0.0005376 | MCBNS | clothes | stimulus 1 | 862.3777 | 2235.617 | 408.3022 | 2739.010 |
| 0.0005319 | BNS | mnist | stimulus 0 | 686.6127 | 1712.245 | 407.1720 | 2281.451 |
| 0.0005263 | RSBS | mnist | stimulus 0 | 994.4585 | 2234.014 | 659.8339 | 2896.398 |
| 0.0005208 | MCBNS | mnist | stimulus 0 | 759.9706 | 1733.681 | 443.6657 | 2159.975 |
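As a quick sanity check on those adjustments (my own arithmetic, not part of the original analysis): with 6 non-RS sampling techniques, 8 datasets, and 2 stimuli there are 96 condition-level comparisons, so the smallest Holm-adjusted alpha should equal ALPHA divided by 96, which matches the last row above.
# smallest Holm-Bonferroni adjusted alpha = ALPHA / number of comparisons
ALPHA / (6 * 8 * 2)
## [1] 0.0005208333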
Plot histograms of the bootstrapped means for each condition relative to RS. Show the unadjusted 95% (thick) and multiple-comparison-adjusted (thin) confidence intervals of the bootstrapped means.
# we can filter out RS because they're all 0
ggplot(boots %>% filter(problem_sampling_initials != "RS") ) +
aes(x = relative_timespend, y = fct_rev(label), fill=color) +
# use RS as a gridline instead of plotting it
geom_vline(xintercept = 0, size=1.5, color="#1f77b4") +
# histograms
ggdist::stat_histinterval(breaks=seq(-4550, 4550, 100), point_size=0, interval_size = 0, justification = -0.1, scale=0.95) +
# scales
geom_segment(data=CIs, aes(x=lo, xend=hi, yend = fct_rev(label), fill=NULL), size=2.5) +
geom_segment(data=CIs, aes(x=lo_adjusted, xend=hi_adjusted, yend = fct_rev(label), fill=NULL), size=1.25) +
# colors, axes, etc.
scale_fill_identity() +
scale_x_continuous(expand = c(0,0), breaks = c(-3000, 0, 3000), labels = c("faster", "<span style='color:#1f77b4'>RS</span>", "slower")) +
scale_y_discrete(expand = expansion(mult = 0, add = c(.2, 0))) +
coord_cartesian(xlim=c(-4000, 4000)) + # Prevent really wide CIs from squishing everything else
facet_grid(level ~ problem_dataset) +
theme_gray(18) + theme(
axis.text.x = element_markdown(),
axis.text.y = element_markdown(),
axis.ticks.x = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(c(0,0,0,0), "points")
) +
guides(color = "none", fill = "none") +
labs(title = "Response time relative to Random Sampling", x=NULL, y=NULL)
The dataset dimension is categorical and yields performance differences that noticeably vary by sampling technique.
It is also really odd that there is a systematic difference across question/stimulus number, suggesting that the stimuli were not randomly selected. It may be worth investigating whether this relationship impacts the results.
Overall, the inconsistent performance of the different sampling techniques prevents any generalized conclusion across datasets or stimuli.
The original article discretized the results, analyzing and drawing a conclusion about which technique is “best” rather than estimating how much better it is than the others. Discretizing doesn’t differentiate subtle from substantial differences, which can make for misleading results. However, let’s put aside that concern and try a discretized approach to the analysis.
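Before doing so, a toy illustration (my own, with made-up numbers, not from the article) of why discretizing can mislead: if one technique is reliably faster by a trivial margin, a “which is fastest” tally crowns it the winner on most trials even though the practical difference is negligible.
# toy example: technique A averages only 1 ms faster than B,
# yet a winner-take-all tally declares A "fastest" on most trials
toy = tibble(
A = rnorm(1000, mean = 1000, sd = 0.5),
B = rnorm(1000, mean = 1001, sd = 0.5)
)
mean(toy$A < toy$B) # roughly 0.9, despite a practically meaningless 1 ms difference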
For each dataset and stimulus, what proportion of subjects perform fastest with RS?
# for each dataset, stimulus, and subject, select only the fastest technique
data %>%
mutate(isCorrect2 = ifelse(isCorrect2, "Correct responses", "Incorrect responses")) %>%
group_by(problem_dataset, level, subjectID, isCorrect2) %>%
filter(problem_timespend == min(problem_timespend)) %>%
# for each dataset and stimulus, how many subjects have RS as the fastest?
group_by(problem_dataset, level, isCorrect2) %>%
summarise( RS_is_best = mean(problem_sampling_initials=="RS") ) %>%
ungroup() %>%
#complete(problem_dataset, level, isCorrect2, fill = list(RS_is_best=0)) %>%
ggplot() +
aes(x = level, y = RS_is_best) +
geom_col() +
geom_text(aes(label = scales::percent(RS_is_best, 1)), vjust=-0.5) +
facet_grid(isCorrect2 ~ problem_dataset) +
scale_y_continuous(limits = 0:1, expand = c(0,0), breaks = seq(0,1,.25), labels = scales::percent) +
theme(
#plot.margin = unit(c(0,0,0,0), "points"),
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
) +
labs(title = "Percent of subjects who are fastest with RS", x=NULL, y=NULL)
If RS were usually fastest, it’d be above 50% for most conditions. But there is no condition where the majority of subjects are fastest with RS.
Which technique is fastest under each condition?
# for each dataset, stimulus, and subject, select only the fastest technique
data %>%
mutate(isCorrect2 = ifelse(isCorrect2, "Correct responses", "Incorrect responses")) %>%
group_by(problem_dataset, level, isCorrect2, subjectID) %>%
filter(problem_timespend == min(problem_timespend)) %>%
# for each dataset and stimulus, how many subjects are fastest with each technique?
group_by(problem_dataset, level, isCorrect2, problem_sampling_initials, label, color) %>%
summarise( is_best = n() ) %>%
group_by(problem_dataset, level, isCorrect2) %>%
mutate(is_best = is_best / sum(is_best)) %>%
ungroup() %>%
ggplot() +
aes(x = label, y = is_best, fill=color) +
geom_col() +
geom_text(aes(label = scales::percent(is_best, 1)), vjust=-0.5) +
facet_grid(level * isCorrect2 ~ problem_dataset) +
scale_y_continuous(limits = 0:1, expand = c(0,0), breaks = seq(0,1,.25), labels = scales::percent) +
scale_fill_identity() +
theme(
#plot.margin = unit(c(0,0,0,0), "points"),
axis.text.x = element_markdown(),
axis.ticks.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
) +
labs(title = "Percent of subjects for whom each technique is fastest", x=NULL, y=NULL)
No sampling technique stands out as consistently best in any condition.
Even with a discretized approach, claims that “RS is best” are not compelling.
When categorical variables like the dataset aren’t parameterized, it’s unclear how or if they should be pooled. Are they evenly sampling a parameter space? Or are most of them clumped together, while one is distinct? Maybe most of them are obscure types of datasets, and only one is commonly used? The lack of parameterization or weighting to base pooling on means that simply taking an average has no basis. But, let’s put that concern aside.
What if we use accuracy as a basis for pooling by separately analyzing correct answers and incorrect answers?
I don’t understand why stimulus 0 and stimulus 1 (level in the data) are so systematically different in raw performance, so I’m avoiding pooling that variable.
Note that this approach unevenly distributes subjects and datasets across conditions, which may bias the results. Interpret with caution.
# copy the data for the alternative analysis
data2 = data %>%
select(-adjusted_alpha, -p_value, -p_rank) %>% # drop previous analysis
mutate(isCorrect2 = ifelse(isCorrect2, "Correct", "Incorrect")) # simpler labels
# p-values for adjusted alpha
p_values = data2 %>%
filter(problem_sampling_initials != "RS") %>%
group_by(problem_sampling_initials, level, isCorrect2) %>%
summarize(p_value = t.test(relative_timespend)$p.value) %>%
ungroup() %>%
# for Holm-Bonferroni, rank the p-values and use the rank to adjust the alpha
arrange(desc(p_value)) %>%
mutate(p_rank = 1:n(), adjusted_alpha = ALPHA / p_rank)
# merge it into the data
data2 = data2 %>% left_join(p_values)
# put into purrr's nested table format and make bootstrap copies
boots2 = data2 %>%
# get one tibble per condition and DV
group_by(problem_sampling_initials, label, color, level, isCorrect2, adjusted_alpha) %>%
nest() %>%
# duplicate each based on number of bootstrap iterations
expand_grid(boot_index = 1:(BOOT_COUNT))
# bootstrapped means: randomly sample each set with replacement and compute the mean
boots2 = boots2 %>%
#randomly sample with replacement
mutate(data = map(data, ~slice_sample(., prop = 1, replace = TRUE))) %>%
# get the mean of the relative response time
mutate(data = map(data, ~mean(.$relative_timespend))) %>%
unnest(data) %>%
rename(relative_timespend = data)
# Holm-Bonferroni
CIs2 = boots2 %>%
filter(problem_sampling_initials != "RS") %>%
group_by(problem_sampling_initials, level, isCorrect2, adjusted_alpha, label, color) %>%
summarise(
lo = quantile(relative_timespend, ALPHA/2),
hi = quantile(relative_timespend, 1-ALPHA/2),
lo_adjusted = quantile(relative_timespend, first(adjusted_alpha)/2),
hi_adjusted = quantile(relative_timespend, 1-first(adjusted_alpha)/2)
) %>%
ungroup()
ggplot(boots2 %>% filter(problem_sampling_initials != "RS")) +
aes(x = relative_timespend, y = fct_rev(label), fill=color) +
# use RS as a gridline instead of plotting it
geom_vline(xintercept = 0, size=1.5, color="#1f77b4") +
# histograms
#ggdist::stat_histinterval(breaks=seq(-1550, 1550, 10), point_size=0, interval_size = 0, justification = -0.1, scale=0.95) +
# scales
geom_segment(data=CIs2, aes(x=lo, xend=hi, yend = fct_rev(label), fill=NULL, color=color), size=2.5) +
geom_segment(data=CIs2, aes(x=lo_adjusted, xend=hi_adjusted, yend = fct_rev(label), fill=NULL, color=color), size=1.25) +
# colors, axes, etc.
scale_color_identity() +
scale_fill_identity() +
scale_x_continuous(expand = c(0,0), breaks = c(-1500, 0, 1500), labels = c("faster", "<span style='color:#1f77b4'>RS</span>", "slower")) +
scale_y_discrete(expand = expansion(mult = 0, add = c(.2, .2))) +
coord_cartesian(xlim=c(-2000, 2000)) + # Prevent really wide CIs from squishing everything else
facet_grid(isCorrect2 ~ level) +
theme_gray(18) + theme(
axis.text.x = element_markdown(),
axis.text.y = element_markdown(),
axis.ticks.x = element_blank(),
panel.grid = element_blank(),
plot.margin = unit(c(0,0,0,0), "points")
) +
guides(color = "none", fill = "none") +
labs(title = "Response time relative to Random Sampling", x=NULL, y=NULL)
This zoomed-in graph shows that BNS and DBS perform on par with RS and that the statistical difference does not meet the original article’s alpha threshold. When subjects are incorrect, some techniques perform MUCH more slowly than RS, which may have been driving the original article’s insinuation that RS performs best.
RS, BNS, and DBS all seem to perform similarly. Again, claims that “RS is best” are not compelling.
Why not just average across categories? Because seemingly distinct categories often exist in a parameter space. What those parameters are and where the categories fall in that space can wildly swing the results. Are most clustered together with a couple outliers? Or are they evenly distributed for each parameter? If we don’t understand the parameters or distributions, we don’t understand how to pool together the categories as a random effect.
# toy demonstration with made-up topping preferences (note: this reuses the name `data`)
data = tibble(
topping = c("Cheese", "Pickles", "Mustard", "Ground beef", "Oreos"),
relative_preference = c(-3, -1.5, -4, -3, 1.5)
) %>%
expand_grid(subject_ID = 1:100)
data = data %>%
mutate(topping = fct_inorder(factor(topping)))
data = data %>%
rowwise() %>%
mutate(relative_preference = relative_preference + rnorm(1, 0, 0.4)) %>%
ungroup()
data = data %>%
group_by(topping) %>%
summarise(
lo = quantile(relative_preference, 0.025),
med = quantile(relative_preference, 0.5),
hi = quantile(relative_preference, 0.975)
)
ggplot(data) +
aes(y=topping, xmin=lo, x=med, xmax=hi) +
geom_vline(xintercept = 0, color="gray", size=1, linetype = "22") +
geom_pointrange(size=1.5) +
scale_x_continuous(breaks = c(-4.5, 0, 4.5), labels = c("< worse than plain", "equal to plain", "better than plain >"), limits=c(-5.5, 5.5), minor_breaks = F, position = "top") +
theme_minimal(20) +
theme(
panel.grid.major.y = element_line(linetype = "dotted"),
panel.grid.major.x = element_blank(),
plot.title = element_text(hjust = 0.5, margin = margin(b=2, unit="lines"))) +
labs(
y = NULL, x = NULL,
title = "Preference of toppings over plain ice cream") +
tibble(y = "(average)", x = mean(data$med)) %>%
ggplot() +
aes(y=y, x=x) +
geom_vline(xintercept = 0, color="gray", size=1, linetype = "22") +
geom_point(size=7.5, shape=18) +
scale_x_continuous(breaks = c(-4.5, 0, 4.5), labels = c("", "", ""), limits=c(-5.5, 5.5), minor_breaks = F) +
theme_minimal(20) +
theme(panel.grid.major.y = element_line(linetype = "dotted"), panel.grid.major.x = element_blank()) +
labs(y = NULL, x = NULL) +
patchwork::plot_layout(ncol=1, heights = c(10, 1)) +
patchwork::plot_annotation(
caption = "Can we conclude that ice cream with toppings is worse than plain ice cream? No.\nOreos are a preferred topping, and ground beef is a detested topping.\nBoth are true, and they don't cancel each other out.\nAggregating across categories is not always informative.",
theme = theme(
plot.margin = margin(10, 10, 15, 10),
plot.caption = element_text(size=16, hjust = 0, lineheight = 1.1, margin = margin(1, unit = "lines"))))
Thanks to Matthew Kay, Kimberly Quinn, Brenton Wiernik, Pierre Dragicevic, Lonni Besançon, Lucija Batinovic, and Alex Holcombe for helpful comments.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices datasets utils methods base
##
## other attached packages:
## [1] apaTables_2.0.8 MBESS_4.8.1 broom_0.7.10 ggdist_3.0.1
## [5] ragg_1.2.1 glue_1.5.1 ggtext_0.1.1 patchwork_1.1.1
## [9] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
## [13] readr_2.1.0 tidyr_1.1.4 tibble_3.1.6 ggplot2_3.3.5
## [17] tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 jsonlite_1.7.2 modelr_0.1.8
## [4] assertthat_0.2.1 distributional_0.2.2 highr_0.9
## [7] renv_0.14.0 cellranger_1.1.0 yaml_2.2.1
## [10] pillar_1.6.4 backports_1.3.0 digest_0.6.28
## [13] gridtext_0.1.4 rvest_1.0.2 colorspace_2.0-2
## [16] htmltools_0.5.2 pkgconfig_2.0.3 haven_2.4.3
## [19] scales_1.1.1 tzdb_0.2.0 generics_0.1.1
## [22] farver_2.1.0 ellipsis_0.3.2 withr_2.4.2
## [25] cli_3.1.0 magrittr_2.0.1 crayon_1.4.2
## [28] readxl_1.3.1 evaluate_0.14 fs_1.5.0
## [31] fansi_0.5.0 xml2_1.3.2 textshaping_0.3.6
## [34] tools_4.1.2 hms_1.1.1 lifecycle_1.0.1
## [37] munsell_0.5.0 reprex_2.0.1 compiler_4.1.2
## [40] jquerylib_0.1.4 systemfonts_1.0.3 rlang_0.4.12
## [43] grid_4.1.2 rstudioapi_0.13 labeling_0.4.2
## [46] rmarkdown_2.11 gtable_0.3.0 DBI_1.1.1
## [49] markdown_1.1 R6_2.5.1 lubridate_1.8.0
## [52] knitr_1.36 fastmap_1.1.0 utf8_1.2.2
## [55] stringi_1.7.6 Rcpp_1.0.7 vctrs_0.3.8
## [58] dbplyr_2.1.1 tidyselect_1.1.1 xfun_0.27