Original article:
Evaluation of Sampling Methods for Scatterplots
Jun Yuan, Shouxing Xiang, Jiazhi Xia, Lingyun Yu, and Shixia Liu
https://arxiv.org/pdf/2007.14666.pdf

Repository with this analysis, the writeup, and the original data:
https://osf.io/hsujr/

1 Setup

# The first time you knit this markdown, you'll need to call install.packages("PACKAGE") for each library call
library(tidyverse) # lots of basic functionality
library(patchwork) # combine plots
library(ggtext) # for colored text
library(glue) # for colored text
library(ragg) # nicer figure output
library(ggdist) # stat_histinterval used in final figure

library(broom) # broom::tidy() puts the anova in a manageable dataframe
library(MBESS) # dependency for apaTables
library(apaTables) # get.ci.partial.eta.squared() gives confidence intervals for the effect sizes

# alpha value of the original article
ALPHA = 0.05

# How many bootstrap iterations to do
BOOT_COUNT = 10 * 1000

# simplify everything when debugging
DEBUG = FALSE

# trial data folder
DATA_FOLDER_TRIALS = "libsampling/trial data/publish"
DATA_FOLDER_ANSWERS = "libsampling/trial data/answer"

# folder for hi res figures and other things
OUTPUT_FOLDER = "paper/"

# for reproducibility
set.seed(1234)

1.1 Load and parse the data

Load the response data from experiment 1.

filenames = list.files(DATA_FOLDER_TRIALS, pattern="*.txt", full.names = T)

# all the experiments are stored together. Starting with experiment 1
EXPERIMENT_1_RANGE = 4:115

# function to read in all the trials
read_trial_data = function(filename) {
  text = read_lines(filename)
  
  # read the json line by line
  json = lapply(text[EXPERIMENT_1_RANGE], function(line) {
    # print(line) 
    trial = jsonlite::fromJSON(line)
    # flatten the answers info
    answers = trial$problem_answer
    for (a in names(answers)) {
      trial[a] = answers[a]
    }
    trial$problem_answer = NULL
    
    # flatten multi-facetted answers
    for (name in names(trial)) {
      # drop empties
      if (length(trial[name][[1]]) == 0) 
        trial[name] = NA
      # flatten vectors
      if (length(trial[name][[1]]) > 1) {
        trial[name] = paste(trial[[name]], collapse = ",")
      }
    }
    # add subject name
    trial$subjectID = basename(filename) %>% str_split(fixed("."), simplify = TRUE) %>% `[`(1)
    # return
    trial
  })
  
  data = bind_rows(json)
  return(data)
}

# apply the function to each filename and merge them together
data = lapply(filenames, read_trial_data) %>% 
  bind_rows()

# cleanup
rm(EXPERIMENT_1_RANGE, filenames, read_trial_data)

Load the correct answers. This step is slow because of all the string manipulation.

filenames = list.files(DATA_FOLDER_ANSWERS, pattern="*.txt", full.names = T)

# function to read the correct answers for each subject
read_correct_answers = function(filename) {
  lines = read_lines(filename)
  lines = lapply(lines, function(line) {
    line %>% 
    str_replace(fixed("random sampling"), fixed("random_sampling")) %>% 
    str_replace(fixed("blue noise sampling"), fixed("blue_noise_sampling")) %>% 
    str_replace(fixed("density biased sampling"), fixed("density_biased_sampling")) %>% 
    str_replace(fixed("multi-view Z order sampling"), fixed("multi-view_Z_order_sampling")) %>% 
    str_replace(fixed("multi-class blue_noise_sampling"), fixed("multi-class_blue_noise_sampling")) %>% 
    str_replace(fixed("outlier biased density based sampling"),fixed("outlier_biased_density_based_sampling"))%>%
    str_replace(fixed("recursive subdivision based sampling"), fixed("recursive_subdivision_based_sampling")) %>%
    str_split(fixed(" "), simplify = T)
  })
  
  # drop the experiment description line
  lines = lines[-1]
  
  # drop experiment 2 onward
  start_of_experiment_2 = which(sapply(lines, length) != 3)[1]
  lines = lines[1:(start_of_experiment_2 - 1)]
  # make into tibble
  answers = lapply(
    lines, 
    function(line) { 
      colnames(line) = c("problem_dataset", "problem_sampling", "actual_sort_index"); 
      as_tibble(line) 
    }) %>% 
    bind_rows()
  answers = answers %>% separate(problem_dataset, c("problem_dataset", "level"), sep = "-")
  # drop synthetic condition
  answers = answers %>% filter(!str_detect(problem_dataset, fixed("synthetic")))
  # add subject name
  answers$subjectID = basename(filename) %>% str_split(fixed("."), simplify = TRUE) %>% `[`(1)
  return(answers)
}

# apply the loader to each filename and merge
correct_answers = lapply(filenames, read_correct_answers) %>% bind_rows()

# cleanup
rm(filenames, read_correct_answers)

Join the responses to the correct answers and compare.

# filter out answers for missing subjects (why missing???) 
correct_answers = correct_answers %>% filter(subjectID %in% unique(data$subjectID))

# merge correct answers and data
correct_answers = correct_answers %>% mutate(problem_sampling = problem_sampling %>% 
    str_replace(fixed("random_sampling"), fixed("random sampling")) %>% 
    str_replace(fixed("blue_noise_sampling"), fixed("blue noise sampling")) %>% 
    str_replace(fixed("density_biased_sampling"), fixed("density biased sampling")) %>% 
    str_replace(fixed("multi-view_Z_order_sampling"), fixed("multi-view Z order sampling")) %>% 
    str_replace(fixed("multi-class_blue noise sampling"), fixed("multi-class blue noise sampling")) %>% 
    str_replace(fixed("outlier_biased_density_based_sampling"), fixed("outlier biased density based sampling")) %>% 
    str_replace(fixed("recursive_subdivision_based_sampling"), fixed("recursive subdivision based sampling"))
)
correct_answers = correct_answers %>% mutate(level = as.numeric(level))
data = left_join(data, correct_answers)

# check if answer is correct
data = data %>% mutate(
    isCorrect = sort_index == actual_sort_index, 
    isCorrect2 = sort_index != actual_sort_index) # don't know why, but the reverse seems to match the paper's results

# output processed data
# you can load this file to start from here: data = read_csv("Experiment 1 processed.csv")
data %>% write_csv(paste0(OUTPUT_FOLDER, "Experiment 1 processed.csv"))

# cleanup
rm(correct_answers)

Make some variables more presentable.

data = data %>% mutate(level = paste("stimulus", level))
data = data %>% mutate(subjectID = paste0("S", subjectID))

1.2 Match labels of original

Make the labels, colors, and order from the original article. It’s easiest to take care of this at the beginning, so we don’t have to redo it for each graph.

# make initials and ordering match the original article
data = data %>%
  # get initials
  rowwise() %>% 
  mutate(problem_sampling_initials = paste(str_sub(str_split(problem_sampling, " ", simplify = T), 1, 1), collapse="")) %>% 
  mutate(problem_sampling_initials = str_to_upper(problem_sampling_initials)) %>% 
  ungroup() %>% 
  # tweak to match the original
  mutate(
      problem_sampling_initials = ifelse(problem_sampling_initials == "MZOS", "MVZS", problem_sampling_initials),
      problem_sampling_initials = ifelse(problem_sampling_initials == "MBNS", "MCBNS", problem_sampling_initials),
  )

# match colors of original graphs
label_colors = tibble(
  problem_sampling_initials = c("RS","BNS","DBS","MVZS","MCBNS","OBDBS","RSBS"), 
  color = c("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2")
)

# match order of original graphs
label_order = c("RS","BNS","DBS","MVZS","MCBNS","OBDBS","RSBS")
label_colors = label_colors %>% 
  mutate(problem_sampling_initials = factor(problem_sampling_initials, levels = label_order))
data = data %>% 
  mutate(problem_sampling_initials = factor(problem_sampling_initials, levels = label_order))

# pretty labels and colors for graphs
data = data %>% 
  left_join(label_colors) %>% 
  mutate(label = glue("<span style='color:{color}'>{problem_sampling_initials}</span>") ) %>% 
  # match label's factor order to the original's order
  arrange(problem_sampling_initials) %>% 
  mutate(label = fct_inorder(factor(label)))

2 Recreate the original graphs

Recreate figure 7 from “Evaluation of Sampling Methods for Scatterplots”. Importantly, this original approach ignores that the experiment was run within-subjects and with multiple trials per condition. It visualizes the results without regard to which subject ran which trial. Therefore, these results should be interpreted very cautiously.

Some helper functions

# R doesn't have a built-in standard error function
stderr = function(x) sqrt(var(x,na.rm=TRUE)/length(na.omit(x)))
ci = function(x, confidence.interval=1-ALPHA) stderr(x) * qnorm(1 - (1-confidence.interval)/2)
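As a quick sanity check of these helpers (illustrative only, not part of the original analysis; x_check is a simulated vector introduced here), the normal-approximation half-width from ci() should roughly match a t-based half-width for a sample of 100:

# illustrative check: compare the normal-approximation half-width from ci()
# to a t-based half-width for a simulated sample of 100 values
x_check = rnorm(100, mean = 5, sd = 2)
c(normal_approx = ci(x_check),
  t_based = qt(1 - ALPHA/2, df = length(x_check) - 1) * stderr(x_check))
rm(x_check)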

Aggregate the trial data by condition into means and 95% confidence intervals, ignoring subjects and other variables.

data_ignore_subjects = data %>% 
  group_by(problem_sampling_initials, color) %>% 
  summarise(
    accuracy2_lo = mean(isCorrect2) - ci(isCorrect2),
    accuracy2_hi = mean(isCorrect2) + ci(isCorrect2),
    accuracy2    = mean(isCorrect2),
    problem_timespend_lo = mean(problem_timespend) - ci(problem_timespend),
    problem_timespend_hi = mean(problem_timespend) + ci(problem_timespend),
    problem_timespend    = mean(problem_timespend),
  )

Recreate original’s figure 7

# Accuracy
ggplot(data_ignore_subjects) +
  aes(x = problem_sampling_initials, y = accuracy2, color=color) +
  geom_pointrange(size=4, aes(ymin=accuracy2_lo, ymax=accuracy2_hi)) +
  scale_color_identity() +
  theme_classic(42) + theme(plot.margin = margin(0,0,20,0)) +
  guides(color = "none") +
  labs(title = "Accuracy", x = NULL, y = NULL) +
  # point out a pair of similar CIs
  annotate("curve", xend="RS", yend=.979, x=1.5, y=.96, curvature=-0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
  annotate("curve", xend="BNS", yend=.977, x=1.5, y=.96, curvature=0.20, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
  annotate("text", x=1.5, y=.957, color="red", size=12, hjust=0.5, label="different?") +

# Speed
ggplot(data_ignore_subjects) +
  aes(x = problem_sampling_initials, y = problem_timespend, color=color) +
  geom_pointrange(size=4, aes(ymin=problem_timespend_lo, ymax=problem_timespend_hi)) +
  scale_color_identity() +
  theme_classic(42) + theme(plot.margin = margin(0,0,20,0)) +
  guides(color = "none") +
  labs(title = "Response time (ms)", x = NULL, y = NULL) +
  # point out a pair of similar CIs
  annotate("curve", xend="RS", yend=3050, x=1.9, y=3350, curvature=0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
  #annotate("curve", xend="BNS", yend=3220, x=1.9, y=3350, curvature=0, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round", linejoin = "mitre") +
  annotate("curve", xend="DBS", yend=3150, x=1.9, y=3350, curvature=-0.25, color="red", size=2, linetype="22", arrow = arrow(length = unit(0.03, "npc")), lineend = "round") +
  annotate("text", x=2, y=3385, color="red", size=12, hjust=0.5, label="different?") +

# Caption
plot_annotation( 
  caption = "Interpret with caution: This visualization aggregates the data in a questionable way.", 
  theme = theme(plot.caption = element_text(size=42, color = "#C03333")) )

3 Reanalyzing the data

3.1 How is inaccuracy distributed?

# Get mean accuracy per subject
accuracy = data %>%
  group_by(subjectID) %>%
  summarize(accuracy = mean(isCorrect2))
glue::glue("Average accuracy: {mean(accuracy$accuracy) * 100}%")
## Average accuracy: 96.625%
# Check if accuracy varied substantially per condition
data %>% 
  group_by(problem_sampling_initials, problem_dataset, level) %>% 
  summarize(accuracy = mean(isCorrect2), .groups = 'drop') %>% 
ggplot() +
  aes(y = problem_sampling_initials, x = accuracy) +
  geom_col() +
  geom_vline(xintercept = 0.9, size=1) + # 90% accuracy
  facet_grid(level ~ problem_dataset) +
  scale_x_continuous(labels = scales::percent, breaks = c(0,.5,.9), limits = 0:1, expand = c(0,0)) +
  labs(y=NULL, title="Does accuracy vary much by condition?") +
  theme_light(18)

# where do errors occur
data %>% mutate(dataset_is_clothes = problem_dataset == "clothes") %>% 
  group_by(dataset_is_clothes, level) %>% 
  summarize(errors = sum(1-isCorrect2), .groups = 'drop') %>% 
  mutate(errors_proportion = errors/sum(errors)) 
  dataset_is_clothes   level        errors   errors_proportion
  FALSE                stimulus 0      100           0.2645503
  FALSE                stimulus 1        0           0.0000000
  TRUE                 stimulus 0      166           0.4391534
  TRUE                 stimulus 1      112           0.2962963

Accuracy is pretty high overall, suggesting that it is likely hitting a “ceiling effect”. As a result, the manipulations impact response time much more than accuracy. However, accuracy varies considerably by dataset and by stimulus number, suggesting we shouldn’t collapse across them.

3.2 Relative to the RS condition

We will compare subject performance with each sampling technique to that subject’s Random Sampling (RS) performance. First, find each subject’s average performance in the RS condition for each measure (duration and accuracy) within each dataset and stimulus. Then subtract that RS baseline from each trial’s measurements.

The baseline has one row per subject × dataset × stimulus. After joining, all the RS trials end up at 0 because RS - RS == 0.

# use "random sampling" as the baseline
baseline = data %>% 
  filter(problem_sampling_initials == "RS") %>% 
  group_by(subjectID, problem_dataset, level) %>% 
  summarise(accuracy2_baseline = mean(isCorrect2), problem_timespend_baseline = mean(problem_timespend)) %>% 
  ungroup()

# subtract baseline from others
data = data %>% 
  left_join(baseline) %>% 
  mutate(relative_accuracy2 = isCorrect2 - accuracy2_baseline) %>% 
  mutate(relative_timespend = problem_timespend - problem_timespend_baseline)

3.3 Normality

Normality assumptions apply to the residuals, not the raw data. So we can take the data from each condition (sampling × dataset × level) and standardize it independently. Because we have already subtracted each subject’s RS response times, the subject variable is already incorporated.

# for each condition, scale it to mean=0, sd=1
scaled_timespend = data %>% 
  filter(problem_sampling_initials != "RS") %>% 
  group_by(problem_sampling_initials, problem_dataset, level) %>% 
  mutate(relative_timespend = scale(relative_timespend)[,1]) %>% 
  ungroup()

ggplot(scaled_timespend) +
  aes(x = relative_timespend) +
  geom_density(size=1) +
  scale_x_continuous(limits = c(-2,2)) + # with 11k values, the tails are long
  labs(title="Residual distribution") +
  theme_light(18)

The distribution appears leptokurtic, but it’s fairly symmetrical, having only a small amount of skew. Jobson (1991) demonstrated that skew is much more of a concern than kurtosis:

In comparison to the skewness measure, the kurtosis measure therefore seems to have little influence on the first two moments of the t-ratio. Thus non-normality resulting in skewness seems to be more important than non-normality resulting in kurtosis.
- Jobson, J.D. (1991). Applied multivariate data analysis: Vol. 1. Regression and experimental design. (p. 55)

As these residuals have minimal skew, aren’t bimodal, and don’t resemble other problematic distributions, they are unlikely to impact false positive rates.
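To put rough numbers on that impression (an illustrative check, not in the original analysis), we can compute moment-based estimates of skewness and excess kurtosis from the scaled residuals:

# rough moment-based estimates; relative_timespend was scaled to mean 0, sd 1
# within each condition, so the third moment approximates the skewness and
# the fourth moment minus 3 approximates the excess kurtosis
scaled_timespend %>% 
  summarise(
    skewness        = mean(relative_timespend^3, na.rm = TRUE),
    excess_kurtosis = mean(relative_timespend^4, na.rm = TRUE) - 3
  )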

A more quantitative test could still be helpful, though. Unfortunately, tests like Shapiro-Wilk will almost always reject normality with over 11k trials. So let’s simulate the false positive rate by resampling from this distribution. If we run a t-test on a sample drawn from this distribution (a null effect), we expect a p-value below 0.05 to occur 5% of the time. Deviation from 5% would suggest that standard ANOVAs are not well suited and non-parametric analyses are needed.

Let’s run many simulations, and see if the result is substantially different from 0.05:

replicate(
  BOOT_COUNT, # simulation count
  t.test(sample(scaled_timespend$relative_timespend, replace = TRUE))$p.value < 0.05
) %>% mean()
## [1] 0.0485

3.4 ANOVA

Since the result is close to 0.05, we don’t need to worry about a substantially inflated false-positive rate. So let’s run an ANOVA. Note that we can use the raw response times rather than the relative response times because the ANOVA computes relative differences as part of the process.

Slow due to many conditions (112) and subjects (100); takes ~10 minutes.

# compute the ANOVA
# takes 10+ minutes on my computer :(
anova_table = aov(
  problem_timespend ~ problem_sampling*problem_dataset*level + Error(subjectID/(problem_sampling*problem_dataset*level)),
  data
)

# tidy it up
anova_table = anova_table %>% broom::tidy()

# split up error component from variables
residuals = anova_table %>%
  filter(term == "Residuals") %>%
  select(-term, -statistic, -p.value, -meansq) %>%
  rename(df2 = df, error_ss = sumsq)
anova_table = anova_table %>%
  filter(term != "Residuals") %>%
  select(-term, -meansq) %>%
  rename(df1 = df)

# merge them back
anova_table = anova_table %>% left_join(residuals)

# compute partial eta^2
anova_table = anova_table %>% mutate(pes = sumsq / (sumsq + error_ss))
# add confidence intervals to the effect sizes
anova_table %>%
  rowwise() %>%
  mutate(pes_lo = get.ci.partial.eta.squared(statistic, df1, df2)$LL) %>%
  mutate(pes_hi = get.ci.partial.eta.squared(statistic, df1, df2)$UL) %>%
  mutate(across(c(statistic, p.value, pes, pes_lo, pes_hi, sumsq, error_ss), ~format(.x, digits=3)))
  stratum                                            df1      sumsq  statistic   p.value   df2  error_ss     pes   pes_lo  pes_hi
  subjectID:problem_sampling                           6   2.13e+08       8.52  6.75e-09   594  2.48e+09  0.0793   0.0407   0.107
  subjectID:problem_dataset                            7   4.18e+09       84.2   2.6e-88   693  4.92e+09   0.459    0.412   0.493
  subjectID:level                                      1   2.87e+08       26.4  1.38e-06    99  1.08e+09   0.211    0.101   0.319
  subjectID:problem_sampling:problem_dataset          42   5.24e+08       4.39  2.96e-19  4158  1.18e+10  0.0425   0.0241  0.0431
  subjectID:problem_sampling:level                     6   83869772       3.72   0.00121   594  2.23e+09  0.0362  0.00884  0.0549
  subjectID:problem_dataset:level                      7   9.34e+08       27.6   1.4e-33   693  3.35e+09   0.218    0.168   0.254
  subjectID:problem_sampling:problem_dataset:level    42   5.98e+08       4.38  3.47e-19  4158  1.35e+10  0.0424    0.024   0.043

What’s important to note here is the number of significant interaction effects. They tell us that the main effect of the sampling technique is modulated by these other variables. To understand them better, we can graph the results.
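Before the graphs, here is a quick descriptive look at one of those interactions (illustrative only, not part of the original analysis): mean response times by sampling technique and dataset, averaged over stimuli and subjects.

# illustrative only, not a formal post-hoc test: mean response time (ms) by
# sampling technique and dataset, averaging over stimuli and subjects
data %>% 
  group_by(problem_sampling_initials, problem_dataset) %>% 
  summarise(mean_timespend = mean(problem_timespend), .groups = "drop") %>% 
  pivot_wider(names_from = problem_dataset, values_from = mean_timespend)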

3.5 Per-condition p-values and Holm-Bonferroni

p_values = data %>% 
  filter(problem_sampling_initials != "RS") %>% 
  group_by(problem_sampling_initials, problem_dataset, level) %>% 
  summarize(p_value = t.test(relative_timespend)$p.value) %>% 
  ungroup() %>% 
  # for Holm-Bonferroni, rank the p-values and use the rank to adjust the alpha
  arrange(desc(p_value)) %>% 
  mutate(p_rank = 1:n(), adjusted_alpha = ALPHA / p_rank)

# merge it into the data
data = data %>% left_join(p_values)
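As a cross-check (not used downstream), base R’s p.adjust() implements Holm’s method by adjusting the p-values instead of the alphas. The two flags below need not agree exactly, because p.adjust() enforces Holm’s step-down stopping rule while the per-rank alphas above do not.

# cross-check against stats::p.adjust(method = "holm"); p.adjust() enforces
# the step-down rule, so the two flags can disagree for conditions past the
# first non-significant p-value
p_values %>% 
  mutate(
    p_holm              = p.adjust(p_value, method = "holm"),
    significant_by_rank = p_value < adjusted_alpha,
    significant_by_holm = p_holm < ALPHA
  ) %>% 
  count(significant_by_rank, significant_by_holm)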

3.6 Bootstrapped means

For each condition, sample with replacement and compute the mean.

Reduce BOOT_COUNT to speed up.

# put into purrr's nested table format and make bootstrap copies
boots = data %>% 
  # get one tibble per condition and DV
  group_by(problem_sampling_initials, label, color, problem_dataset, level, adjusted_alpha) %>% 
  nest() %>% 
  # duplicate each based on number of bootstrap iterations
  expand_grid(boot_index = 1:BOOT_COUNT)

# bootstrapped means: randomly sample each set with replacement and compute the mean
boots = boots %>% 
  #randomly sample with replacement
  mutate(data = map(data, ~slice_sample(., prop = 1, replace = TRUE))) %>% 
  # get the mean of the relative response time
  mutate(data = map(data, ~mean(.$relative_timespend))) %>% 
  unnest(data) %>% 
  rename(relative_timespend = data)

3.7 Plots

Confidence interval range with multiple comparison adjustment

# simple Bonferroni (Holm-Bonferroni is more appropriate)
# adjusted_alpha = ALPHA / # initial alpha
#   (length(unique(data$problem_sampling_initials)) - 1) / # unique sampling technique conditions (excluding RS)
#   length(unique(data$problem_dataset)) / # unique dataset conditions
#   length(unique(data$level)) # unique question conditions

# Holm-Bonferroni
CIs = boots %>% 
  filter(problem_sampling_initials != "RS") %>% 
  group_by(problem_sampling_initials, problem_dataset, level, adjusted_alpha, label) %>% 
  summarise(
    lo = quantile(relative_timespend, ALPHA/2),
    hi = quantile(relative_timespend, 1-ALPHA/2),
    lo_adjusted = quantile(relative_timespend, first(adjusted_alpha)/2),
    hi_adjusted = quantile(relative_timespend, 1-first(adjusted_alpha)/2)
  ) %>% 
  ungroup()

# check the smallest and largest adjustments
CIs %>% arrange(desc(adjusted_alpha)) %>% select(adjusted_alpha, label, everything(), -problem_sampling_initials) %>% head(5)
  adjusted_alpha  label  problem_dataset       level            lo        hi  lo_adjusted  hi_adjusted
  0.0500000       DBS    epileptic_seizure     stimulus 0  -354.1755  339.7992    -354.1755     339.7992
  0.0250000       RSBS   abalone               stimulus 0  -327.7552  337.2892    -375.5385     379.8015
  0.0166667       OBDBS  swiss_roll_3d         stimulus 1  -191.7658  213.1808    -236.0186     266.1707
  0.0125000       MCBNS  swiss_roll_3d         stimulus 0  -237.9269  350.1339    -277.7009     476.0591
  0.0100000       DBS    crowdsourced_mapping  stimulus 1  -281.3541  280.6519    -347.0757     376.9461
CIs %>% arrange(desc(adjusted_alpha)) %>% select(adjusted_alpha, label, everything(), -problem_sampling_initials) %>% tail(5)
  adjusted_alpha  label  problem_dataset  level             lo        hi  lo_adjusted  hi_adjusted
  0.0005435       BNS    clothes          stimulus 1  831.4457  2269.626     368.9494     2994.765
  0.0005376       MCBNS  clothes          stimulus 1  862.3777  2235.617     408.3022     2739.010
  0.0005319       BNS    mnist            stimulus 0  686.6127  1712.245     407.1720     2281.451
  0.0005263       RSBS   mnist            stimulus 0  994.4585  2234.014     659.8339     2896.398
  0.0005208       MCBNS  mnist            stimulus 0  759.9706  1733.681     443.6657     2159.975

Plot histograms of the bootstrapped means for each condition relative to RS. Show adjusted (thin) and unadjusted (thick) 95% confidence intervals of the bootstrapped means.

# we can filter out RS because they're all 0
ggplot(boots %>% filter(problem_sampling_initials != "RS") ) +
  aes(x = relative_timespend, y = fct_rev(label), fill=color) +
  # use RS as a gridline instead of plotting it
  geom_vline(xintercept = 0, size=1.5, color="#1f77b4") +
  # histograms
  ggdist::stat_histinterval(breaks=seq(-4550, 4550, 100), point_size=0, interval_size = 0, justification = -0.1, scale=0.95) +
  # scales
  geom_segment(data=CIs, aes(x=lo, xend=hi, yend = fct_rev(label), fill=NULL), size=2.5) +
  geom_segment(data=CIs, aes(x=lo_adjusted, xend=hi_adjusted, yend = fct_rev(label), fill=NULL), size=1.25) +
  # colors, axes, etc.
  scale_fill_identity() +
  scale_x_continuous(expand = c(0,0), breaks = c(-3000, 0, 3000), labels = c("faster", "<span style='color:#1f77b4'>RS</span>", "slower")) + 
  scale_y_discrete(expand = expansion(mult = 0, add = c(.2, 0))) + 
  coord_cartesian(xlim=c(-4000, 4000)) + # Prevent really wide CIs from squishing everything else
  facet_grid(level ~ problem_dataset) +
  theme_gray(18) + theme(
    axis.text.x = element_markdown(),
    axis.text.y = element_markdown(),
    axis.ticks.x = element_blank(), 
    panel.grid = element_blank(), 
    plot.margin = unit(c(0,0,0,0), "points")
  ) +
  guides(color = "none", fill = "none") +
  labs(title = "Response time relative to Random Sampling", x=NULL, y=NULL)

The dataset dimension is categorical and yields performance differences that noticeably vary by sampling technique.

It is also quite odd that there is a systematic difference across question/stimulus number, suggesting that the stimuli were not randomly selected. It may be worth investigating whether this relationship impacts the results.
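For reference, here is the raw per-stimulus difference (an illustrative check, not part of the original analysis):

# illustrative: raw mean response time and accuracy by stimulus number,
# showing the systematic stimulus 0 vs stimulus 1 difference noted above
data %>% 
  group_by(level) %>% 
  summarise(
    mean_timespend_ms = mean(problem_timespend),
    mean_accuracy     = mean(isCorrect2)
  )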

4 Conclusion

Overall, the inconsistent performance of the different sampling techniques prevents any generalized conclusion across datasets or stimuli.

5 Appendices

5.1 Alternative analysis 1

The original article discretized the results by analyzing and drawing a conclusion about which technique is “best” rather than how much better it is than the others. Discretizing doesn’t differentiate subtle from substantial differences, which can make for misleading results. However, let’s put aside that concern and try a discretized approach to the analysis.

For each dataset and stimulus, what proportion of subjects perform fastest with RS?

# for each dataset, stimulus, and subject, select only the fastest technique
data %>% 
  mutate(isCorrect2 = ifelse(isCorrect2, "Correct responses", "Incorrect responses")) %>% 
  group_by(problem_dataset, level, subjectID, isCorrect2) %>% 
  filter(problem_timespend == min(problem_timespend)) %>% 
  # for each dataset and stimulus, how many subjects have RS as the fastest?
  group_by(problem_dataset, level, isCorrect2) %>% 
  summarise( RS_is_best  = mean(problem_sampling_initials=="RS") ) %>% 
  ungroup() %>% 
  #complete(problem_dataset, level, isCorrect2, fill = list(RS_is_best=0)) %>% 
  ggplot() +
    aes(x = level, y = RS_is_best) +
    geom_col() +
    geom_text(aes(label = scales::percent(RS_is_best, 1)), vjust=-0.5) +
    facet_grid(isCorrect2 ~ problem_dataset) +
    scale_y_continuous(limits = 0:1, expand = c(0,0), breaks = seq(0,1,.25), labels = scales::percent) +
    theme(
      #plot.margin = unit(c(0,0,0,0), "points"),
      axis.ticks.x = element_blank(), 
      panel.grid.major.x = element_blank(),
      panel.grid.minor = element_blank()
    ) +
    labs(title = "Percent of subjects who are fastest with RS", x=NULL, y=NULL)

If RS were usually fastest, it’d be above 50% for most conditions. But there is no condition where the majority of subjects are fastest with RS.

Which technique is fastest under each condition?

# for each dataset, stimulus, and subject, select only the fastest technique
data %>% 
  mutate(isCorrect2 = ifelse(isCorrect2, "Correct responses", "Incorrect responses")) %>% 
  group_by(problem_dataset, level, isCorrect2, subjectID) %>% 
  filter(problem_timespend == min(problem_timespend)) %>% 
  # for each dataset and stimulus, how many subjects are fastest with each technique?
  group_by(problem_dataset, level, isCorrect2, problem_sampling_initials, label, color) %>% 
  summarise( is_best = n() ) %>% 
  group_by(problem_dataset, level, isCorrect2) %>% 
  mutate(is_best = is_best / sum(is_best)) %>% 
  ungroup() %>% 
  ggplot() +
    aes(x = label, y = is_best, fill=color) +
    geom_col() +
    geom_text(aes(label = scales::percent(is_best, 1)), vjust=-0.5) +
    facet_grid(level * isCorrect2 ~ problem_dataset) +
    scale_y_continuous(limits = 0:1, expand = c(0,0), breaks = seq(0,1,.25), labels = scales::percent) +
    scale_fill_identity() +
    theme(
      #plot.margin = unit(c(0,0,0,0), "points"),
      axis.text.x = element_markdown(),
      axis.ticks.x = element_blank(), 
      panel.grid.major.x = element_blank(),
      panel.grid.minor = element_blank()
    ) +
    labs(title = "Percent of subjects for whom each technique is fastest", x=NULL, y=NULL)

No sampling technique stands out as consistently best in any condition.

Even with a discretized approach, claims that “RS is best” are not compelling.

5.2 Alternative analysis 2

When categorical variables like the dataset aren’t parameterized, it’s unclear how or if they should be pooled. Are they evenly sampling a parameter space? Or are most of them clumped together, while one is distinct? Maybe most of them are obscure types of datasets, and only one is commonly used? The lack of parameterization or weighting to base pooling on means that simply taking an average has no basis. But, let’s put that concern aside.

What if we use accuracy as a basis for pooling by separately analyzing correct answers and incorrect answers?
I don’t understand why stimulus 0 and stimulus 1 (level in the data) are so systematically different in raw performance, so I’m avoiding pooling that variable.

Note that this approach unevenly distributes subjects and datasets across conditions, which may bias the results. Interpret with caution.

# copy the data for the alternative analysis
data2 = data %>% 
  select(-adjusted_alpha, -p_value, -p_rank) %>% # drop previous analysis
  mutate(isCorrect2 = ifelse(isCorrect2, "Correct", "Incorrect")) # simpler labels

# p-values for adjusted alpha
p_values = data2 %>% 
  filter(problem_sampling_initials != "RS") %>% 
  group_by(problem_sampling_initials, level, isCorrect2) %>% 
  summarize(p_value = t.test(relative_timespend)$p.value) %>% 
  ungroup() %>% 
  # for Holm-Bonferroni, rank the p-values and use the rank to adjust the alpha
  arrange(desc(p_value)) %>% 
  mutate(p_rank = 1:n(), adjusted_alpha = ALPHA / p_rank)

# merge it into the data
data2 = data2 %>% left_join(p_values)

# put into purrr's nested table format and make bootstrap copies
boots2 = data2 %>% 
  # get one tibble per condition and DV
  group_by(problem_sampling_initials, label, color, level, isCorrect2, adjusted_alpha) %>% 
  nest() %>% 
  # duplicate each based on number of bootstrap iterations
  expand_grid(boot_index = 1:(BOOT_COUNT))

# bootstrapped means: randomly sample each set with replacement and compute the mean
boots2 = boots2 %>% 
  #randomly sample with replacement
  mutate(data = map(data, ~slice_sample(., prop = 1, replace = TRUE))) %>% 
  # get the mean of the relative response time
  mutate(data = map(data, ~mean(.$relative_timespend))) %>% 
  unnest(data) %>% 
  rename(relative_timespend = data)

# Holm-Bonferroni
CIs2 = boots2 %>% 
  filter(problem_sampling_initials != "RS") %>% 
  group_by(problem_sampling_initials, level, isCorrect2, adjusted_alpha, label, color) %>% 
  summarise(
    lo = quantile(relative_timespend, ALPHA/2),
    hi = quantile(relative_timespend, 1-ALPHA/2),
    lo_adjusted = quantile(relative_timespend, first(adjusted_alpha)/2),
    hi_adjusted = quantile(relative_timespend, 1-first(adjusted_alpha)/2)
  ) %>% 
  ungroup()
ggplot(boots2 %>% filter(problem_sampling_initials != "RS")) +
  aes(x = relative_timespend, y = fct_rev(label), fill=color) +
  # use RS as a gridline instead of plotting it
  geom_vline(xintercept = 0, size=1.5, color="#1f77b4") +
  # histograms
  #ggdist::stat_histinterval(breaks=seq(-1550, 1550, 10), point_size=0, interval_size = 0, justification = -0.1, scale=0.95) +
  # scales
  geom_segment(data=CIs2, aes(x=lo, xend=hi, yend = fct_rev(label), fill=NULL, color=color), size=2.5) +
  geom_segment(data=CIs2, aes(x=lo_adjusted, xend=hi_adjusted, yend = fct_rev(label), fill=NULL, color=color), size=1.25) +
  # colors, axes, etc.
  scale_color_identity() +
  scale_fill_identity() +
  scale_x_continuous(expand = c(0,0), breaks = c(-1500, 0, 1500), labels = c("faster", "<span style='color:#1f77b4'>RS</span>", "slower")) + 
  scale_y_discrete(expand = expansion(mult = 0, add = c(.2, .2))) + 
  coord_cartesian(xlim=c(-2000, 2000)) + # Prevent really wide CIs from squishing everything else
  facet_grid(isCorrect2 ~ level) +
  theme_gray(18) + theme(
    axis.text.x = element_markdown(),
    axis.text.y = element_markdown(),
    axis.ticks.x = element_blank(), 
    panel.grid = element_blank(), 
    plot.margin = unit(c(0,0,0,0), "points")
  ) +
  guides(color = "none", fill = "none") +
  labs(title = "Response time relative to Random Sampling", x=NULL, y=NULL)

This zoomed-in graph shows that BNS and DBS perform on par with RS and that the statistical difference does not meet the original article’s alpha threshold. When subjects are incorrect, some techniques perform MUCH more slowly than RS, which may have been driving the original article’s insinuation that RS performs best.

RS, BNS, and DBS all seem to perform similarly. Again, claims that “RS is best” are not compelling.

5.3 The problem with averaging across categories

Why not just average across categories? Because seemingly distinct categories often exist in a parameter space. What those parameters are and where the categories fall in that space can wildly swing the results. Are most clustered together with a couple outliers? Or are they evenly distributed for each parameter? If we don’t understand the parameters or distributions, we don’t understand how to pool together the categories as a random effect.

data = tibble(
  topping = c("Cheese", "Pickles", "Mustard", "Ground beef", "Oreos"),
  relative_preference = c(-3, -1.5, -4, -3, 1.5)
) %>% 
  expand_grid(subject_ID = 1:100) 

data = data %>% 
  mutate(topping = fct_inorder(factor(topping)))

data = data %>% 
  rowwise() %>% 
  mutate(relative_preference = relative_preference + rnorm(1, 0, 0.4)) %>% 
  ungroup()

data = data %>% 
  group_by(topping) %>% 
  summarise(
    lo = quantile(relative_preference, 0.025),
    med = quantile(relative_preference, 0.5),
    hi = quantile(relative_preference, 0.975)
  )

ggplot(data) +
  aes(y=topping, xmin=lo, x=med, xmax=hi) +
  geom_vline(xintercept = 0, color="gray", size=1, linetype = "22") +
  geom_pointrange(size=1.5) +
  scale_x_continuous(breaks = c(-4.5, 0, 4.5), labels = c("< worse than plain", "equal to plain", "better than plain >"), limits=c(-5.5, 5.5), minor_breaks = F, position = "top") +
  theme_minimal(20) +
  theme(
    panel.grid.major.y = element_line(linetype = "dotted"), 
    panel.grid.major.x = element_blank(), 
    plot.title = element_text(hjust = 0.5, margin = margin(b=2, unit="lines"))) +
  labs(
    y = NULL, x = NULL,
    title = "Preference of toppings over plain ice cream") +

tibble(y = "(average)", x = mean(data$med)) %>%
  ggplot() +
  aes(y=y, x=x) +
  geom_vline(xintercept = 0, color="gray", size=1, linetype = "22") +
  geom_point(size=7.5, shape=18) +
  scale_x_continuous(breaks = c(-4.5, 0, 4.5), labels = c("", "", ""), limits=c(-5.5, 5.5), minor_breaks = F) +
  theme_minimal(20) +
  theme(panel.grid.major.y = element_line(linetype = "dotted"), panel.grid.major.x = element_blank()) +
  labs(y = NULL, x = NULL) +

patchwork::plot_layout(ncol=1, heights = c(10, 1)) +
patchwork::plot_annotation(
  caption = "Can we conclude that ice cream with toppings is worse than plain ice cream? No.\nOreos are a preferred topping, and ground beef is a detested topping.\nBoth are true, and they don't cancel each other out.\nAggregating across categories is not always informative.",
  theme = theme(
    plot.margin = margin(10, 10, 15, 10),
    plot.caption = element_text(size=16, hjust = 0, lineheight = 1.1, margin = margin(1, unit = "lines"))))

6 Acknowledgements

Thanks to Matthew Kay, Kimberly Quinn, Brenton Wiernik, Pierre Dragicevic, Lonni Besançon, Lucija Batinovic, and Alex Holcombe for helpful comments.

7 Reproducibility

sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices datasets  utils     methods   base     
## 
## other attached packages:
##  [1] apaTables_2.0.8 MBESS_4.8.1     broom_0.7.10    ggdist_3.0.1   
##  [5] ragg_1.2.1      glue_1.5.1      ggtext_0.1.1    patchwork_1.1.1
##  [9] forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7     purrr_0.3.4    
## [13] readr_2.1.0     tidyr_1.1.4     tibble_3.1.6    ggplot2_3.3.5  
## [17] tidyverse_1.3.1
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2           jsonlite_1.7.2       modelr_0.1.8        
##  [4] assertthat_0.2.1     distributional_0.2.2 highr_0.9           
##  [7] renv_0.14.0          cellranger_1.1.0     yaml_2.2.1          
## [10] pillar_1.6.4         backports_1.3.0      digest_0.6.28       
## [13] gridtext_0.1.4       rvest_1.0.2          colorspace_2.0-2    
## [16] htmltools_0.5.2      pkgconfig_2.0.3      haven_2.4.3         
## [19] scales_1.1.1         tzdb_0.2.0           generics_0.1.1      
## [22] farver_2.1.0         ellipsis_0.3.2       withr_2.4.2         
## [25] cli_3.1.0            magrittr_2.0.1       crayon_1.4.2        
## [28] readxl_1.3.1         evaluate_0.14        fs_1.5.0            
## [31] fansi_0.5.0          xml2_1.3.2           textshaping_0.3.6   
## [34] tools_4.1.2          hms_1.1.1            lifecycle_1.0.1     
## [37] munsell_0.5.0        reprex_2.0.1         compiler_4.1.2      
## [40] jquerylib_0.1.4      systemfonts_1.0.3    rlang_0.4.12        
## [43] grid_4.1.2           rstudioapi_0.13      labeling_0.4.2      
## [46] rmarkdown_2.11       gtable_0.3.0         DBI_1.1.1           
## [49] markdown_1.1         R6_2.5.1             lubridate_1.8.0     
## [52] knitr_1.36           fastmap_1.1.0        utf8_1.2.2          
## [55] stringi_1.7.6        Rcpp_1.0.7           vctrs_0.3.8         
## [58] dbplyr_2.1.1         tidyselect_1.1.1     xfun_0.27