r combinations permutation combinatorics

Manually calculate all coin flip probabilities from real data


I have this coin flip data:

library(dplyr)
library(knitr)
library(kableExtra)

# Fix the RNG seed so the simulated flips below are reproducible.
set.seed(123)

# Simulate n_flips coin flips by sampling "H"/"T" with replacement.
n_flips <- 100
flips <- sample(c("H", "T"), n_flips, replace = TRUE)

I manually calculate all conditional probabilities and summarise the results in a table:

#' Estimate the probability of the next flip given a preceding pattern.
#'
#' Slides a window of `nchar(sequence)` flips over `data`, keeps the windows
#' equal to `sequence`, and tabulates the flip that immediately follows each.
#' Only windows that have a following flip are considered, so a pattern that
#' ends on the last element of `data` is not counted.
#'
#' @param sequence Pattern to look for, as a single string such as "HT".
#' @param data Character vector of single-flip outcomes ("H" / "T").
#' @return Unnamed numeric vector `c(P(H | pattern), P(T | pattern), count)`.
#'   Both probabilities are `NA` when the pattern never occurs (count 0),
#'   including when `data` is too short to hold the pattern plus a next flip.
get_conditional_prob <- function(sequence, data) {
  n <- nchar(sequence)
  n_windows <- length(data) - n

  # Guard: without room for the pattern plus one following flip there is
  # nothing to count. (`character()` rejects a negative length, and `1:0`
  # would iterate over c(1, 0) instead of not iterating at all.)
  if (n_windows < 1) {
    return(c(NA_real_, NA_real_, 0))
  }

  windows <- character(n_windows)
  next_outcomes <- character(n_windows)

  for (i in seq_len(n_windows)) {
    windows[i] <- paste(data[i:(i + n - 1)], collapse = "")
    next_outcomes[i] <- data[i + n]
  }

  matches <- windows == sequence
  count <- sum(matches)

  if (count == 0) {
    return(c(NA_real_, NA_real_, 0))
  }

  next_after_matches <- next_outcomes[matches]
  c(
    mean(next_after_matches == "H"),
    mean(next_after_matches == "T"),
    count
  )
}

# Hand-enumerated H/T patterns, one vector per pattern length (1 to 5).
# Each vector lists all 2^k patterns of its length k: 2, 4, 8, 16 and 32.
sequences_1 <- c("H", "T")
sequences_2 <- c("HH", "HT", "TH", "TT")
sequences_3 <- c("HHH", "HHT", "HTH", "HTT", "THH", "THT", "TTH", "TTT")
sequences_4 <- c("HHHH", "HHHT", "HHTH", "HHTT", "HTHH", "HTHT", "HTTH", "HTTT",
                "THHH", "THHT", "THTH", "THTT", "TTHH", "TTHT", "TTTH", "TTTT")
sequences_5 <- c("HHHHH", "HHHHT", "HHHTH", "HHHTT", "HHTHH", "HHTHT", "HHTTH", "HHTTT",
                "HTHHH", "HTHHT", "HTHTH", "HTHTT", "HTTHH", "HTTHT", "HTTTH", "HTTTT",
                "THHHH", "THHHT", "THHTH", "THHTT", "THTHH", "THTHT", "THTTH", "THTTT",
                "TTHHH", "TTHHT", "TTHTH", "TTHTT", "TTTHH", "TTTHT", "TTTTH", "TTTTT")

# All 62 patterns in one vector, ordered by increasing length.
all_sequences <- c(sequences_1, sequences_2, sequences_3, sequences_4, sequences_5)

# Build one result row per pattern, then bind them all in a single call.
# Growing a data frame with rbind() inside a loop re-copies every earlier row
# on each iteration; binding once avoids that. The iteration variable is named
# `pattern` so it does not shadow base::seq().
empty_results <- data.frame(
  Sequence = character(),
  Next_H = numeric(),
  Next_T = numeric(),
  Count = numeric(),
  stringsAsFactors = FALSE
)

result_rows <- lapply(all_sequences, function(pattern) {
  # probs is c(P(H | pattern), P(T | pattern), occurrence count).
  probs <- get_conditional_prob(pattern, flips)
  data.frame(
    Sequence = pattern,
    Next_H = probs[1],
    Next_T = probs[2],
    Count = probs[3],
    stringsAsFactors = FALSE
  )
})

# Keeping the empty, typed frame first means `results` is still a well-formed
# zero-row data frame when all_sequences is empty.
results <- do.call(rbind, c(list(empty_results), result_rows))

# Display version of `results`: add the pattern length, round both
# probabilities to 3 decimals, sort by length then pattern, and relabel the
# columns for presentation.
results_formatted <- results %>%
  mutate(Length = nchar(Sequence)) %>%
  mutate(
    Next_H = round(Next_H, digits = 3),
    Next_T = round(Next_T, digits = 3)
  ) %>%
  arrange(Length, Sequence) %>%
  select(
    Pattern = Sequence,
    Length,
    `P(H|Pattern)` = Next_H,
    `P(T|Pattern)` = Next_T,
    Occurrences = Count
  )

# Render the formatted results as an HTML table. Rows are grouped by pattern
# length; the row ranges below are hard-coded for the 2 + 4 + 8 + 16 + 32
# patterns of lengths 1 to 5.
kable(
  results_formatted,
  format = "html",
  caption = "Conditional Probabilities in Coin Flip Sequence (up to length 5)",
  align = c("l", rep("c", 4))
) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    full_width = FALSE,
    position = "left",
    font_size = 12
  ) %>%
  add_header_above(
    header = c(" " = 2, "Conditional Probabilities" = 2, " " = 1)
  ) %>%
  row_spec(row = 0, bold = TRUE) %>%
  pack_rows(group_label = "Single Flip", start_row = 1, end_row = 2) %>%
  pack_rows(group_label = "Two Flips", start_row = 3, end_row = 6) %>%
  pack_rows(group_label = "Three Flips", start_row = 7, end_row = 14) %>%
  pack_rows(group_label = "Four Flips", start_row = 15, end_row = 30) %>%
  pack_rows(group_label = "Five Flips", start_row = 31, end_row = 62)

enter image description here

Is there something I can do that automatically calculates all combinations without having to manually enumerate them? This will get quite lengthy to manually enumerate all combinations and then feed them into the code. Is there an easier way to do this?


Solution

  • flips <- c("H","T")
    
    library(gtools)
    # For each length r in 1..5, enumerate every length-r arrangement of the
    # two outcomes with repetition allowed (one arrangement per matrix row),
    # collapse each row into a single string, then flatten the per-length
    # results into one character vector.
    # NOTE(review): `repeats=` relies on partial matching of gtools'
    # `repeats.allowed` argument; spelling it out would be more robust.
    lapply(1:5, \(r) 
           apply(permutations(2, r, flips, repeats=TRUE), 1, paste, collapse="")) |>
      unlist()
    
     [1] "H"     "T"     "HH"    "HT"    "TH"    "TT"    "HHH"   "HHT"  
     [9] "HTH"   "HTT"   "THH"   "THT"   "TTH"   "TTT"   "HHHH"  "HHHT" 
    [17] "HHTH"  "HHTT"  "HTHH"  "HTHT"  "HTTH"  "HTTT"  "THHH"  "THHT" 
    [25] "THTH"  "THTT"  "TTHH"  "TTHT"  "TTTH"  "TTTT"  "HHHHH" "HHHHT"
    [33] "HHHTH" "HHHTT" "HHTHH" "HHTHT" "HHTTH" "HHTTT" "HTHHH" "HTHHT"
    [41] "HTHTH" "HTHTT" "HTTHH" "HTTHT" "HTTTH" "HTTTT" "THHHH" "THHHT"
    [49] "THHTH" "THHTT" "THTHH" "THTHT" "THTTH" "THTTT" "TTHHH" "TTHHT"
    [57] "TTHTH" "TTHTT" "TTTHH" "TTTHT" "TTTTH" "TTTTT"