Tags: r, sorting, random, sample

Randomly sample a specific number of elements from columns of different lengths


I have a tibble (data frame) of dimensions 1042 × 64. Each column is an amphibian family, and its rows hold the names of all species in that family. The first 5 rows and 2 columns look like this:

> amphilist[1:5,1:2]
A tibble: 5 × 2
  `Allophrynidae_(1_genus;_3_species)` `Alsodidae_(3_genera;_26_species)`
  <chr>                                <chr>                             
1 Allophryne_relicta                   Alsodes_australis                 
2 Allophryne_resplendens               Alsodes_barrioi                   
3 Allophryne_ruthveni                  Alsodes_cantillanensis            
4 NA                                   Alsodes_coppingeri                
5 NA                                   Alsodes_gargola 

The families have different numbers of species in them: the biggest contains 1,042 species and the smallest only 1. Every column is padded with NAs up to 1,042 rows, except the one family that has 1,042 species. I need to randomly draw a certain number of species from each family for the next step of my analysis; however, I keep getting NAs for all my columns, even the one that has no NAs in it. Here's what I've done so far:

I created a loop to get the species richness by family (spcR) and saved it in the data frame "species_no". Then, with an 'ifelse' clause, I computed the number of species I need from each family and saved it in the "choose" column of the same data frame:

# Read the species table: one column per family, one species name per cell,
# with shorter families padded with NA.
amphilist <- read_xlsx("amphilist.xlsx", col_names = TRUE)

families <- colnames(amphilist)
family_n <- ncol(amphilist)
# Preallocate the species-richness vector as numeric (one entry per family).
spcR <- numeric(family_n)

# Species richness = number of non-missing names in each family column.
# seq_along() keeps the loop from running when there are no columns, and
# is.na() counts the names directly instead of comparing strings with 0.
for (i in seq_along(families)) {
  families.i <- families[i]
  spcR[i] <- sum(!is.na(amphilist[[families.i]]))
}

species_no <- data.frame(families, spcR)
# Number of species to draw per family:
#   more than 50 species -> one tenth of them, rounded up
#   5 to 50 species      -> 5
#   fewer than 5 species -> all of them
species_no$choose <- ifelse(
  species_no$spcR > 50,
  ceiling(species_no$spcR / 10),
  ifelse(species_no$spcR >= 5 & species_no$spcR <= 50, 5, species_no$spcR)
)

> species_no[1:3,]
                                        families spcR choose
1             Allophrynidae_(1_genus;_3_species)    3      3
2               Alsodidae_(3_genera;_26_species)   26      5
3 Alytidae_(2_subfamilies;_3_genera;_12_species)   12      5

This is where I'm stuck and getting NAs. I created a vector with the number of elements I need from each family, but I can't get the random selection to work. I want to draw a number of species from each column, as defined by the choose_no vector, ignoring the NAs:

# Attempt to draw `choose_no[i]` species at random from each family column.
# Sample sizes come from the `choose` column of species_no.
choose_no <- species_no$choose
# Fix the RNG seed so the draw is repeatable.
set.seed(43)
for(i in 1:length(families)) {
  # families.i is assigned here but not used further inside this loop.
  families.i <- families[i]
  choose_no.i <- choose_no[i]
  # NOTE(review): sample() is given column i as-is, so the NA padding is not
  # removed before drawing. Its result (species names, not row positions) is
  # then used as a row index into the whole `amphilist` -- presumably no row
  # matches those values, which would explain the all-NA rows; confirm.
  # NOTE(review): rand_amphilist is overwritten on every iteration, so only
  # the last family's result is left after the loop.
  rand_amphilist <- amphilist[sample(amphilist[,i], 
                                     size = choose_no.i), ]
}

Can someone help me, please? Thank you very much!


Solution

  • # SETUP 
    # load lib
    library(tidyverse)
    
    # example data: one column per category, padded with NA to a common length
    amphilist <- tribble(
      ~"Allophrynidae_(1_genus;_3_species)", ~"Alsodidae_(3_genera;_26_species)",
      "Allophryne_relicta",                  "Alsodes_australis",
      "Allophryne_resplendens",              "Alsodes_barrioi",
      "Allophryne_ruthveni",                 "Alsodes_cantillanensis",
      NA,                                    "Alsodes_coppingeri",
      NA,                                    "Alsodes_gargola"
    )

    # make it long: one row per (category, entry) pair. Dropping the NA
    # padding while pivoting leaves 8 rows here (3 + 5), not 5 x 2 = 10.
    amphilist_long <- amphilist |>
      pivot_longer(
        cols = everything(),
        names_to = "category",
        values_to = "entry",
        values_drop_na = TRUE
      )

    # a named vector with the number of values to extract per category:
    # names are the categories, values are the sample sizes
    choose_no <- c(
      "Allophrynidae_(1_genus;_3_species)" = 1L,
      "Alsodidae_(3_genera;_26_species)" = 2L
    )
    choose_no

    set.seed(42) # for reproducibility

    # the main event
    # imap() hands each element of choose_no to the function as
    # (value, name), i.e. (sample size, category). For each category, keep
    # only its rows and draw the requested number of entries; bind_rows()
    # then stacks the per-category results row-wise into one data frame.
    samp_list <- choose_no |>
      imap(\(n_draw, cat_name) {
        amphilist_long |>
          filter(category == cat_name) |>
          slice_sample(n = n_draw)
      }) |>
      bind_rows()
    samp_list