rdataframelistdplyrpivot-table

My column values show as (lists) and not (character/numeric/integer) after using pivot_wider


A disclamire before I start, I am still very new to R; therefore, if I am using wrong terminology, please feel free to correct me, and if more information is needed to solve this, please let me know.

I am currently working on a dataset, below is a code to generate a small part of it (excluding some variables and most observations):

structure(list(ED1_SEN = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), Date = c("09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", "09/12/2020", 
"09/12/2020", "09/12/2020"), Household_ID = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Trap = c("MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", 
"MET"), Trap_type = c("MET", "MET", "MET", "MET", "MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", "MET", 
"MET", "MET", "MET", "MET", "MET", "MET"), Location = c("Indoor", 
"Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", 
"Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", 
"Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", 
"Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", "Indoor", 
"Indoor"), Round = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L), Season = c("Dry", "Dry", "Dry", "Dry", "Dry", 
"Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", 
"Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry", 
"Dry", "Dry", "Dry", "Dry", "Dry", "Dry", "Dry"), Volunteer_initial = c("AY", 
"AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", 
"AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", "AY", 
"AY", "AY", "AY", "AY", "AY", "AY", "AY"), Temperature = c(29.2, 
29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 
29.2, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 28.4, 28.4, 
28.4, 28.4, 28.4, 28.2, 28.2, 28.2, 28.2), Humidity = c(75L, 
75L, 75L, 75L, 75L, 75L, 75L, 75L, 75L, 75L, 75L, 75L, 75L, 83L, 
83L, 83L, 83L, 83L, 83L, 83L, 83L, 78L, 78L, 78L, 78L, 78L, 79L, 
79L, 79L, 79L), Wind_speed = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 
    Latitude = c(-8.1473, -8.1473, -8.1473, -8.1473, -8.1473, 
    -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, 
    -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, 
    -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, -8.1473, 
    -8.1473, -8.1473, -8.1473, -8.1473), Longitude = c(39.1771, 
    39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 
    39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 
    39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 
    39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 39.1771, 
    39.1771), Elevation = c(NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
    NA_real_, NA_real_), Grouped_species = c("CULEX", "CULEX", 
    "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", 
    "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", 
    "CULEX", "CULEX", "CULEX", "CULEX", "CULEX", "MANSONIA", 
    "MANSONIA", "MANSONIA", "MANSONIA", "MANSONIA", "MANSONIA", 
    "MANSONIA", "MANSONIA", "MANSONIA"), Group_malaria_transmissionC = c("Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors", "Non_malaria_vectors", 
    "Non_malaria_vectors", "Non_malaria_vectors")), row.names = c(NA, 
30L), class = "data.frame")

I wanted R to transpose the "Grouped_species" column into different columns each representing a different mosquite species, and I wanted it to give a value of 1 in case a specific species was collected and 0 if not (I will group my data later and sum the total number of mosquitoes collected from different houses).

To do that, I used the pivot_wider function. However, as I know that each row in my dataset = one collected mosquito, I wanted to add a new column, call it "Count", and populate it with "1" so I can use it later for the "values_from" argument. For that, I used this code:

Count <- as.character('1')
house1_comp <- data.frame(cbind(house1_comp, Count))

After I added the new column, I used this code to get to the wide format:

house1_comp_mod <- house1_comp %>% 
  pivot_wider(names_from = Grouped_species,
              values_from = Count)

The resulting table can be found in the attached picture. New dataframe after using pivot_wider

Here, I can see that the values of the new columns "Culex" and "Mansonia" are a list and not a character. Also, I can see that I have lost most observations (I must have retained the same number of observations at this satge).

I tried to add the values of the new column as numeric, character, and integer, and all turned to "list" when I used pivot_wider.

I would appreciate any advice or help on what is going on.


Solution

  • To create two new columns which have 1 or 0 depending on the species you already had the right idea! Yes, you need to add a new column "count" and fill it with ones:

    house1_comp <- list %>% mutate(Count = 1) --> adds a new column with 1 values

    pivot_wider() needs a unique identifier for each row. Right now we don't have that. So let's create one like this: We need to create an "id" column and fill it with the row numbers:

    house1_comp <- house1_comp %>% mutate(id = row_number() --> adds a new column with 1,2,3,4 ... 30 values for each row

    After this is prepared, we can finally use pivot_wider(): We take the values from Count as "values_from" and Grouped_species as "names_from". You should use values_fill = 0 to fill all remaining values with zeros instead of NAs.

    Let's do it all in one go:

    install.packages("rtools", "tidyr", "dplyr")
    
    library(tidyr)
    library(dplyr)
    
    list <- data.frame(
      ED1_SEN = rep(1L, 30),
      Date = rep("09/12/2020", 30),
      Household_ID = rep(1L, 30),
      Trap = rep("MET", 30),
      Trap_type = rep("MET", 30),
      Location = rep("Indoor", 30),
      Round = rep(1L, 30),
      Season = rep("Dry", 30),
      Volunteer_initial = rep("AY", 30),
      Temperature = c(29.2,  29.2,29.2,  29.2,29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 29.2, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 27.6, 28.4, 28.4, 28.4,
        28.4, 28.4, 28.2, 28.2,  28.2, 28.2 ),
      Humidity = c( 75L,75L,75L,75L,75L,75L,75L,75L,75L,75L,75L,75L,75L,83L,83L,83L,83L,83L,83L,83L,83L,78L,78L,78L,78L,78L,79L,79L,79L,79L),
      Wind_speed = rep(0, 30),
      Latitude = rep(-8.1473, 30),
      Longitude = rep(39.1771, 30),
      Elevation = rep(NA_real_, 30),
      Grouped_species = c(rep("CULEX", 21),rep("MANSONIA", 9)),
      Group_malaria_transmissionC = rep("Non_malaria_vectors",30)
    )
    
    
    wider_list <- list %>%      
      mutate(id = row_number(),  # we need a unique row identifier
             count = 1) %>%      # add the count column!
      pivot_wider(
        names_from = Grouped_species, # New column names from Grouped_species
        values_from = count,
        values_fill = 0 
      ) %>% select(-id) # remove unique row id column
    

    Finally we get the required output wider_list: enter image description here

    PS.: Looking at your dataset it would make sense to group by Household, Temperature and Humidity since these columns actually vary. You can then count the occurrences of each species for the groups and pivot wider:

    Household_ID_Temperature_Humidity_species_counts <- list %>%
      group_by(Household_ID, Temperature, Humidity) %>% # list all columns you want to include in the final table :)
      count(Grouped_species) %>% # Count occurrences of each species
      pivot_wider(
        names_from = Grouped_species, # New column names from Grouped_species
        values_from = n,              # Fill with the counts
        values_fill = 0               # Fill missing values with 0
      )