Let's say we have the question "Why are you not happy?" with 5 possible answers (1, 2, 3, 4, 5):
# Single-choice case: one row per subject, each holding exactly one of the
# answer codes 1-5.
s <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(1, 2, 4, 5, 1, 2, 4, 3, 2, 1, 3, 4)
)
In the previous example every subject picked only one option, but let's say that subjects 3, 7 and 10 each picked more than one option.
I want to code the options of this question so that the multiple answers of these 3 subjects are taken into account, while preserving the shape of the data frame.
The next case is if the dataframe includes 2 questions as follows :
# Multiple-choice case with two questions. Answers are stored as text, with
# several picks by the same subject joined by commas (e.g. "1,2,5").
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(
    "1", "2", "1,2,5", "5", "1", "2",
    "3,4", "3", "2", "1,5", "3", "4"
  ),
  why_are_you_sad = c(
    "1,2,3", "1", "2", "3", "4,5,3", "2",
    "1", "4", "3", "1", "1", "1"
  )
)
How can we properly code the answers in the first and second scenarios? The objective is to apply multiple correspondence analysis (MCA).
Thank you
With your updated example data you have (at least) two options: you can separate each column, or you can pivot_longer()
the data and group the "scores" together. E.g.
library(tidyr)
# Example data: two multiple-choice questions. Every answer is kept as a
# string, and multiple picks are comma-separated within one cell.
not_happy_answers <- c("1", "2", "1,2,5", "5", "1", "2",
                       "3,4", "3", "2", "1,5", "3", "4")
sad_answers <- c("1,2,3", "1", "2", "3", "4,5,3", "2",
                 "1", "4", "3", "1", "1", "1")
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = not_happy_answers,
  why_are_you_sad = sad_answers
)
df
#> subjects Why_are_you_not_happy why_are_you_sad
#> 1 1 1 1,2,3
#> 2 2 2 1
#> 3 3 1,2,5 2
#> 4 4 5 3
#> 5 5 1 4,5,3
#> 6 6 2 2
#> 7 7 3,4 1
#> 8 8 3 4
#> 9 9 2 3
#> 10 10 1,5 1
#> 11 11 3 1
#> 12 12 4 1
# Wide coding: split each comma-separated answer column into three columns
# (<question>_1 .. <question>_3). Cells with fewer than three answers are
# padded with NA, which is what triggers the "Expected 3 pieces" warnings.
df1 <- df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Why_are_you_not_happy_", 1:3),
    sep = ","
  ) %>%
  separate(
    col = why_are_you_sad,
    into = paste0("why_are_you_sad_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 10 rows [2, 3, 4,
#> 6, 7, 8, 9, 10, 11, 12].
df1
#> subjects Why_are_you_not_happy_1 Why_are_you_not_happy_2
#> 1 1 1 <NA>
#> 2 2 2 <NA>
#> 3 3 1 2
#> 4 4 5 <NA>
#> 5 5 1 <NA>
#> 6 6 2 <NA>
#> 7 7 3 4
#> 8 8 3 <NA>
#> 9 9 2 <NA>
#> 10 10 1 5
#> 11 11 3 <NA>
#> 12 12 4 <NA>
#> Why_are_you_not_happy_3 why_are_you_sad_1 why_are_you_sad_2
#> 1 <NA> 1 2
#> 2 <NA> 1 <NA>
#> 3 5 2 <NA>
#> 4 <NA> 3 <NA>
#> 5 <NA> 4 5
#> 6 <NA> 2 <NA>
#> 7 <NA> 1 <NA>
#> 8 <NA> 4 <NA>
#> 9 <NA> 3 <NA>
#> 10 <NA> 1 <NA>
#> 11 <NA> 1 <NA>
#> 12 <NA> 1 <NA>
#> why_are_you_sad_3
#> 1 3
#> 2 <NA>
#> 3 <NA>
#> 4 <NA>
#> 5 3
#> 6 <NA>
#> 7 <NA>
#> 8 <NA>
#> 9 <NA>
#> 10 <NA>
#> 11 <NA>
#> 12 <NA>
This is what I think you should use for MCA, e.g.
library(FactoMineR)
library(factoextra)
#> Loading required package: ggplot2
# Run MCA on the answer columns only: drop the first column (subjects) and
# keep the remaining six.
results <- MCA(df1[, -1])
# Check eigenvalues to see %var for each dimension
fviz_eig(results)
Second approach for handling the data that 'works better' for plotting with e.g. ggplot:
# Long coding: one row per subject/question pair, then split the
# comma-separated answers into Score_1 .. Score_3 (NA-padded on the right).
df2 <- df %>%
  pivot_longer(
    cols = -subjects,
    names_to = "Category",
    values_to = "Score"
  ) %>%
  separate(
    col = Score,
    into = paste0("Score_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 21 rows [1, 3, 4,
#> 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, ...].
df2
#> # A tibble: 24 × 5
#> subjects Category Score_1 Score_2 Score_3
#> <int> <chr> <chr> <chr> <chr>
#> 1 1 Why_are_you_not_happy 1 <NA> <NA>
#> 2 1 why_are_you_sad 1 2 3
#> 3 2 Why_are_you_not_happy 2 <NA> <NA>
#> 4 2 why_are_you_sad 1 <NA> <NA>
#> 5 3 Why_are_you_not_happy 1 2 5
#> 6 3 why_are_you_sad 2 <NA> <NA>
#> 7 4 Why_are_you_not_happy 5 <NA> <NA>
#> 8 4 why_are_you_sad 3 <NA> <NA>
#> 9 5 Why_are_you_not_happy 1 <NA> <NA>
#> 10 5 why_are_you_sad 4 5 3
#> # … with 14 more rows
library(ggplot2)
# Convert subjects from an integer to a factor so the x axis is discrete
# (one tile column per subject).
df2$subjects <- factor(df2$subjects)

# Readable facet/legend labels, keyed by the raw question column names.
group_labels <- c(
  "Why_are_you_not_happy" = "Why are you not happy?",
  "why_are_you_sad" = "Why are you sad?"
)

df2 %>%
  # Stack the Score_* columns so every selected answer becomes its own row.
  pivot_longer(-c(subjects, Category),
               names_to = "Score_number",
               values_to = "Answer") %>%
  # Remove rows containing NA (the padding left by splitting the answers).
  na.omit() %>%
  ggplot(aes(x = subjects, y = Answer, fill = Category)) +
  geom_tile(color = "white") +
  # Grid lines between tiles. Discrete positions are 1, 2, ..., so the
  # boundaries sit at the half-integers; the ranges are hard-coded for this
  # example (12 subjects on x, answer codes 1-5 on y).
  geom_vline(xintercept = seq(0.5, 11.5, 1), color = "black") +
  geom_hline(yintercept = seq(0.5, 4.5, 1), color = "black") +
  scale_fill_discrete(labels = group_labels, name = "") +
  theme_bw(base_size = 16) +
  theme(legend.position = "none", panel.grid = element_blank()) +
  # `expand` is a logical flag, so pass FALSE rather than the number 0.
  coord_cartesian(expand = FALSE) +
  # One panel per question, stacked vertically, with readable strip labels.
  facet_wrap(~Category, nrow = 2,
             labeller = labeller(Category = group_labels))
Created on 2022-10-06 by the reprex package (v2.0.1)
It sounds like you want the separate()
function from the tidyr package, e.g.
library(tidyr)
# Single question where some subjects picked several options; all answers are
# stored as text and multiple picks are comma-separated.
df <- data.frame(
  subjects = 1:12,
  Why_are_you_not_happy = c(
    "1", "2", "1,2,5", "5", "1", "2",
    "3,4", "3", "2", "1,5", "3", "4"
  )
)
df
#> subjects Why_are_you_not_happy
#> 1 1 1
#> 2 2 2
#> 3 3 1,2,5
#> 4 4 5
#> 5 5 1
#> 6 6 2
#> 7 7 3,4
#> 8 8 3
#> 9 9 2
#> 10 10 1,5
#> 11 11 3
#> 12 12 4
# Split the comma-separated answers into Answer_1 .. Answer_3; subjects with
# fewer than three answers get NA in the unused columns (hence the warning).
df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Answer_", 1:3),
    sep = ","
  )
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#> subjects Answer_1 Answer_2 Answer_3
#> 1 1 1 <NA> <NA>
#> 2 2 2 <NA> <NA>
#> 3 3 1 2 5
#> 4 4 5 <NA> <NA>
#> 5 5 1 <NA> <NA>
#> 6 6 2 <NA> <NA>
#> 7 7 3 4 <NA>
#> 8 8 3 <NA> <NA>
#> 9 9 2 <NA> <NA>
#> 10 10 1 5 <NA>
#> 11 11 3 <NA> <NA>
#> 12 12 4 <NA> <NA>
Or, perhaps in long format? E.g.
# Same split as above, then reshape to long format: one row per subject and
# answer slot, with the NA-padded rows removed.
df %>%
  separate(
    col = Why_are_you_not_happy,
    into = paste0("Answer_", 1:3),
    sep = ","
  ) %>%
  pivot_longer(cols = -subjects) %>%
  na.omit()
#> Warning: Expected 3 pieces. Missing pieces filled with `NA` in 11 rows [1, 2, 4,
#> 5, 6, 7, 8, 9, 10, 11, 12].
#> # A tibble: 16 × 3
#> subjects name value
#> <int> <chr> <chr>
#> 1 1 Answer_1 1
#> 2 2 Answer_1 2
#> 3 3 Answer_1 1
#> 4 3 Answer_2 2
#> 5 3 Answer_3 5
#> 6 4 Answer_1 5
#> 7 5 Answer_1 1
#> 8 6 Answer_1 2
#> 9 7 Answer_1 3
#> 10 7 Answer_2 4
#> 11 8 Answer_1 3
#> 12 9 Answer_1 2
#> 13 10 Answer_1 1
#> 14 10 Answer_2 5
#> 15 11 Answer_1 3
#> 16 12 Answer_1 4
Created on 2022-10-05 by the reprex package (v2.0.1)