From a tsv file, I have two tables.
For example,
table 1: V_one
readId | V_segment |
---|---|
abc | IGHV3-18-F(2695) |
ghi | IGHV3-9-F(2840) |
and
table 2: V_two
readId | V_segment |
---|---|
aaa | IGHV3-9-F(2714),IGHV3-12-F(2656) |
bbb | IGHV3-10-F(2666),IGHV3-18-F(2666) |
And I need to create another table where, for each unique value in the column "V_segment" of these new two tables, I get a count of the times every unique value appears in the column "V_segment" in the tsv file.
For example:
V_segment | Count |
---|---|
IGHV3-18-F | 3 |
IGHV3-9-F | 15 |
IGHV3-9-F,IGHV3-12-F | 4 |
IGHV3-10-F,IGHV3-18-F | 7 |
I tried this code:
df <- read_tsv('file.tsv') # tsv file with lots of columns
segmV <- data.frame(df$readId, df$V_segment, stringsAsFactors = FALSE) # create a data frame with two columns of the tsv file
V_one <- segmV[!grepl(",", df$V_segment),] # table 1
V_two <- segmV[grepl(",", df$V_segment),] # table 2
v1 <- V_one %>%
group_by(V_one$V_segment) %>%
mutate(Count = n_distinct(V_one$V_segment))
But it's not working, since (for this) I need to ignore the information in the parenthesis.
> dput(head(V_one, n=10))
structure(list(readId = c("abc", "def",
"ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"),
V_segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)",
"IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)",
"IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)",
"IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L,
2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame")
> dput(head(V_two, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff",
"ggg", "hhh", "iii", "jjj"),
V_segment = c("IGHV3-10-F(2429),IGHV3-12-F(2429)",
"IGHV3-9-F(2714),IGHV3-12-F(2656)", "IGHV1-16-ORF(2920),IGHV1-19-F(2920)",
"IGHV3-17-F(2512),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)",
"IGHV1-8-F(2901),IGHV1-21-F(2814)", "IGHV3-10-F(2685),IGHV3-18-F(2685)",
"IGHV3-12-F(2801),IGHV3-20-F(2743)", "IGHV3-20-F(2754),IGHV3-10-F(2725)",
"IGHV3-10-F(2714),IGHV3-18-F(2714)")), row.names = c(3L,
5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")
> dput(head(V_table3, n=10))
structure(list(readId = c("aaa", "bbb", "ccc", "ddd", "eee", "fff",
"ggg", "hhh", "iii", "jjj"),
V_segment = c("IGHV3-10-F(242,9),IGHV3-12-F(242,9)",
"IGHV3-9-F(271,4),IGHV3-12-F(265,6)", "IGHV1-16-ORF(292,0),IGHV1-19-F(2920)",
"IGHV3-17-F(251,2),IGHV3-18-F(2512)", "IGHV3-10-F(2666),IGHV3-18-F(2666)",
"IGHV1-8-F(290,1),IGHV1-21-F(281,4)", "IGHV3-10-F(2685),IGHV3-18-F(2685)",
"IGHV3-12-F(2801),IGHV3-20-F(274,3)", "IGHV3-20-F(2754),IGHV3-10-F(272,5)",
"IGHV3-10-F(2714),IGHV3-18-F(271,4)")), row.names = c(3L,
5L, 6L, 7L, 8L, 10L, 12L, 13L, 15L, 16L), class = "data.frame")
Could you help me please?
Thank you in advance!
library(dplyr)
bind_rows(mget(ls(pattern = "V_"))) %>%
count(V_segment)
# A tibble: 4 × 2
V_segment n
<chr> <int>
1 IGHV3-10-F(2666),IGHV3-18-F(2666) 1
2 IGHV3-18-F(2695) 1
3 IGHV3-9-F(2714),IGHV3-12-F(2656) 1
4 IGHV3-9-F(2840) 1
Individual segments:
bind_rows(mget(ls(pattern = "V_"))) %>%
mutate(across(V_segment, ~ str_remove_all(.x, "\\(.*?\\)"))) %>%
count(V_segment)
# A tibble: 4 × 2
V_segment n
<chr> <int>
1 IGHV3-10-F,IGHV3-18-F 1
2 IGHV3-18-F 1
3 IGHV3-9-F 1
4 IGHV3-9-F,IGHV3-12-F 1
With the provided sample data:
V_one <- structure(list(readId = c("abc", "def",
"ghi", "jkl", "mno", "pqr", "stu", "vwy", "zab", "cde"),
V_Segment = c("IGHV3-18-F(2695)", "IGHV3-18-F(2782)",
"IGHV3-18-F(2772)", "IGHV3-18-F(1952)", "IGHV3-15-F(2792)",
"IGHV3-18-F(2558)", "IGHV3-9-F(2831)", "IGHV3-18-F(2221)",
"IGHV3-18-F(2812)", "IGHV3-15-F(2791)")), row.names = c(1L,
2L, 4L, 9L, 11L, 14L, 17L, 19L, 24L, 25L), class = "data.frame") %>%
as_tibble()
V_one %>%
mutate(V_Segment = str_remove(V_Segment, "\\(.*?\\)")) %>%
count(V_Segment)
# A tibble: 3 × 2
V_Segment n
<chr> <int>
1 IGHV3-15-F 2
2 IGHV3-18-F 7
3 IGHV3-9-F 1