My ultimate goal is to get a tidy table with a neat (clean) frame.
Here is an image that captures the original PDF page. (Sorry for the language; I'm living in Korea now.)
When I use the 'extract_tables' function from the R package 'tabulizer', the outcome looks like this:
library(tabulizer)

# Build the full path with file.path() instead of calling setwd(), so the
# session's working directory is left untouched.
pdf_dir <- "C:/Users/user/Desktop/신규펀드/폴더/20200420/투자설명서"
pdf_file <- file.path(
  pdf_dir,
  "K55101D20742_투자설명서_한국투자미국배당귀족증권자투자신탁UH(주식)_(20200322).pdf"
)

# Extract the tables found on page 47 using the "stream" method.
cle <- extract_tables(
  pdf_file,
  pages = 47,
  method = "stream",
  encoding = "UTF-8"
)
cle
As you can see, the frame is not neat or clean: there are a lot of "" entries, and the function builds the frame line by line, so every row is split. For example, rows [11,], [12,] and [13,] should be just one row, as in the PDF file, containing (수수료선취-오프라인(A), 0.3500, 0.5100, 0.0300, 0.0100, 0.9000, -, 0.9000, 1.8500, 0.9000). However, in the R output it is split into three rows: [11,], [12,] and [13,].
So I want to solve this problem and make the output exactly match the PDF file.
If you know how to do this, please leave a reply to help me.
Another problem, which is not shown in the captured images, is the following:
When the package misreads the page, it merges columns or rows. For example, the first and second numbers in the table, 0.3500 and 0.5100, should be separate; however, sometimes they are merged into a single column as 0.35000.5100.
How can I solve it?
Thank you for reading.
++
# Dump `cle` in a copy-pasteable form; the output that follows is a list of
# three character matrices (10 x 4, 6 x 2 and 19 x 10).
dput(cle)
list(structure(c("수수료미징구-온라인-개인연금 (C-Pe)", "수수료미징구-오프라인-퇴직연금 (C-R)",
"수수료미징구-온라인-퇴직연금 (C-Re)", "수수료미징구-오프라인,직판-랩 (C-W)",
"", "수수료후취-온라인슈퍼 (S)", "", "수수료미징구-온라인슈퍼-개인연금 (S-P)",
"수수료미징구-온라인슈퍼-퇴직연금 (S-R)", "부과기준", "없음",
"없음", "없음", "없음", "", "없음", "", "없음", "없음", "매입시",
"없음", "없음", "없음", "없음", "3 년 미만 환매시", "환매금액의 0.15%",
"이내", "없음", "없음", "환매시", "없음", "없음", "없음", "없음",
"", "없음", "", "없음", "없음", "환매시"), .Dim = c(10L, 4L)),
structure(c("※ 이 투자신탁이 투자하는 모투자신탁의 수수료 현황",
"■ 한국투자 미국배당귀족 증권 모투자신탁(USD)(주식)", "구 분 부과비율 (또는 부과금액)",
"선취판매수수료 -", "후취판매수수료 -", "환매수수료 -", "",
"", "부과시기", "매입시", "환매시", "환매시"), .Dim = c(6L,
2L)), structure(c("", "", "", "", "구분", "", "", "", "",
"", "수수료선취", "-오프라인", "(A)", "수수료선취", "-온라인",
"(A-e)", "수수료선취", "", "-오프라인-", "", "", "집합",
"", "투자", "", "업자", "", "보수", "", "", "0.3500", "",
"", "0.3500", "", "", "0.3500", "", "", "", "", "판매", "",
"회사", "", "보수", "", "", "", "0.5100", "", "", "0.2550",
"", "", "0.3570", "", "", "", "", "신탁", "", "업자", "",
"보수", "", "", "", "0.0300", "", "", "0.0300", "", "", "0.0300",
"", "", "일반", "", "사무", "", "관리", "", "회사", "", "보수",
"", "0.0100", "", "", "0.0100", "", "", "0.0100", "", "부과비율 (연간, %)",
"", "", "", "기타", "총보수", "비용", "", "", "", "", "0.9000 -",
"", "", "0.6450 -", "", "", "0.7470 -", "", "", "", "", "",
"총보수", "", "<U+30FB>비용", "", "", "", "", "0.9000", "",
"", "0.6450", "", "", "0.7470", "", "", "", "(동종", "",
"유형총", "", "보수<U+30FB>", "", "비용)", "", "", "1.8500",
"", "", "1.3900", "", "", "-", "", "", "", "", "합성총",
"", "보수<U+30FB>", "", "비용", "", "", "", "0.9000", "",
"", "0.6450", "", "", "0.7470", "", "", "", "", "증권", "",
"거래", "", "비용", "", "", "", "-", "", "", "-", "", "",
"-", ""), .Dim = c(19L, 10L)))
>
Update:
library(assertr)

# Rebuild the third extracted table. The extraction split each logical PDF
# row across several matrix rows, so every group of physical rows is glued
# back together column by column.
df <- as.data.frame(cle[[3]])

# Physical rows that make up each logical row; the first group (rows 2-10)
# holds the header text. Row 1 is left out of the groups. The list names
# become the row names of the combined matrix.
row_groups <- list(r1 = 2:10, r2 = 11:13, r3 = 14:16, r4 = 17:19)

# t() puts one original column per row, so col_concat() pastes together the
# fragments of each cell within a group.
merged <- lapply(row_groups, function(rows) col_concat(t(df[rows, ])))
newdf <- do.call(rbind, merged)

# Promote the concatenated header row to column names, then drop it.
colnames(newdf) <- newdf[1, ]
newdf <- newdf[-1, ]
View(newdf)
Since the values of columns 6 and 7 are merged in `cle`, the output table has one column fewer than the table in the PDF.
I manually inserted column 7 with the code below:
library(assertr)

# stringsAsFactors = FALSE keeps the cells as plain character strings on
# R < 4.0 as well, so individual cells can be reassigned without factor-level
# issues.
df <- as.data.frame(cle[[3]], stringsAsFactors = FALSE)

# The extraction merged two PDF columns into column 6: append an empty
# column, move it to position 7, and renumber the names V1..V11.
df$V6.1 <- rep("", nrow(df))
df <- df[c(1:6, 11, 7:10)]
colnames(df) <- paste0("V", seq_along(df))

# Move the header fragment in row 7 from column 6 into the new column 7.
df$V7[7] <- df$V6[7]
df$V6[7] <- ""

# Physical rows that make up each logical row; the first group (rows 2-10)
# holds the header text. Row 1 is left out of the groups. The list names
# become the row names of the combined matrix.
row_groups <- list(r1 = 2:10, r2 = 11:13, r3 = 14:16, r4 = 17:19)

# t() puts one original column per row, so col_concat() pastes together the
# fragments of each cell within a group.
merged <- lapply(row_groups, function(rows) col_concat(t(df[rows, ])))
newdf <- do.call(rbind, merged)

# Promote the concatenated header row to column names, then drop it.
colnames(newdf) <- newdf[1, ]
newdf <- newdf[-1, ]
newdf <- as.data.frame(newdf)

# The inserted column has no extracted values, so fill every row with "-".
newdf[7] <- "-"
View(newdf)
This resulted in the table below:
I think you can work around this.