r pdf jupyter-notebook pandoc ipynb

How to extract text from pdf files and convert to Jupyter notebook format (.ipynb) in R


I have a number of PDFs in multiple subfolders containing code. I would like to extract the text from these PDFs and convert them into Jupyter notebooks programmatically.

To my knowledge there is no program/package that can convert PDFs to .ipynb format directly. However, the pdftools package has a function pdf_text that can extract text from PDFs and store it as a character vector, and pandoc allows for conversion from markdown to Jupyter notebook format. I therefore chained three steps: extract the PDF text into a character vector, write that vector to a text file using cat(), and convert the text file to ipynb with pandoc through a system call.

Code sample:

# List all PDF files in the working directory and its subfolders.
# The dot in the pattern is escaped so that only a literal ".pdf" extension
# matches; an unescaped "." is a regex wildcard and would also match names
# that merely end in "pdf" (e.g. "notes_pdf").
code.files <- list.files(
    path = ".",
    pattern = "\\.pdf$",
    all.files = TRUE,
    full.names = TRUE,
    recursive = TRUE,
    ignore.case = TRUE
)

# File conversion PDF --> txt --> ipynb
# For each PDF, write the extracted text next to it as <name>.txt and then
# ask pandoc to turn that text file into <name>.ipynb.
lapply(
    code.files,
    function(x) {
        # Derive both output paths once from the PDF path
        txt.file <- xfun::with_ext(x, "txt")
        ipynb.file <- xfun::with_ext(x, "ipynb")
        # Extract text from PDF and save it to the text file
        # (called directly rather than via %>%, so magrittr is not required)
        cat(pdftools::pdf_text(x), file = txt.file)
        # Create system command with the correct file name extensions.
        # Paths are shell-quoted so that folders or file names containing
        # spaces or shell metacharacters do not break the command.
        f <- paste(
            "pandoc -f markdown -t ipynb",
            "-o", shQuote(ipynb.file),
            shQuote(txt.file)
        )
        # Make system call; intern = TRUE captures pandoc's output
        system(f, intern = TRUE)
    }
)

Now, PDF extraction works pretty well; however, the conversion to .ipynb changes the formatting.

Examples of text body in the PDF, and after (1) conversion to .txt and (2) the subsequent conversion of the .txt file to .ipynb:
PDF
.txt
This is a WinBUGS program for the real example in Chapter 7, Section 7.2.1.

Model: Structural Equation Model with dichotomous data
Date Set Names: full1.dat, and XI.dat, where XI.dat are input initial values for xi.
Sample Size: N=837

model{
    for(i in 1:N){
      #measurement equation model
      for(j in 1:P){y[i,j]~dnorm(mu[i,j],psi[j])I(low[z[i,j]+1],high[z[i,j]+1])}
      mu[i,1]<-eta[i]
      mu[i,2]<-lam[1]*eta[i]
      mu[i,3]<-lam[2]*eta[i]
      mu[i,4]<-xi[i,1]
      mu[i,5]<-lam[3]*xi[i,1]
      mu[i,6]<-lam[4]*xi[i,1]
      mu[i,7]<-xi[i,2]
      mu[i,8]<-lam[5]*xi[i,2]
      mu[i,9]<-lam[6]*xi[i,2]
      #structural equation model
      xi[i,1:2]~dmnorm(u[1:2],phi[1:2,1:2])
      eta[i]~dnorm(nu[i],psd)
      nu[i]<-gam[1]*xi[i,1]+gam[2]*xi[i,2]
    } #end of i
   for(j in 1:P){psi[j]<-1.0}
   for(j in 1:2){u[j]<-0.0}
   #priors on loadings and coefficients
   lam[1]~dnorm(3.12,4.0)        lam[2]~dnorm(0.10,4.0)            lam[3]~dnorm(3.32,4.0)
   lam[4]~dnorm(3.10,4.0)        lam[5]~dnorm(4.30,4.0)            lam[6]~dnorm(3.14,4.0)
   var.gam<-4.0*psd
   gam[1]~dnorm(-1.0,var.gam) gam[2]~dnorm(0.86,var.gam)
   #priors on precisions
   psd~dgamma(8.0, 10.0)
   sgd<-1/psd
   phi[1:2,1:2]~dwish(R[1:2,1:2], 8)
   phx[1:2,1:2]<-inverse(phi[1:2,1:2])
} # end of model

Data
list(N=837, P=9, low=c(-2000,0), high=c(0,2000),
    R=structure(
       .Data=c(1.0, 0.0,
                0.0, 1.0),.Dim=c(2,2)),
    z=structure(
      .Data=c(paste the full1.dat here),.Dim=c(837,9)))

Three different Initial values
list(lam=c(0.8,0.8,0.8,0.8,0.8,0.8),gam=c(-1.2,1.0),psd=0.5,
    phi=structure(
        .Data=c(1.0, 0.5,
                  0.5,1.0),.Dim=c(2,2)),
    xi=structure(
        .Data=c(paste the XI.dat here),.Dim=c(837,2)))
 list(lam=c(0.6,0.6,0.6,0.6,0.6,0.6),gam=c(-1.0,0.8),psd=1.0,
    phi=structure(
         .Data=c(1.2, 0.0,
                   0.0,1.2),.Dim=c(2,2)),
     xi=structure(
         .Data=c(paste the XI.dat here),.Dim=c(837,2)))

list(lam=c(1.0,1.0,1.0,1.0,1.0,1.0),gam=c(-1.5,1.2),psd=0.8,
    phi=structure(
         .Data=c(0.8,0.1,
                0.1,0.8),.Dim=c(2,2)),
    xi=structure(
        .Data=c(paste the XI.dat here),.Dim=c(837,2)))
.ipynb
This is a WinBUGS program for the real example in Chapter 7, Section
7.2.1.

Model: Structural Equation Model with dichotomous data Date Set Names:
full1.dat, and XI.dat, where XI.dat are input initial values for xi.
Sample Size: N=837

model{ for(i in 1:N){ #measurement equation model for(j in
1:P){y\[i,j\]~dnorm(mu\[i,j\],psi\[j\])I(low\[z\[i,j\]+1\],high\[z\[i,j\]+1\])}
mu\[i,1\]\<-eta\[i\] mu\[i,2\]\<-lam\[1\]*eta\[i\]
mu\[i,3\]\<-lam\[2\]*eta\[i\] mu\[i,4\]\<-xi\[i,1\]
mu\[i,5\]\<-lam\[3\]*xi\[i,1\] mu\[i,6\]\<-lam\[4\]*xi\[i,1\]
mu\[i,7\]\<-xi\[i,2\] mu\[i,8\]\<-lam\[5\]*xi\[i,2\]
mu\[i,9\]\<-lam\[6\]*xi\[i,2\] #structural equation model
xi\[i,1:2\]~dmnorm(u\[1:2\],phi\[1:2,1:2\]) eta\[i\]~dnorm(nu\[i\],psd)
nu\[i\]\<-gam\[1\]*xi\[i,1\]+gam\[2\]*xi\[i,2\] } #end of i for(j in
1:P){psi\[j\]\<-1.0} for(j in 1:2){u\[j\]\<-0.0} #priors on loadings and
coefficients lam\[1\]~dnorm(3.12,4.0) lam\[2\]~dnorm(0.10,4.0)
lam\[3\]~dnorm(3.32,4.0) lam\[4\]~dnorm(3.10,4.0)
lam\[5\]~dnorm(4.30,4.0) lam\[6\]~dnorm(3.14,4.0) var.gam\<-4.0\*psd
gam\[1\]~dnorm(-1.0,var.gam) gam\[2\]~dnorm(0.86,var.gam) #priors on
precisions psd~dgamma(8.0, 10.0) sgd\<-1/psd
phi\[1:2,1:2\]~dwish(R\[1:2,1:2\], 8)
phx\[1:2,1:2\]\<-inverse(phi\[1:2,1:2\]) } \# end of model

Data list(N=837, P=9, low=c(-2000,0), high=c(0,2000), R=structure(
.Data=c(1.0, 0.0, 0.0, 1.0),.Dim=c(2,2)), z=structure( .Data=c(paste the
full1.dat here),.Dim=c(837,9)))

Three different Initial values
list(lam=c(0.8,0.8,0.8,0.8,0.8,0.8),gam=c(-1.2,1.0),psd=0.5,
phi=structure( .Data=c(1.0, 0.5, 0.5,1.0),.Dim=c(2,2)), xi=structure(
.Data=c(paste the XI.dat here),.Dim=c(837,2)))
list(lam=c(0.6,0.6,0.6,0.6,0.6,0.6),gam=c(-1.0,0.8),psd=1.0,
phi=structure( .Data=c(1.2, 0.0, 0.0,1.2),.Dim=c(2,2)), xi=structure(
.Data=c(paste the XI.dat here),.Dim=c(837,2)))

list(lam=c(1.0,1.0,1.0,1.0,1.0,1.0),gam=c(-1.5,1.2),psd=0.8,
phi=structure( .Data=c(0.8,0.1, 0.1,0.8),.Dim=c(2,2)), xi=structure(
.Data=c(paste the XI.dat here),.Dim=c(837,2)))

The conversion to .ipynb adds the entire text body as a single markdown cell, and the process adds backslashes in front of brackets and other markdown special characters (e.g. \[, \<, \*), while some line breaks are lost.

I would very much like to retain the formatting of the PDF file after conversion to Jupyter notebook. The formatting in the converted txt is close enough, but I would like to avoid the added backslashes and lost line breaks that appear when converting to ipynb.

Does anyone have a solution for this?


Solution

  • A simple alternative to Pandoc is notedown, which converts markdown to ipynb. Text converted with notedown does not appear to get the undesired backslashes, and line breaks are preserved correctly.

    Installation:
    pip install notedown
    

    (Arch Linux users may, alternatively, install python-notedown from AUR.)

    Usage:
    notedown input.md > output.ipynb
    
    Updated code:

    It's then a simple matter of updating the conversion function:

    # File conversion PDF --> txt --> ipynb, using notedown instead of pandoc
    lapply(
        code.files,
        function(x) {
            # Derive both output paths once from the PDF path
            txt.file <- xfun::with_ext(x, "txt")
            ipynb.file <- xfun::with_ext(x, "ipynb")
            # Extract text from PDF and save it to the text file
            # (called directly rather than via %>%, so magrittr is not required)
            cat(pdftools::pdf_text(x), file = txt.file)

            # Previous pandoc-based command, kept for comparison:
            # f <- paste( "pandoc -f markdown -t ipynb", "-o", xfun::with_ext(x , "ipynb"), xfun::with_ext(x , "txt") )
            # notedown writes the notebook to stdout, so it is redirected into
            # the .ipynb file. Paths are shell-quoted so that spaces or shell
            # metacharacters in folder/file names do not break the command.
            # NOTE(review): the ">" redirection needs a shell to interpret it;
            # confirm this works on the target platform.
            f <- paste("notedown", shQuote(txt.file), ">", shQuote(ipynb.file))
            system(f, intern = TRUE)
        }
    )
    
    Sample output in generated ipynb file:
    This is a WinBUGS program for the real example in Chapter 7, Section 7.2.1.
    
    Model: Structural Equation Model with dichotomous data
    Date Set Names: full1.dat, and XI.dat, where XI.dat are input initial values for xi.
    Sample Size: N=837
    
    model{
        for(i in 1:N){
          #measurement equation model
          for(j in 1:P){y[i,j]~dnorm(mu[i,j],psi[j])I(low[z[i,j]+1],high[z[i,j]+1])}
          mu[i,1]<-eta[i]
          mu[i,2]<-lam[1]*eta[i]
          mu[i,3]<-lam[2]*eta[i]
          mu[i,4]<-xi[i,1]
          mu[i,5]<-lam[3]*xi[i,1]
          mu[i,6]<-lam[4]*xi[i,1]
          mu[i,7]<-xi[i,2]
          mu[i,8]<-lam[5]*xi[i,2]
          mu[i,9]<-lam[6]*xi[i,2]
          #structural equation model
          xi[i,1:2]~dmnorm(u[1:2],phi[1:2,1:2])
          eta[i]~dnorm(nu[i],psd)
          nu[i]<-gam[1]*xi[i,1]+gam[2]*xi[i,2]
        } #end of i
       for(j in 1:P){psi[j]<-1.0}
       for(j in 1:2){u[j]<-0.0}
       #priors on loadings and coefficients
       lam[1]~dnorm(3.12,4.0)        lam[2]~dnorm(0.10,4.0)            lam[3]~dnorm(3.32,4.0)
       lam[4]~dnorm(3.10,4.0)        lam[5]~dnorm(4.30,4.0)            lam[6]~dnorm(3.14,4.0)
       var.gam<-4.0*psd
       gam[1]~dnorm(-1.0,var.gam) gam[2]~dnorm(0.86,var.gam)
       #priors on precisions
       psd~dgamma(8.0, 10.0)
       sgd<-1/psd
       phi[1:2,1:2]~dwish(R[1:2,1:2], 8)
       phx[1:2,1:2]<-inverse(phi[1:2,1:2])
    } # end of model
    
    Data
    list(N=837, P=9, low=c(-2000,0), high=c(0,2000),
        R=structure(
           .Data=c(1.0, 0.0,
                    0.0, 1.0),.Dim=c(2,2)),
        z=structure(
          .Data=c(paste the full1.dat here),.Dim=c(837,9)))
    
    Three different Initial values
    list(lam=c(0.8,0.8,0.8,0.8,0.8,0.8),gam=c(-1.2,1.0),psd=0.5,
        phi=structure(
            .Data=c(1.0, 0.5,
                      0.5,1.0),.Dim=c(2,2)),
        xi=structure(
            .Data=c(paste the XI.dat here),.Dim=c(837,2)))
     list(lam=c(0.6,0.6,0.6,0.6,0.6,0.6),gam=c(-1.0,0.8),psd=1.0,
        phi=structure(
             .Data=c(1.2, 0.0,
                       0.0,1.2),.Dim=c(2,2)),
         xi=structure(
             .Data=c(paste the XI.dat here),.Dim=c(837,2)))
    
    list(lam=c(1.0,1.0,1.0,1.0,1.0,1.0),gam=c(-1.5,1.2),psd=0.8,
        phi=structure(
             .Data=c(0.8,0.1,
                    0.1,0.8),.Dim=c(2,2)),
        xi=structure(
            .Data=c(paste the XI.dat here),.Dim=c(837,2)))