excelvbams-worddocumenttext-to-column

Fastest way to parse a Word document containing Transcript text into Excel columns


I have multiple word document files, each containing transcript text like below (paragraph marks not shown):

Some Title1              ' <--- Some title ending with paragraph mark
(Apr 3, 2023 - 9:00am)  ' <--- Date - time ending with paragraph mark
(Interviewee: Harry) ' <--- Interviewee name to pick to only add interviewee lines (name and text) to outrow array.
                        ' <--- blank line ending with paragraph mark
(00:00:00 - 00:00:02)   ' <--- timestamp ending with paragraph mark

Harry: Okay, thank you. ' <--- Speaker: Text ending with paragraph mark

(00:00:02 - 00:00:06)
Tom: Hi, Harry, hello. Are you okay?

(00:00:06 - 00:00:09)
Harry: Yeah, I'm good, thank you. How are you doing? Happy Monday to you.

(00:00:09 - 00:00:12)
Tom: It's a nice Monday today, so it's quite bright for a change.


As there are many doc files, I would like to copy the whole content (all paragraphs) from each doc file into an excel sheet Sheet2, appending each content to the last non-blank row. Once done, I would like to use the TextToColumns feature in Excel to split the text into individual columns as shown:

Title DateTime TimeStamp Speaker Text
Some Title1 (Apr 3, 2023 - 9:00am) (00:00:00 - 00:00:02) Harry Okay, thank you.
(00:00:06 - 00:00:09) Harry Yeah, I'm good, thank you. How are you doing? Happy Monday to you.
Some Title2 (Apr 5, 2023 - 19:00pm) (00:00:00 - 00:00:04) Jill I am doing fine.
(00:00:06 - 00:00:12) Jill I'm busy.

...

Currently i am only able to loop and copy paste the docs contents to sheet. Once consolidated in the sheet, I would like to transpose this content into the table as shown above. Also, if there is a way to collect all the doc contents into an array or ado recordset, then transfer array/recordset content directly to sheet in one go, that would speed up code and save some time.

Option Explicit

Sub ParseTranscriptToExcelSheet()

    Application.ScreenUpdating = False
    Application.DisplayAlerts = False
    
    Dim wdApp As Object ' Word application
    Dim wdDoc As Object ' Word document
    Dim tbl As Object   ' Table
    Dim para As Object  ' Paragraph
    Dim row As Integer  ' Row index for the table
    Dim i As Long
    Dim oFileDialog As FileDialog
    Dim vSelectedItem As Variant
    
    ' declare worksheets
    Dim ws1 As Worksheet
    Dim ws2 As Worksheet
    
    Set ws1 = ThisWorkbook.Sheets(1) ' contains button to run code
    Set ws2 = ThisWorkbook.Sheets(2)
    
    ' Add a header row to the worksheet 2
    ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
    
    ' Initialize the row index for the table
    row = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
    
    ' Open the Word document containing the transcript
    On Error Resume Next
    Set wdApp = GetObject(, "Word.Application")
    On Error GoTo 0
        
    If wdApp Is Nothing Then
        Set wdApp = CreateObject("Word.Application")
    End If
    
    With wdApp
        .Visible = False
    End With
            
'    ReDim sContent(1 To 1)
    Set oFileDialog = Application.FileDialog(msoFileDialogFilePicker)
    With oFileDialog
        .Title = "Select Word Files"
        .AllowMultiSelect = True
        .Filters.Add "Word files", "*.doc*", 1
        If .Show = -1 Then
            ws2.Activate
            For Each vSelectedItem In .SelectedItems
                Set wdDoc = wdApp.Documents.Open(vSelectedItem)
                With wdDoc
'                    sContent(UBound(sContent)) = .Content.formattedtext.text
                    .Content.Copy
                    ws2.Cells(row, 1).PasteSpecial (xlPasteValues)
                    Application.CutCopyMode = False
                    DoEvents
                    .Close savechanges:=False
                    row = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
                End With
'                ReDim Preserve sContent(1 To UBound(sContent) + 1) As String
            Next vSelectedItem
'            ReDim Preserve sContent(1 To UBound(sContent) - 1) As String
                
        Else
            MsgBox "No files selected"
        End If
    
    End With
    
    
    wdApp.Quit
    Set wdDoc = Nothing
    Set wdApp = Nothing
    Set ws1 = Nothing
    Set ws2 = Nothing
    
    Application.DisplayAlerts = True
    Application.ScreenUpdating = True
End Sub


Solution

  • Try this out:

    Option Explicit
    
    Sub ParseTranscriptToExcelSheet()
        Dim wdApp As Object ' Word application
        Dim wdDoc As Object ' Word document
        Dim allFiles As Collection, f, txt As String, arr, el, ub As Long
        Dim ws2 As Worksheet, nextRow As Long, x As Long, arr2, ln
        
        Set allFiles = SelectedFiles()
        If allFiles.Count = 0 Then Exit Sub
        
        Set ws2 = ThisWorkbook.Sheets(2)
        ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
        nextRow = ws2.Cells(ws2.Rows.Count, 1).End(xlUp).row + 1
        
        Set wdApp = GetWordApp()
        
        For Each f In allFiles                     'loop over selected files
            Set wdDoc = wdApp.Documents.Open(f)    'open files
            txt = wdDoc.Range.Text                 'read content
            wdDoc.Close False
            arr = Split(txt, vbCr)                 'get array of lines/paras
            ub = UBound(arr)
            If ub > 0 Then
                ws2.Cells(nextRow, "A").Value = arr(0)  'fill the "header" info
                ws2.Cells(nextRow, "B").Value = arr(1)
                For x = 2 To UBound(arr)                'process rest of lines
                    ln = Trim(arr(x))
                    If ln Like "(*)" Then                       'timestamp?
                        ws2.Cells(nextRow, "C").Value = ln 
                    ElseIf ln Like "*:*" Then                   'speaker text?           
                        arr2 = Split(ln, ":", 2)
                        ws2.Cells(nextRow, "D").Value = arr2(0) 'speaker
                        ws2.Cells(nextRow, "E").Value = arr2(1) 'content
                        nextRow = nextRow + 1
                    End If
                Next x
            End If
            nextRow = nextRow + 1
        Next f
    End Sub
    
    'return a Collection of user-selected Word files
    Function SelectedFiles() As Collection
        Dim f
        Set SelectedFiles = New Collection
        With Application.FileDialog(msoFileDialogFilePicker)
            .Title = "Select one or more Word Files"
            .AllowMultiSelect = True
            .Filters.Add "Word files", "*.doc*", 1
            If .Show = -1 Then
                For Each f In .SelectedItems
                    SelectedFiles.Add f
                Next f
            End If
        End With
    End Function
    
    'Get a running Word instance, or start a new instance
    Function GetWordApp() As Object
        On Error Resume Next
        Set GetWordApp = GetObject(, "Word.Application")
        On Error GoTo 0
        If GetWordApp Is Nothing Then
            Set GetWordApp = CreateObject("Word.Application") 'assuming this works ok...
        End If
        GetWordApp.Visible = True
    End Function
    

    EDIT: here's a version which populates an array and writes to the Excel sheet at the end

    Sub ParseTranscriptToExcelSheet_Array()
        Dim wdApp As Object ' Word application
        Dim wdDoc As Object ' Word document
        Dim allFiles As Collection, f, txt As String, arr, el, ub As Long
        Dim ws2 As Worksheet, x As Long, arr2, ln, arrOut(), outRow As Long
        
        Set allFiles = SelectedFiles()
        If allFiles.Count = 0 Then Exit Sub
        
        Set ws2 = ThisWorkbook.Sheets(2)
        ws2.Cells.ClearContents 'for testing....
        
        ws2.Range("A1:E1").Value = Array("Title", "DateTime", "Timestamp", "Speaker", "Text")
       
        ReDim arrOut(1 To 20000, 1 To 5) 'array for output; guessing at max size....
        outRow = 1
        
        Set wdApp = GetWordApp()
        
        For Each f In allFiles                     'loop over selected files
            Set wdDoc = wdApp.Documents.Open(f)    'open files
            txt = wdDoc.Range.Text                 'read content
            wdDoc.Close False
            arr = Split(txt, vbCr)                 'get array of lines/paras
            ub = UBound(arr)
            If ub > 0 Then
                arrOut(outRow, 1) = arr(0)  'fill the "header" info
                arrOut(outRow, 2) = arr(1)
                For x = 2 To UBound(arr)            'process rest of lines
                    ln = Trim(arr(x))
                    If ln Like "(*)" Then           'timestamp?
                        arrOut(outRow, 3) = ln
                    ElseIf ln Like "*:*" Then       'speaker text?
                        arr2 = Split(ln, ":", 2)
                        arrOut(outRow, 4) = arr2(0) 'speaker
                        arrOut(outRow, 5) = arr2(1) 'content
                        outRow = outRow + 1
                    End If
                Next x
            End If
            outRow = outRow + 1
        Next f
        
        'any content to write?
        If outRow > 1 Then ws2.Cells(ws2.Rows.Count, 1).End(xlUp). _
               Offset(1).Resize(outRow, 5).Value = arrOut
        
    End Sub