php laravel phpword

Convert a docx file to HTML without HTML, HEAD and BODY tags using PhpWord


Hi, I am using PhpWord to convert a docx file to an HTML file. This works well; however, the output contains HTML, HEAD and BODY tags. How do I convert a file without these tags, keeping just the body content?

    // Load the DOCX from the public directory into a PhpWord document.
    $document = \PhpOffice\PhpWord\IOFactory::load(public_path('test.docx'));

    // Render it to HTML; per the question, the result still carries the
    // HTML, HEAD and BODY tags around the actual content.
    $content = (new \PhpOffice\PhpWord\Writer\HTML($document))->getContent();

Solution

  • You can use PHP's DOMDocument class to parse the generated HTML and then extract only the contents of the body element.

    use PhpOffice\PhpWord\IOFactory;
    use PhpOffice\PhpWord\Writer\HTML;

    // Load the DOCX file
    $fileTmp = public_path('test.docx');
    $phpWord = IOFactory::load($fileTmp);

    // Generate the HTML content (a full page: HTML, HEAD and BODY tags included)
    $htmlWriter = new HTML($phpWord);
    $htmlContent = $htmlWriter->getContent();

    // Parse the HTML content using DOMDocument.
    // - libxml reports markup it does not like as PHP warnings; collect them
    //   internally instead, and restore the caller's setting afterwards.
    // - No LIBXML_HTML_NOIMPLIED here: the input is already a full page, and
    //   that flag would stop libxml from supplying a <body> if one were missing.
    // - \DOMDocument is fully qualified so the snippet also works when pasted
    //   into a namespaced file (e.g. a Laravel controller).
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new \DOMDocument();
    $dom->loadHTML($htmlContent);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    // Get the body content without the HTML, HEAD, and BODY tags
    $bodyContent = '';
    $body = $dom->getElementsByTagName('body')->item(0);
    if ($body !== null) {
        // item(0) is null when no <body> was found; in that case the result
        // stays an empty string instead of failing on a null dereference.
        foreach ($body->childNodes as $node) {
            $bodyContent .= $dom->saveHTML($node);
        }
    }

    // Save or use the $bodyContent as needed
    echo $bodyContent;