perlxml-libxml

Perl with XML::LibXML Dom (Globally Find and Replace XML)


I am new to DOM and XML-LibXML.

This is my sample MathML (XML) file. The input file is named in.xml and the final output should be written to out.xml. I would like to find every <mi>bcde</mi> and replace it with <mtext>pqsd</mtext> globally, storing the result in out.xml. How can I achieve this?

<math xmlns="http://www.w3.org/1998/Math/MathML">
    <mfrac>
         <mi>a</mi>
         <mrow>
            <mi>bcde</mi>
         </mrow>
    </mfrac>
    <msqrt>
        <mi>s</mi>
        <mi>e</mi>
        <mi>f</mi>
    </msqrt>
</math> 


#!/usr/bin/perl
# Load sample1.xml, overwrite selected id attributes, and save the result
# to sample2.xml.
use strict;
use warnings 'all';
use XML::LibXML;

# Replacement values written into the id attributes.
my $mediaIdFrom = "MEDIAID_TEST";
my $VodItemIdFrom = "VODITEM_ID_TEST";

my $filename = 'sample1.xml';
my $out_filename = "sample2.xml";

my $dom = XML::LibXML -> load_xml(location => $filename);

# Each loop variable is an attribute node selected by the XPath; it is
# declared in the loop so it does not leak into file scope.
foreach my $mediaId ($dom->findnodes('/ScheduleProvider/Episode/Media/@id')) {
    $mediaId->setValue("xx " . $mediaIdFrom . " yy");
}

foreach my $vodItemId ($dom->findnodes('/ScheduleProvider/VoidItem/@id')) {
    $vodItemId->setValue($VodItemIdFrom);
}

# Store the modified document in a separate XML file.
# NOTE(review): assumes toFile returns a false value on failure -- confirm
# against the installed XML::LibXML version.
$dom->toFile($out_filename)
    or die "Could not write $out_filename\n";

Solution

  • Your XML has a namespace but your XPath queries don't, see note under findnodes in man XML::LibXML::Node. This code should work:

    #!/usr/bin/perl
    # Demo: replace the <mi> inside <mfrac>/<mrow> of a MathML document with
    # <mtext>pqsd</mtext> and print the resulting document.
    use strict;
    use warnings;
    
    use XML::LibXML;
    use XML::LibXML::XPathContext;
    
    # The sample document places every element in this namespace through
    # its xmlns="..." declaration.
    my $MATHML_NS = 'http://www.w3.org/1998/Math/MathML';
    
    my $dom = XML::LibXML->load_xml(string => <<'END_OF_XML');
    <math xmlns="http://www.w3.org/1998/Math/MathML">
        <mfrac>
             <mi>a</mi>
             <mrow>
                <mi>bcde</mi>
             </mrow>
        </mfrac>
        <msqrt>
            <mi>s</mi>
            <mi>e</mi>
            <mi>f</mi>
        </msqrt>
    </math>
    END_OF_XML
    
    # Unprefixed names in XPath never match namespaced elements, so bind
    # the MathML namespace to a prefix and use it in every query.
    my $xpc = XML::LibXML::XPathContext->new();
    $xpc->registerNs('math', $MATHML_NS);
    
    foreach my $node ($xpc->findnodes('/math:math/math:mfrac/math:mrow/math:mi', $dom)) {
        # Create the replacement in the MathML namespace. An element made
        # with XML::LibXML::Element->new('mtext') has no namespace, so a
        # later math:mtext query on this DOM would not find it.
        my $newNode = $dom->createElementNS($MATHML_NS, 'mtext');
        $newNode->appendText('pqsd');
    
        $node->replaceNode($newNode);
    }
    
    print $dom->toString();
    

    Output:

    $ perl dummy.pl
    <?xml version="1.0"?>
    <math xmlns="http://www.w3.org/1998/Math/MathML">
        <mfrac>
             <mi>a</mi>
             <mrow>
                 <mtext>pqsd</mtext>
             </mrow>
        </mfrac>
        <msqrt>
            <mi>s</mi>
            <mi>e</mi>
            <mi>f</mi>
        </msqrt>
    </math>
    

    EDIT Maybe I have misunderstood your question and you want to replace all occurrences of <mi>bcde</mi>? Then the foreach would change to

    foreach my $node ($xpc->findnodes('//math:mi[text()="bcde"]', $dom)) {
    

    EDIT 2: To find several different <mi>xyz</mi> elements and replace each of them, you could pass text=replacement pairs as command line parameters, i.e.

    # Each command line argument has the form text=replacement: every <mi>
    # whose text equals "text" is replaced by <mtext>replacement</mtext>.
    # Arguments that do not have that form are skipped.
    foreach my $argv (@ARGV) {
        my ($find, $replace) = ($argv =~ /^([^=]+)=(.*)$/)
            or next;
    
        foreach my $node ($xpc->findnodes('//math:mi', $dom)) {
            # Compare in Perl instead of interpolating $find into the XPath
            # expression: a double quote in the search text would otherwise
            # terminate the XPath string literal. Like text()="...", this
            # matches when any text child of the element equals $find.
            next
                unless grep { $_->nodeValue eq $find } $node->findnodes('text()');
    
            # Create the replacement in the namespace of the element it
            # replaces; XML::LibXML::Element->new would give it none.
            my $newNode = $dom->createElementNS($node->namespaceURI, 'mtext');
            $newNode->appendText($replace);
    
            $node->replaceNode($newNode);
        }
    }
    

    and your replacement example would be

    $ perl dummy.pl bcde=pqsd
    

    EDIT 3 replace all <mi>xxx</mi> where xxx has more than one character with mtext:

    # Replace every <mi> whose trimmed text is longer than one character
    # with an <mtext> holding that trimmed text.
    foreach my $node ($xpc->findnodes('//math:mi', $dom)) {
        my $text = $node->textContent();
    
        # strip surrounding white space from text
        $text =~ s/^\s+//;
        $text =~ s/\s+$//;
    
        # single-character identifiers stay as "mi"
        next
            unless length($text) > 1;
    
        # Create the replacement in the namespace of the element it
        # replaces; XML::LibXML::Element->new would give it none, so
        # later math:mtext queries on this DOM would not find it.
        my $newNode = $dom->createElementNS($node->namespaceURI, 'mtext');
        $newNode->appendText($text);
    
        $node->replaceNode($newNode);
    }