php regex mediawiki mediawiki-extensions

Troubleshooting Regular Expressions in MediaWiki Extension Not Working


The regex feature of the onParserBeforePreprocess function doesn't work in the extension I'm building, and I don't know why.

Let me elaborate on the issue with the onParserBeforePreprocess function not working.

extension.json:

{
    "name": "EnhanceMarkup",
    "description": "Provides enhanced markup functionalities",
    "version": "1.0",
    "author": [
        "Jeong Gaon"
    ],
    "url": "https://www.gaon.xyz/mw_extensions",
    "type": "other",
    "license-name": "Apache-2.0",
    "AutoloadClasses": {
        "EnhanceMarkupHooks": "includes/EnhanceMarkupHooks.php"
    },
    "ResourceModules": {
        "ext.EnhanceMarkup.styles": {
            "styles": "resources/ext.EnhanceMarkup.styles.css",
            "localBasePath": "",
            "remoteExtPath": "EnhanceMarkup"
        },
        "ext.EnhanceMarkup.scripts": {
            "scripts": ["resources/ext.EnhanceMarkup.scripts.js", "resources/lib/math.js"],
            "localBasePath": "",
            "remoteExtPath": "EnhanceMarkup"
        }
    },
    "Hooks": {
        "InternalParseBeforeLinks": "EnhanceMarkupHooks::onInternalParseBeforeLinks",
        "ParserFirstCallInit": "EnhanceMarkupHooks::onParserFirstCallInit",
        "BeforePageDisplay": "EnhanceMarkupHooks::onBeforePageDisplay"
    },
    "manifest_version": 2
}

includes/EnhanceMarkupHooks.php:

<?php
/**
 * Hook handlers for the EnhanceMarkup extension.
 *
 * Adds a handful of bracket-style shortcuts to wikitext ([br], [pagecount],
 * [* footnote], [include Page], {{{+N text}}} / {{{-N text}}}) and a <random>
 * tag that outputs one of two alternatives.
 */
class EnhanceMarkupHooks
{
    /**
     * BeforePageDisplay handler: queues the extension's style and script modules.
     *
     * Objects are taken by value; a handler declared this way can be invoked
     * with either plain values or references.
     *
     * @param OutputPage $out  Output object the modules are added to.
     * @param Skin       $skin Active skin (unused).
     * @return bool Always true.
     */
    public static function onBeforePageDisplay(OutputPage $out, Skin $skin)
    {
        $out->addModules("ext.EnhanceMarkup.styles");
        $out->addModules("ext.EnhanceMarkup.scripts");
        return true;
    }

    /**
     * ParserFirstCallInit handler: registers the <random> tag.
     *
     * @param Parser $parser Parser the tag hook is registered on.
     * @return bool Always true.
     */
    public static function onParserFirstCallInit(Parser $parser)
    {
        $parser->setHook("random", [self::class, "randomRender"]);

        return true;
    }

    /**
     * InternalParseBeforeLinks handler: rewrites the custom markup in $text.
     *
     * NOTE(review): this hook presumably runs after the preprocessor, so the
     * <ref> tags and {{...}} calls emitted below may not be expanded any more
     * at this stage. If they show up literally on the page, the rewriting
     * probably has to move to an earlier hook such as ParserBeforePreprocess
     * (TODO confirm against the MediaWiki hook documentation).
     *
     * @param Parser $parser Parser instance (unused).
     * @param string $text   Wikitext being parsed; modified in place.
     * @return bool Always true.
     */
    public static function onInternalParseBeforeLinks(Parser $parser, &$text)
    {
        // preg_* returns null on a PCRE error; "?? $text" keeps the previous
        // text in that case instead of blanking the page.

        // A line made up solely of 3-9 hyphens becomes a horizontal rule.
        $text = preg_replace('/^-{3,9}$/m', '<hr>', $text) ?? $text;

        // [pagecount] -> number of rows in the page table. The query runs at
        // most once per call, and only when the marker is actually present.
        $pageCount = null;
        $text = preg_replace_callback(
            '/\[pagecount\]/',
            static function () use (&$pageCount) {
                if ($pageCount === null) {
                    $dbr = wfGetDB(DB_REPLICA);
                    $pageCount = (string)$dbr->selectRowCount("page");
                }
                return $pageCount;
            },
            $text
        ) ?? $text;

        // Footnotes. The group name must follow "[*" directly, otherwise
        // "[* some text]" would be mistaken for a reference in group "some".
        // Order matters: the grouped forms are tried before the plain form.

        // [*A text] -> <ref group="A">text</ref>
        $text = preg_replace(
            '/\[\*([\p{L}\p{N}_]+)\s+([^\]]+)\]/u',
            '<ref group="$1">$2</ref>',
            $text
        ) ?? $text;

        // [*A] -> <ref group="A" />
        $text = preg_replace(
            '/\[\*([\p{L}\p{N}_]+)\]/u',
            '<ref group="$1" />',
            $text
        ) ?? $text;

        // [* text] -> <ref>text</ref>
        $text = preg_replace('/\[\*\s+([^\]]+)\]/', '<ref>$1</ref>', $text) ?? $text;

        // [include text] -> {{text}}
        $text = preg_replace('/\[include\s+(.*?)\]/', '{{$1}}', $text) ?? $text;

        // [br] -> <br>
        $text = str_replace("[br]", "<br>", $text);

        // {{{+N content}}} with N in 1..5 -> font size (1 + N)em
        $text = preg_replace_callback(
            '/\{\{\{\+([1-5])\s*(.*?)\s*\}\}\}/s',
            static function ($matches) {
                $size = 1 + (int)$matches[1];
                return '<span style="font-size:' . $size . 'em;">' . $matches[2] . '</span>';
            },
            $text
        ) ?? $text;

        // {{{-N content}}} with N in 1..5 -> font size (1 - N/10)em
        $text = preg_replace_callback(
            '/\{\{\{-([1-5])\s*(.*?)\s*\}\}\}/s',
            static function ($matches) {
                $size = 1 - (int)$matches[1] / 10;
                return '<span style="font-size:' . $size . 'em;">' . $matches[2] . '</span>';
            },
            $text
        ) ?? $text;

        return true;
    }

    /**
     * Tag hook for <random range="50">First|Second</random>.
     *
     * Draws a number in 1..range and returns the first alternative when the
     * draw falls in the lower half of the range, otherwise the second one
     * (or the first again when no second alternative was given).
     *
     * @param string|null $input  Text between the tags; null for a self-closing tag.
     * @param array       $args   Tag attributes; "range" defaults to 2.
     * @param Parser      $parser Parser instance.
     * @param PPFrame     $frame  Current preprocessor frame.
     * @return string Output for the chosen alternative.
     */
    public static function randomRender(
        $input,
        array $args,
        Parser $parser,
        PPFrame $frame
    ) {
        // The result differs per view, so the parser cache must not keep it.
        $parser->getOutput()->updateCacheExpiry(0);

        $parts = explode("|", (string)$input);

        // A missing, non-numeric or non-positive range falls back to the
        // default instead of making mt_rand() fail.
        $range = (int)($args["range"] ?? 2);
        if ($range < 1) {
            $range = 2;
        }

        $randomNumber = mt_rand(1, $range);

        if ($randomNumber <= $range / 2) {
            $chosen = $parts[0];
        } else {
            $chosen = $parts[1] ?? $parts[0];
        }

        // A tag hook's return value is used as HTML, so the user-supplied text
        // goes through the parser rather than being returned raw.
        return $parser->recursiveTagParse($chosen, $frame);
    }
}

Looking at the code, there doesn't seem to be anything particularly wrong with it. If it worked as intended, typing something like [* texts] in the wiki would generate a footnote containing "texts", but for some reason the markup is output literally.

For example, if you type 'hello[br]world', you should see "world" on the line below "hello", but nothing happens.

My MediaWiki site address is https://www.gaonwiki.com

Let me know if you need any more information. I'll provide it. Thank you.


Solution

  • A) To match your references described by [*A Text], I would correct the pattern like this:

    /\[\*(?<group>\w+)\s+(?<text>[^\]]+)\]/

    The idea is to use named capturing groups with (?<group_name>...pattern...) and also to be a bit more precise with \w+ to match word characters, then \s+ for one or several spaces and then any char which isn't the closing bracket with [^\]]+.

    The replacement becomes <ref group="$group">$text</ref>

    Here are some tests of it: https://regex101.com/r/vueNcM/2

    B) Step 2, to match only [*A], I would use /\[\*(?<group>\w+)\]/ and replace it with <ref group="$group" />

    Here are the tests too: https://regex101.com/r/gYFOzO/2

    C) Step 3, to replace [* text] with <ref>text</ref>, I would first use /\[\*\s+(?<text>[^\]]+)\]/ and replace it with <ref>$text</ref>.

    Tests available here: https://regex101.com/r/aYTOH9/1

    But if you want to allow escaped brackets in the text (in case the user needs to have some brackets in the text), then use /\[\*\s+(?<text>(?:\\\]|[^\]])+)\]/

    Tests: https://regex101.com/r/aYTOH9/2

    For this situation, you'll have to do a preg_replace_callback() instead of a simple preg_replace() because we have to unescape the brackets:

    // Rewrite [* text] as <ref>text</ref>, accepting "\[" and "\]" inside the text.
    $text = preg_replace_callback(
        '/\[\*\s+(?<text>(?:\\\\\]|[^\]])+)\]/',
        function ($found) {
            // Strip the backslash in front of every escaped bracket.
            $unescaped = preg_replace('/\\\\([\[\]])/', '$1', $found['text']);

            return sprintf('<ref>%s</ref>', $unescaped);
        },
        $text
    );
    

    Test the PHP here: https://onlinephp.io/c/2b5249

    Security concerns when creating filters

    What happens if the user inputs this?

    Shit happens with [* <script>alert('I got you')</script>]
    

    Will there be another filter to avoid XSS attacks?

    If it's not safely escaped, then replace all your preg_replace() calls by a preg_replace_callback() like in example C) above and do the sanitizing operations on the captured values:

    // Replace [* Some text] with <ref>Some text</ref>.
    // Escaped brackets in the text are supported, e.g. [* An \[important\] reference].
    $text = preg_replace_callback(
        // The pattern is a single-quoted PHP string: a literal backslash is
        // written "\\" there, and PCRE in turn needs "\\" to match one
        // backslash, hence the four backslashes below. This hurts readability;
        // in JavaScript the same pattern would simply read:
        // /\[\*\s+(?<text>(?:\\\]|[^\]])+)\]/
        '/\[\*\s+(?<text>(?:\\\\\]|[^\]])+)\]/',
        function ($matches) {
            // 1) Unescape "\[" and "\]" to "[" and "]" respectively.
            $unescaped = preg_replace('/\\\\([\[\]])/', '$1', $matches['text']);

            // 2) As we are creating HTML, the text must be escaped: it may
            // contain markup such as <strong>Bold</strong> or, worse, JavaScript
            // like <script>alert('XSS attack')</script>. The flags are spelled
            // out because the defaults of htmlspecialchars() differ between
            // PHP versions (single quotes are only escaped by default since 8.1).
            return '<ref>' .
                htmlspecialchars($unescaped, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8') .
                '</ref>';
        },
        $text
    );
    

    PHP code in action here: https://onlinephp.io/c/8a7f8