Tags: php, regex, replace, localization, accent-sensitive

Keep accented characters while highlighting text (wrapping in <span> tags)


I am using the following code to search and highlight accented text. The problem I am facing is that it removes the accents from the text while highlighting. Is there any way to keep the accents?

// Demo call: the haystack contains accented letters, the keywords do not.
echo highlightTerm("Would you like a café, Mister Kàpêk?", "kape caf");

/**
 * Highlights each space-separated keyword found in the text by wrapping it in a <span>.
 *
 * Note: the text that is searched AND returned is the accent-stripped copy, so
 * the accents of the original input do not appear in the output.
 *
 * @param string $text    Text to search.
 * @param string $keyword Space-separated search words.
 * @return string|null Result of preg_replace() on the accent-stripped text.
 */
function highlightTerm($text, $keyword) {
    // Decompose to NFD (base letter + combining marks), then convert to
    // ISO-8859-1 with //IGNORE, which drops characters that cannot be represented.
    // This is the step that loses the accents, because $text itself is overwritten.
    $text = iconv('utf-8', 'ISO-8859-1//IGNORE', Normalizer::normalize($text, Normalizer::FORM_D));
    $words = explode(" ", $keyword);
    // Escape regex metacharacters in every word and join them as alternatives.
    // array_map passes a single argument, so no delimiter is given to preg_quote.
    $p = implode('|', array_map('preg_quote', $words));
    return preg_replace(
        "/($p)/ui", 
        '<span style="background:yellow;">$1</span>', 
        $text
    );
}

Solution

  • Instead of normalizing the text, you can use the tedious approach of creating a dynamic, accent-agnostic regex pattern and then directly perform replacements on the input string.

    The regex map (based on the second code block of this answer):

    // Maps an unaccented ASCII letter (or digraph/trigraph) to a regex fragment
    // that matches the letter itself plus its accented / Cyrillic / Hebrew
    // look-alikes. Intended for strtr(), which substitutes the longest matching
    // key first, so multi-letter keys such as "sch" take precedence over "s".
    // Every value must be a self-contained, compilable regex fragment.
    define(
        'ACCENT_MAP',
        [
            "A" => "[AАĂǍĄÀÃÁÆÂÅǺĀא]",
            "B" => "[BБבÞ]",
            "C" => "[CĈĆÇЦצĊČץ]",
            "D" => "[DДĎĐדÐ]",
            "E" => "[EÈĘÉËÊЕĒĖĚĔЄƏע]",
            "F" => "[FФƑ]",
            "G" => "[GĞĠĢĜГגҐ]",
            "H" => "[HחĦХĤה]",
            "I" => "[IIÏÎÍÌĮĬIИĨǏיЇĪІ]",
            "J" => "[JЙĴ]",
            "K" => "[KĸכĶКך]",
            "L" => "[LŁĿЛĻĹĽל]",
            "M" => "[MמМם]",
            "N" => "[NÑŃНŅןŊנŉŇ]",
            "O" => "[OØÓÒÔÕОŐŎŌǾǑƠ]",
            "P" => "[PפףП]",
            "Q" => "[Qק]",
            "R" => "[RŔŘŖרР]",
            "S" => "[SŞŚȘŠСŜס]",
            "T" => "[TТȚטŦתŤŢ]",
            "U" => "[UÙÛÚŪУŨƯǓŲŬŮŰǕǛǙǗ]",
            "V" => "[VВו]",
            "Y" => "[YÝЫŶŸ]",
            "Z" => "[ZŹŽŻЗז]",
            "a" => "[aаăǎąàãáæâåǻāא]",
            "b" => "[bбבþ]",
            "c" => "[cĉćçцצċčץ]",
            "d" => "[dдďđדð]",
            "e" => "[eèęéëêеēėěĕєəע]",
            "f" => "[fфƒ]",
            "g" => "[gğġģĝгגґ]",
            "h" => "[hחħхĥה]",
            "i" => "[iiïîíìįĭıиĩǐיїīі]",
            "j" => "[jйĵ]",
            "k" => "[kĸכķкך]",
            "l" => "[lłŀлļĺľל]",
            "m" => "[mמмם]",
            "n" => "[nñńнņןŋנŉň]",
            "o" => "[oøóòôõоőŏōǿǒơ]",
            "p" => "[pפףп]",
            "q" => "[qק]",
            "r" => "[rŕřŗרр]",
            "s" => "[sşśșšсŝס]",
            "t" => "[tтțטŧתťţ]",
            "u" => "[uùûúūуũưǔųŭůűǖǜǚǘ]",
            "v" => "[vвו]",
            "y" => "[yýыŷÿ]",
            "z" => "[zźžżзזſ]",
            // Multi-letter transliterations: match the literal letters or the
            // single character they stand for.
            "ae" => "(?:ae|[ÄǼäæǽ])",
            "ch" => "(?:ch|[Чч])",
            "ij" => "(?:ij|[ĳĲ])",
            "ja" => "(?:ja|[яЯ])",
            "je" => "(?:je|[Ээ])",
            "jo" => "(?:jo|[ёЁ])",
            "ju" => "(?:ju|[юЮ])",
            "oe" => "(?:oe|[œŒöÖ])",
            "sch" => "(?:sch|[щЩ])",
            "sh" => "(?:sh|[шШ])",
            "ss" => "(?:ss|[ß])",
            "ue" => "(?:ue|[Üü])",
            "zh" => "(?:zh|[Жж])"
        ]);
    

    Code: (Demo)

    /**
     * Wraps every accent-insensitive occurrence of the given keywords in a
     * highlighting <span>.
     *
     * The haystack is never normalized, so its accented characters are emitted
     * unchanged; instead each keyword is expanded into an accent-agnostic
     * pattern through ACCENT_MAP.
     *
     * @param string $text    Text to search; returned with matches wrapped.
     * @param string $keyword Space-separated search words.
     * @return string The text with matches wrapped, or the unmodified text when
     *                there is nothing to search for or the pattern fails.
     */
    function highlightTerm($text, $keyword) {
        // Leading, trailing or doubled spaces produce empty words; an empty
        // alternative would match at every offset and scatter empty spans.
        $words = array_filter(
            explode(" ", $keyword),
            static fn($w) => $w !== ''
        );
        if (!$words) {
            return $text;
        }

        $regex = implode(
            '|',
            array_map(
                // Quote first (including the "#" delimiter used below), then
                // swap each letter/digraph for its accent-agnostic fragment.
                static fn($w) => strtr(preg_quote($w, '#'), ACCENT_MAP),
                $words
            )
        );

        $highlighted = preg_replace(
            "#$regex#ui",
            '<span style="background:yellow;">$0</span>',
            $text
        );

        // preg_replace() returns null on a PCRE error; fall back to the input.
        return $highlighted ?? $text;
    }
    
    // Demo: only the letters matched by a keyword get wrapped; the accents in the haystack are kept.
    echo highlightTerm("Would you like a café, Mister Kàpêk?", "kape caf");
    

    Output:

    Would you like a <span style="background:yellow;">caf</span>é, Mister <span style="background:yellow;">Kàpê</span>k?