What's a good implementation for unescaping numeric HTML/XML entities, e.g. `&#10;` or `&#x22;`,
and replacing them with the ASCII equivalent?
Expressed as a unit test:
-- Input mixes decimal (&#39;, &#10;) and hexadecimal (&#x22;) references with
-- an unknown named entity (&ok;) that must pass through unchanged.
local orig = "It&#39;s the &#x22;end&#x22; &ok;&#10;"
local fixd = unescape(orig) -- Implement this
assert( fixd == "It's the \"end\" &ok;\n" )
Here's a simple implementation that also handles the core named XML entities:
--- Unescape numeric character references and the five core named XML
-- entities, using one substitution pass per entity kind.
--
-- Known limitation: `&amp;` is decoded in a separate, final pass, so an
-- ampersand produced by a numeric reference gets decoded a second time
-- (`&#38;amp;` yields `&` instead of `&amp;`).
--
-- @param str the string to unescape
-- @return the string with recognised entities replaced; numeric references
--         whose value is not in 0..255 are left untouched
function unescape(str)
  -- Turn a numeric code into a single byte. Returning nil makes gsub keep the
  -- matched text, which avoids the "value out of range" error string.char
  -- raises for codes above 255.
  local function byte_for(code)
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
    return nil
  end

  str = string.gsub( str, '&lt;', '<' )
  str = string.gsub( str, '&gt;', '>' )
  str = string.gsub( str, '&quot;', '"' )
  str = string.gsub( str, '&apos;', "'" )
  str = string.gsub( str, '&#(%d+);', function(n) return byte_for(tonumber(n)) end )
  -- %x (not %d) so that hexadecimal digits a-f / A-F are accepted.
  str = string.gsub( str, '&#x(%x+);', function(n) return byte_for(tonumber(n, 16)) end )
  str = string.gsub( str, '&amp;', '&' ) -- Be sure to do this after all others
  return str
end
print(unescape("&#34;Hello&quot; &apos;World&#39;")) --> "Hello" 'World'
However, note that this fails for one pathological case: a numeric ampersand entity followed by the text `amp;` (for example, `&#38;amp;`):
-- The numeric reference &#38; decodes to "&", which then joins the following
-- "amp;" and is decoded a second time by the final &amp; pass.
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &
-- The result should actually be: Ampersand entity is &amp;
We can fix this edge case by handling all entities at once, but the code gets a good bit uglier:
--- Unescape numeric character references and the five core named XML
-- entities in a single pass, so text produced by one replacement is never
-- decoded again.
--
-- @param str the string to unescape
-- @return the string with recognised entities replaced; unknown names,
--         malformed references and codes outside 0..255 are left untouched
function unescape(str)
  local map = { lt = "<", gt = ">", amp = "&", quot = '"', apos = "'" }
  -- The replacement function returns nil whenever it cannot decode a match;
  -- gsub then keeps the matched text as it was.
  return (string.gsub( str, '&(#?x?)([%d%a]+);', function(n, s)
    local code
    if n == '' then
      return map[s]
    elseif n == '#' and s:find('^%d+$') then
      -- Digits only: rejects forms like "1e2" that tonumber would accept.
      code = tonumber(s)
    elseif n == '#x' and s:find('^%x+$') then
      code = tonumber(s, 16)
    end
    -- string.char raises an error outside 0..255, so guard the range.
    if code and code >= 0 and code <= 255 then
      return string.char(code)
    end
    return nil
  end ))
end
print(unescape("Ampersand entity is &#38;amp;")) --> Ampersand entity is &amp;
Finally, we can unwrap it for a little more speed:
local gsub, char = string.gsub, string.char
-- The five core named XML entities.
local entityMap = {["lt"]="<",["gt"]=">",["amp"]="&",["quot"]='"',["apos"]="'"}

-- Replacement callback for gsub.
-- orig: the whole matched entity, returned unchanged when it cannot be decoded
-- n:    the prefix after "&" ("" for named, "#" for decimal, "#x" for hex)
-- s:    the entity name or digit string
local entitySwap = function(orig,n,s)
  local code
  if n == '' then
    return entityMap[s] or orig
  elseif n == '#' and s:find('^%d+$') then
    -- Digits only: rejects forms like "1e2" that tonumber would accept.
    code = tonumber(s)
  elseif n == '#x' and s:find('^%x+$') then
    code = tonumber(s,16)
  end
  -- string.char raises an error outside 0..255, so guard the range.
  if code and code >= 0 and code <= 255 then
    return char(code)
  end
  return orig
end

--- Unescape numeric character references and the five core named XML
-- entities in a single pass.
--
-- @param str the string to unescape
-- @return the string with recognised entities replaced; unknown names,
--         malformed references and codes outside 0..255 are left untouched
function unescape(str)
  -- Parentheses drop gsub's second result (the substitution count).
  return (gsub( str, '(&(#?x?)([%d%a]+);)', entitySwap ))
end