Is there a way to get flex to match unicode along the lines of
ascSymbol !|#|$|%|&|⋆|+|.|/|<|=|>|?|@|\|^|-|~|:
uniSymbol \p{Symbol}|\p{Other_Symbol}|\p{Punctuation}
symbol ascSymbol|uniSymbol{-}[^|_"',;]
I found http://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html via Flex(lexer) support for unicode but I'd want to be able to something in an automated way.
For example, I'm using cmake and it is configured to generate the lexer/parser on build from the *.l and *.y files. I'd ideally want a work around that didn't require GHC or another Haskell compiler to be installed.
As it turns out getting unicode support in Flex would be a pain unless the Flex source itself adds it. There seems to be some experimental stuff in there for unicode but never made it into a release that I can find.
The Ragel doc is insightful, and comes with built in support for Unicode. I've since found this article which gives an example of how to make Ragel and C++ play nice. Seems to be the better option so going with that.
Hopefully this saves someone else the time it took to figure this out.
EDIT
"Built in support" as stated above is perhaps an exaggeration. It has been easier to get unicode support but it's not just out of the box kind of thing. Using cmake I'm generating a state machine from the derived UCD 7 file. In CMakeLists.txt I do:
#Ruby is required to generate a unicode Ragel machine
FIND_PACKAGE(Ruby REQUIRED)
MESSAGE("Found Ruby ${RUBY_VERSION}")
SET(UNICODE_MACHINE_PATH "${PROJECT_SOURCE_DIR}/src/unicode.rl")
if(NOT EXISTS ${UNICODE_MACHINE_PATH} OR gen_unicode)
MESSAGE("Attempting to generate unicode state machine")
EXECUTE_PROCESS(COMMAND ${RUBY_EXECUTABLE} ${PROJECT_SOURCE_DIR}/unicode2ragel.rb
OUTPUT_FILE ${UNICODE_MACHINE_PATH}
RESULT_VARIABLE RAGEL_UNICODE_GEN_RES)
if(${RAGEL_UNICODE_GEN_RES} EQUAL 0)
MESSAGE("Generaged Ragel Unicode state machine")
else()
MESSAGE(SEND_ERROR "Unable to generate unicode state machine")
endif()
endif()
Then in unicode2ragel.rb (ships with Ragel and modified slightly for UCD 7)
#!/usr/bin/env ruby
#
# This script uses the unicode spec to generate a Ragel state machine
# that recognizes unicode alphanumeric characters. It generates 5
# character classes: uupper, ulower, ualpha, udigit, and ualnum.
# Currently supported encodings are UTF-8 [default] and UCS-4.
#
# Usage: unicode2ragel.rb [options]
# -e, --encoding [ucs4 | utf8] Data encoding
# -h, --help Show this message
#
# This script was originally written as part of the Ferret search
# engine library.
#
# Author: Rakan El-Khalil <rakan@well.com>
require 'optparse'
require 'open-uri'
ENCODINGS = [ :utf8, :ucs4 ]
ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" }
CHART_URL = "http://www.unicode.org/Public/7.0.0/ucd/extracted/DerivedGeneralCategory.txt"#"http://www.unicode.org/Public/7.0.0/ucd/DerivedCoreProperties.txt"
###
# Display vars & default option
TOTAL_WIDTH = 80
RANGE_WIDTH = 23
@encoding = :utf8
###
# Option parsing
cli_opts = OptionParser.new do |opts|
opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o|
@encoding = o.downcase.to_sym
end
opts.on("-h", "--help", "Show this message") do
puts opts
exit
end
end
cli_opts.parse(ARGV)
unless ENCODINGS.member? @encoding
puts "Invalid encoding: #{@encoding}"
puts cli_opts
exit
end
##
# Downloads the document at url and yields every alpha line's hex
# range and description.
def each_alpha( url, property )
open( url ) do |file|
file.each_line do |line|
next if line =~ /^#/;
next if line !~ /; #{property} #/;
range, description = line.split(/;/)
range.strip!
description.gsub!(/.*#/, '').strip!
if range =~ /\.\./
start, stop = range.split '..'
else start = stop = range
end
yield start.hex .. stop.hex, description
end
end
end
###
# Formats to hex at minimum width
def to_hex( n )
r = "%0X" % n
r = "0#{r}" unless (r.length % 2).zero?
r
end
###
# UCS4 is just a straight hex conversion of the unicode codepoint.
def to_ucs4( range )
rangestr = "0x" + to_hex(range.begin)
rangestr << "..0x" + to_hex(range.end) if range.begin != range.end
[ rangestr ]
end
##
# 0x00 - 0x7f -> 0zzzzzzz[7]
# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6]
# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6]
# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6]
UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff]
def to_utf8_enc( n )
r = 0
if n <= 0x7f
r = n
elsif n <= 0x7ff
y = 0xc0 | (n >> 6)
z = 0x80 | (n & 0x3f)
r = y << 8 | z
elsif n <= 0xffff
x = 0xe0 | (n >> 12)
y = 0x80 | (n >> 6) & 0x3f
z = 0x80 | n & 0x3f
r = x << 16 | y << 8 | z
elsif n <= 0x10ffff
w = 0xf0 | (n >> 18)
x = 0x80 | (n >> 12) & 0x3f
y = 0x80 | (n >> 6) & 0x3f
z = 0x80 | n & 0x3f
r = w << 24 | x << 16 | y << 8 | z
end
to_hex(r)
end
def from_utf8_enc( n )
n = n.hex
r = 0
if n <= 0x7f
r = n
elsif n <= 0xdfff
y = (n >> 8) & 0x1f
z = n & 0x3f
r = y << 6 | z
elsif n <= 0xefffff
x = (n >> 16) & 0x0f
y = (n >> 8) & 0x3f
z = n & 0x3f
r = x << 10 | y << 6 | z
elsif n <= 0xf7ffffff
w = (n >> 24) & 0x07
x = (n >> 16) & 0x3f
y = (n >> 8) & 0x3f
z = n & 0x3f
r = w << 18 | x << 12 | y << 6 | z
end
r
end
###
# Given a range, splits it up into ranges that can be continuously
# encoded into utf8. Eg: 0x00 .. 0xff => [0x00..0x7f, 0x80..0xff]
# This is not strictly needed since the current [5.1] unicode standard
# doesn't have ranges that straddle utf8 boundaries. This is included
# for completeness as there is no telling if that will ever change.
def utf8_ranges( range )
ranges = []
UTF8_BOUNDARIES.each do |max|
if range.begin <= max
return ranges << range if range.end <= max
ranges << range.begin .. max
range = (max + 1) .. range.end
end
end
ranges
end
def build_range( start, stop )
size = start.size/2
left = size - 1
return [""] if size < 1
a = start[0..1]
b = stop[0..1]
###
# Shared prefix
if a == b
return build_range(start[2..-1], stop[2..-1]).map do |elt|
"0x#{a} " + elt
end
end
###
# Unshared prefix, end of run
return ["0x#{a}..0x#{b} "] if left.zero?
###
# Unshared prefix, not end of run
# Range can be 0x123456..0x56789A
# Which is equivalent to:
# 0x123456 .. 0x12FFFF
# 0x130000 .. 0x55FFFF
# 0x560000 .. 0x56789A
ret = []
ret << build_range(start, a + "FF" * left)
###
# Only generate middle range if need be.
if a.hex+1 != b.hex
max = to_hex(b.hex - 1)
max = "FF" if b == "FF"
ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left
end
###
# Don't generate last range if it is covered by first range
ret << build_range(b + "00" * left, stop) unless b == "FF"
ret.flatten!
end
def to_utf8( range )
utf8_ranges( range ).map do |r|
build_range to_utf8_enc(r.begin), to_utf8_enc(r.end)
end.flatten!
end
##
# Perform a 3-way comparison of the number of codepoints advertised by
# the unicode spec for the given range, the originally parsed range,
# and the resulting utf8 encoded range.
def count_codepoints( code )
code.split(' ').inject(1) do |acc, elt|
if elt =~ /0x(.+)\.\.0x(.+)/
if @encoding == :utf8
acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1)
else
acc * ($2.hex - $1.hex + 1)
end
else
acc
end
end
end
def is_valid?( range, desc, codes )
spec_count = 1
spec_count = $1.to_i if desc =~ /\[(\d+)\]/
range_count = range.end - range.begin + 1
sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) }
sum == spec_count and sum == range_count
end
##
# Generate the state maching to stdout
def generate_machine( name, property )
pipe = " "
puts " #{name} = "
each_alpha( CHART_URL, property ) do |range, desc|
codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range)
raise "Invalid encoding of range #{range}: #{codes.inspect}" unless
is_valid? range, desc, codes
range_width = codes.map { |a| a.size }.max
range_width = RANGE_WIDTH if range_width < RANGE_WIDTH
desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11
desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH
if desc.size > desc_width
desc = desc[0..desc_width - 4] + "..."
end
codes.each_with_index do |r, idx|
desc = "" unless idx.zero?
code = "%-#{range_width}s" % r
puts " #{pipe} #{code} ##{desc}"
pipe = "|"
end
end
puts " ;"
puts ""
end
puts <<EOF
# The following Ragel file was autogenerated from: #{CHART_URL}
#
# It defines ualpha, udigit, ualnum.
#
# To use this, make sure that your alphtype is set to #{ALPHTYPES[@encoding]},
# and that your input is in #{@encoding}.
%%{
machine WChar;
EOF
generate_machine( :uUppercaseLetter, "Lu" )
generate_machine( :uLowercaseLetter, "Ll" )
generate_machine( :uTitlecaseLetter, "Lt" )
generate_machine( :uModifierLetter, "Lm" )
generate_machine( :uOtherLetter, "Lo" )
generate_machine( :uNonspacingMark, "Mn" )
generate_machine( :uEnclosingMark, "Me" )
generate_machine( :uSpacingMark, "Mc" )
generate_machine( :uDecimalNumber, "Nd" )
generate_machine( :uLetterNumber, "Nl" )
generate_machine( :uOtherNumber, "No" )
generate_machine( :uSpaceSeparator, "Zs" )
generate_machine( :uLineSeparator, "Zl" )
generate_machine( :uParagraphSeparator, "Zp" )
generate_machine( :uFormat, "Cf" )
generate_machine( :uPrivateUse, "Co" )
generate_machine( :uSurrogate, "Cs" )
generate_machine( :uDashPunctuation, "Pd" )
generate_machine( :uOpenPunctuation, "Ps" )
generate_machine( :uClosePunctuation, "Pe" )
generate_machine( :uConnectorPunctuation, "Pc" )
generate_machine( :uOtherPunctuation, "Po" )
generate_machine( :uMathSymbol, "Sm" )
generate_machine( :uCurrencySymbol, "Sc" )
generate_machine( :uModifierSymbol, "Sk" )
generate_machine( :uOtherSymbol, "So" )
generate_machine( :uInitialPunctuation, "Pi" )
generate_machine( :uFinalPunctuation, "Pf" )
puts <<EOF
}%%
EOF
Then in your ragel machine files you can include unicode.rl and get access to each of the unicode groups defined e.g uUppercaseLetter and so on...