I am attempting to create an ANTLR grammar for an HL7-derived language. HL7 has a feature whereby all the delimiters in a message are defined by the first few bytes of the input itself. For example: MSH|^~\&
specifies the various delimiters, in order: the field separator |, the component separator ^, the repetition separator ~, the escape character \, and the subcomponent separator &.
Can an ANTLR grammar be produced that does not hardcode these tokens?
As hinted by Kaby76 in the comments: yes, it is possible with some predicate voodoo:
lexer grammar HL7Lexer;
@members {
  // The five HL7 encoding characters. They are not known up front: they are
  // copied out of the matched MSH header text by setEncodingChars.
  private char fieldSeparator;
  private char componentSeparator;
  private char repetitionSeparator;
  private char escapeSeparator;
  private char subcomponentSeparator;

  // Flipped to true by setEncodingChars once the fields above hold real values.
  private boolean separatorsInitialised = false;

  /**
   * Stores the delimiters found at indexes 3..7 of the given text.
   *
   * @param chars the text matched by the MSH rule: the literal "MSH" followed
   *              by the five encoding characters
   */
  private void setEncodingChars(String chars) {
    this.fieldSeparator = chars.charAt(3);
    this.componentSeparator = chars.charAt(4);
    this.repetitionSeparator = chars.charAt(5);
    this.escapeSeparator = chars.charAt(6);
    this.subcomponentSeparator = chars.charAt(7);
    this.separatorsInitialised = true;
  }

  /**
   * Tells whether the next input symbol must not be consumed as plain text.
   *
   * @return true while the separators are still unknown, or when the next
   *         symbol equals one of the five stored separators; false otherwise
   */
  private boolean isEncodingCharAhead() {
    if (!this.separatorsInitialised) {
      return true;
    }
    // Keep the lookahead as an int. Narrowing it to char would drop the high
    // bits of a supplementary code point (so U+1007C would compare equal to
    // '|') and would turn the EOF marker (-1) into '\uFFFF'. The separator
    // rules compare the raw int as well, so both sides now agree.
    int next = this._input.LA(1);
    return next == this.fieldSeparator
        || next == this.componentSeparator
        || next == this.repetitionSeparator
        || next == this.escapeSeparator
        || next == this.subcomponentSeparator;
  }
}
// Header token: the literal 'MSH' followed by the next five characters,
// whatever they are. The action passes the whole matched text to
// setEncodingChars, which reads the delimiters from it.
MSH
: 'MSH' . . . . . {this.setEncodingChars(getText());}
;
// A single character, accepted only when the predicate sees that the next
// input symbol equals the stored fieldSeparator.
FIELD_SEP
: {this._input.LA(1) == this.fieldSeparator}? .
;
// A single character, accepted only when the predicate sees that the next
// input symbol equals the stored componentSeparator.
COMPONENT_SEP
: {this._input.LA(1) == this.componentSeparator}? .
;
// A single character, accepted only when the predicate sees that the next
// input symbol equals the stored repetitionSeparator.
REPETITION_SEP
: {this._input.LA(1) == this.repetitionSeparator}? .
;
// A single character, accepted only when the predicate sees that the next
// input symbol equals the stored escapeSeparator.
ESCAPE_SEP
: {this._input.LA(1) == this.escapeSeparator}? .
;
// A single character, accepted only when the predicate sees that the next
// input symbol equals the stored subcomponentSeparator.
SUBCOMPONENT_SEP
: {this._input.LA(1) == this.subcomponentSeparator}? .
;
// Plain text: one or more characters. The predicate is re-evaluated before
// every character, so the run ends at the first symbol for which
// isEncodingCharAhead() returns true. Because that method returns true until
// the separators have been initialised, this rule cannot match before then.
OTHER
: ( {!this.isEncodingCharAhead()}? . )+
;
When this lexer grammar is tested with the input MSH|^~\&|ADT1|GOOD HEALTH HOSPITAL|GHH LAB, INC.|GOOD HEALTH HOSPITAL|198808181126|SECURITY|ADT^A01^ADT_A01|MSG00001|P|2.8|| using the following code:
// Sample HL7 header segment; its first eight characters declare the delimiters.
String message = "MSH|^~\\&|ADT1|GOOD HEALTH HOSPITAL|GHH LAB, INC.|GOOD HEALTH HOSPITAL|198808181126|SECURITY|ADT^A01^ADT_A01|MSG00001|P|2.8||";

// Tokenise the whole input eagerly so every token is available for printing.
CommonTokenStream tokens = new CommonTokenStream(new HL7Lexer(CharStreams.fromString(message)));
tokens.fill();

// One line per token: symbolic type name, then the text with newlines made visible.
tokens.getTokens().forEach(token -> {
    String typeName = HL7Lexer.VOCABULARY.getSymbolicName(token.getType());
    String shownText = token.getText().replace("\n", "\\n");
    System.out.printf("%-20s '%s'\n", typeName, shownText);
});
the following tokens are created:
MSH 'MSH|^~\&'
FIELD_SEP '|'
OTHER 'ADT1'
FIELD_SEP '|'
OTHER 'GOOD HEALTH HOSPITAL'
FIELD_SEP '|'
OTHER 'GHH LAB, INC.'
FIELD_SEP '|'
OTHER 'GOOD HEALTH HOSPITAL'
FIELD_SEP '|'
OTHER '198808181126'
FIELD_SEP '|'
OTHER 'SECURITY'
FIELD_SEP '|'
OTHER 'ADT'
COMPONENT_SEP '^'
OTHER 'A01'
COMPONENT_SEP '^'
OTHER 'ADT_A01'
FIELD_SEP '|'
OTHER 'MSG00001'
FIELD_SEP '|'
OTHER 'P'
FIELD_SEP '|'
OTHER '2.8'
FIELD_SEP '|'
FIELD_SEP '|'
EOF '<EOF>'