uimaruta

Using Ruta, get the data on the line following an annotated keyword


How can I get the data located below/above an annotated keyword, i.e. on another line? I am able to annotate the keyword, but I am not able to extract the information that belongs to it.

Sample text:

Underwriter's Name    Appraiser's Name          Appraisal Company Name
Alice Wheaton Cooper  Bruce Banner               Stark Industries

Code

// Asker's attempt: annotate the "Underwriter's Name" keyword and then try to
// mark the name on the following line.
TYPESYSTEM utils.PlainTextTypeSystem;
ENGINE utils.PlainTextAnnotator;

// Run the imported engine so that Line annotations are available to the rules.
EXEC(PlainTextAnnotator, {Line});
// Temporarily make whitespace visible so that TRIM can strip it from the
// borders of each Line, then hide it again.
ADDRETAINTYPE(WS);
Line{->TRIM(WS)};
REMOVERETAINTYPE(WS);
// Hide SPECIAL tokens from the following rules.
// NOTE(review): presumably this is what lets "Underwriter's Name" be matched
// below as CW SW CW without the apostrophe in between - confirm.
Document{->FILTERTYPE(SPECIAL)};

DECLARE UnderWriterKeyword, NameKeyword, UnderWriterNameKeyword;
// NOTE(review): the solution's declarations spell the string feature type as
// STRING; verify that "String" is accepted here.
DECLARE UnderWriterName(String label, String value);

// Mark the two parts of the keyword separately, then the combined keyword.
CW{REGEXP("\\bUnderwriter") -> UnderWriterKeyword};
CW{REGEXP("Name")->NameKeyword};
(UnderWriterKeyword SW NameKeyword){->UnderWriterNameKeyword};
// Make SPACE visible for the sequence rule below.
ADDRETAINTYPE(SPACE);
// For a Line containing the keyword that is followed by another Line, apply
// the inlined rule to that second Line: every sequence of capitalized words,
// each followed by a SPACE, is marked as UnderWriterName.
// MARK only creates the annotation; the label and value features declared
// above are not assigned anywhere in this script.
Line{CONTAINS(UnderWriterNameKeyword)} Line -> {
    (CW SPACE)+ {-> MARK(UnderWriterName)};
    };
// NOTE(review): this statement has no terminating ';' in the posted code.
REMOVERETAINTYPE(SPACE)

Expected Output :

Underwriter's Name: Alice Wheaton Cooper    
Appraiser's Name: Bruce Banner
Appraisal Company Name: Stark Industries

Is this possible in Ruta? If so, how can I get the data?


Solution

  • TYPESYSTEM utils.PlainTextTypeSystem;
    ENGINE utils.PlainTextAnnotator;
    // The type system and engine imported above are used further down
    // (EXEC) to obtain Line and Paragraph annotations.
    
    // Layout types: Header marks the header line, ColumnDelimiter a run of
    // spaces between two columns, Cell one table cell together with the
    // index of its column.
    DECLARE Header;
    DECLARE ColumnDelimiter;
    DECLARE Cell(INT column);
    
    // Keyword is the common parent of all header keywords; its label feature
    // is filled by the dictionary rules below.
    DECLARE Keyword (STRING label);
    DECLARE Keyword UnderWriterNameKeyword, AppraiserNameLicenseKeyword,
    AppraisalCompanyNameKeyword;
    
    // Dictionary rules: each string literal is matched against the document
    // text and annotated with the given keyword type, setting its label.
    // Each rule is kept on a single line so that the label literal does not
    // contain a line break and indentation.
    "Underwriter's Name" -> UnderWriterNameKeyword ( "label" = "UnderWriter Name");
    // NOTE(review): the sample text in the question shows the header
    // "Appraiser's Name" without "/License"; this literal will not match that
    // header as shown - confirm against the real input.
    "Appraiser's Name/License" -> AppraiserNameLicenseKeyword ( "label" = "Appraiser Name");
    "Appraisal Company Name" -> AppraisalCompanyNameKeyword ( "label" = "Appraisal Company Name");
    
    // Entry is created later for each value cell; its keyword feature refers
    // to the header keyword of the same column.
    DECLARE Entry(Keyword keyword);
    
    // Run the imported engine so that Line and Paragraph annotations are
    // available to the rules below.
    EXEC(PlainTextAnnotator, {Line,Paragraph});
    
    // Make whitespace visible: TRIM strips it from the borders of lines and
    // paragraphs, and the next two rules match on SPACE tokens directly.
    ADDRETAINTYPE(WS);
    Line{->TRIM(WS)};
    Paragraph{->TRIM(WS)};
    
    // A run of 3 to 100 SPACE tokens that is not yet part of a
    // ColumnDelimiter becomes a ColumnDelimiter.
    // NOTE(review): the sample row in the question shows only two spaces
    // between "Alice Wheaton Cooper" and "Bruce Banner"; with a minimum of 3
    // these would end up in one Cell - confirm against the real input.
    SPACE[3,100]{-PARTOF(ColumnDelimiter) -> ColumnDelimiter};
    // Within each Line, every run of tokens that belongs neither to an
    // existing Cell nor to a ColumnDelimiter becomes a Cell. Whitespace is
    // still visible here, so single spaces between words stay inside a Cell.
    Line -> {ANY+{-PARTOF(Cell),-PARTOF(ColumnDelimiter) -> Cell};};
    // Hide whitespace again for the remaining rules.
    REMOVERETAINTYPE(WS);
    
    // Column counter; it is reset for every line processed by the block.
    INT columnIndex = 0;
    // Visit the document line by line.
    BLOCK(structure) Line{}{
        // Start numbering from 0 on each line.
        ASSIGN(columnIndex, 0);
        // A line that starts a paragraph is marked as the header row.
        Line{STARTSWITH(Paragraph) -> Header};
        // Store the current counter in each cell of the line, then advance it.
        cell:Cell{-> cell.column = columnIndex, columnIndex = columnIndex + 1};
    }
    
    // Creates an Entry for every value cell and links it to the header
    // keyword of its column.
    // - Matching starts at the Cell element (start anchor @), labeled c,
    //   which must not be part of a Header.
    // - The wildcard # bridges whatever lies between that cell and the Header.
    // - Header only matches if its inlined condition rule succeeds: the
    //   header contains a cell hc with the same column index as c, and hc
    //   contains a Keyword, labeled k.
    // - On success an Entry e is created on the cell and k is stored in its
    //   keyword feature.
    Header<-{hc:Cell{hc.column == c.column}<-{k:Keyword;};}
        # c:@Cell{-PARTOF(Header) -> e:Entry, e.keyword = k};
    
    // Result types: one Entity subtype per column, each carrying the label
    // of the header keyword and the text of the value cell.
    DECLARE Entity (STRING label, STRING value);
    DECLARE Entity UnderWriterName, AppraiserNameLicense, AppraisalCompanyName;
    
    // For every Entry, create the Entity subtype that corresponds to the type
    // of the keyword stored in the entry. The inlined condition rule checks
    // the keyword type and binds it to kw; the action copies kw.label and the
    // covered text (ct) of the entry into the new annotation.
    FOREACH(item) Entry{}{
        item{ -> CREATE(UnderWriterName, "label" = kw.label, "value" = item.ct)}
            <-{kw:item.keyword{PARTOF(UnderWriterNameKeyword)};};
        item{ -> CREATE(AppraiserNameLicense, "label" = kw.label, "value" = item.ct)}
            <-{kw:item.keyword{PARTOF(AppraiserNameLicenseKeyword)};};
        item{ -> CREATE(AppraisalCompanyName, "label" = kw.label, "value" = item.ct)}
            <-{kw:item.keyword{PARTOF(AppraisalCompanyNameKeyword)};};
    }
    

    The most important rule is the following:

    Header<-{hc:Cell{hc.column == c.column}<-{k:Keyword;};}
        # c:@Cell{-PARTOF(Header) -> e:Entry, e.keyword = k};
    

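    It contains three rule elements, Header, # and Cell, and works this way:

    1. Because of the start anchor @, matching begins at the Cell rule element. It matches each Cell annotation that is not part of a Header and labels it c.
    2. The wildcard # bridges everything between that cell and the Header, so it does not matter what lies in between.
    3. The Header rule element only matches if its inlined condition rule succeeds: the header must contain a cell hc with the same column index as c, and that cell must contain a Keyword annotation, which is labeled k.
    4. If all rule elements match, the actions of the Cell rule element create an Entry annotation e on the cell and assign k to its keyword feature.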

    As a summary, the rule creates an Entry annotation for each Cell annotation that is not part of the header and assigns the header keyword of the corresponding column in order to define the type of the entry.