Package: src/packages/recognisers.fdoc

key file
recogniser_base.flx share/lib/std/strings/recogniser_base.flx
recognisers.flx share/lib/std/strings/recognisers.flx

String Matching recogniser_base

Recognisers

A recogniser is a component which tries to match the prefix of a string. If it succeeds it returns the position of the first character not matched.

Buffer type

Recognisers work on an array of chars in memory. We use a Google StringPiece to represent it.

//[recogniser_base.flx]
include "std/control/chips";
class RecogniserBase
{
open BaseChips; //needed for pipe symbol to work

struct Buffer
{
  sp: varray[char];
  pos: int;

  fun atend => self.pos >= self.sp.len.int;

  fun get =>
    if self.atend then char ""
    else (self.sp) . (self.pos)
  ;

  proc next {
    if not self*.atend do
      pre_incr self.pos;
    done
  }

  fun advanced =>
    if self.atend then self
    else Buffer (self.sp, self.pos + 1)
  ;

  fun lookahead (i:int) =>
    if self.pos + i > self.sp.len.int then char ""
    elif self.pos + i < 0 then char ""
    else (self.sp) . (self.pos + i)
  ;

  fun stl_end => Buffer (self.sp,self.sp.len.int);

}


ctor Buffer (p:varray[char]) =>
  Buffer (p,0)
;

ctor Buffer (p:string) =>
  Buffer (p.varray_nonul,0)
;

ctor Buffer (p: &string) =>
  Buffer (*p)
;

instance Str[Buffer] {
  fun str (b:Buffer) => "@"+b.pos.str;
}

// hack, ignore underlying data.. FIXME
instance Eq[Buffer] {
  fun == (a:Buffer, b:Buffer) => a.pos == b.pos;
}
instance Tord[Buffer] {
  fun < (a:Buffer, b:Buffer) => a.pos < b.pos;
}

open Eq[Buffer];
open Tord[Buffer];

ctor string (a:Buffer, b:Buffer) =
{
  var x = "";
  for i in a.pos ..< b.pos do
    x += a.sp.i;
  done
  return x;
}

typedef recog_t = BaseChips::iochip_t[Buffer,Buffer];
// rendering lazy terms to actual recognizer

A string matcher.

//[recogniser_base.flx]
chip match_string (s:string)
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
nextmatch:>
  var b = read io.inp;
  //println$ "Match " + s + " at " + b.str;
  for i in 0..< s.len.int do
    if s.[i] != b.get goto nextmatch;
    b&.next;
  done
  //println$ "Matched " + s + " to " + b.str;
  write (io.out, b);
  goto nextmatch;
}

Whitespace matcher.

Note: never fails.

//[recogniser_base.flx]
chip match_white
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  while true do
    var b = read io.inp;
    while not b.atend and b.get <= char ' ' perform b&.next;
    write (io.out,b);
  done
}

C++ comment matcher

Note: cannot fail.

//[recogniser_base.flx]
chip match_cxx_comment
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
again:>
  var b = read io.inp;
  var b_saved = b;

  if b.get != char "/" goto bad;
  b&.next;

  if b.get != char "/" goto bad;
  b&.next;

  while not b.atend and not (b.get == char "\n")  perform b&.next;
  b&.next; // works fine even if atend
ok:>
  write (io.out,b);
  goto again;
bad:>
  write (io.out,b_saved);
  goto again;
}

Nested C comment matcher

Note: cannot fail.

//[recogniser_base.flx]
chip match_nested_c_comment
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
again:>
  var depth = 0;
  var b = read io.inp;
  var b_saved = b;
  if b.get != char "/" goto bad;
  b&.next;
  if b.get != char "*" goto bad;

nest:>
  b&.next;
  ++depth;

scan:>
  if b.get == "/" do // start nested comment
    b&.next;
    if b.get == "*" goto nest;
    goto scan;
  done

  if b.get == "*" do // end comment group
    b&.next;
    if b.get == "/" goto unnest;
    goto scan;
  done

  b&.next;
  goto scan;

unnest:>
  b&.next;
  --depth;
  if depth > 0 goto scan;
  write (io.out,b);
  goto again;

bad:>
  write (io.out,b_saved);
  goto again;
}

Felix comments

Note: can fail.

//[recogniser_base.flx]

chip match_felix_white
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  var ri,wi= #mk_ioschannel_pair[Buffer];
  var ro,wo= #mk_ioschannel_pair[Buffer];
  device w = BaseChips::pipeline_list ([match_white, match_nested_c_comment, match_cxx_comment]);
  circuit
     wire ri to w.inp
     wire wo to w.out
  endcircuit

again:>
  var start = read io.inp;
more:>
  write (wi, start);
  var fin = read ro;
  if fin != start do
    start = fin;
    goto more;
  done

  write (io.out, fin);
  goto again;
}

regex matcher.

//[recogniser_base.flx]
chip match_regex (r:RE2)
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  while true do
    var b = read io.inp;
//println$ "Match regex " + r.str;
    var matched = varray[StringPiece] (1uz,StringPiece());
    var result = Match(r,StringPiece(b.sp),b.pos,ANCHOR_START,matched.stl_begin,1);
//println$ "Match result " + result.str;
    if result do
//println$ "Matched OK, match len = " + matched.0.len.str;
      var b2 = Buffer (b.sp,b.pos+matched.0.len.int);
//println$ "Writing buffer = " + b2.str;
      write(io.out,b2);
    done
  done
}

Identifier matcher.

For C like identifiers.

//[recogniser_base.flx]
device cident_matcher = match_regex (RE2 "[A-Za-z][A-Za-z0-9_]*");
device flxident_matcher = match_regex (RE2 "[A-Za-z_][A-Za-z0-9_']*");
device texident_matcher = match_regex (RE2 "\\\\[A-Za-z]+");

chip flx_n_ident_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
nextnident:>
  var b = read io.inp;
  if b.get != char "n" goto nextnident;
  b&.next;
  if b.get == char "'" do
    b&.next;
    while not b.atend and b.get != char "'" perform b&.next;
    b&.next;
    write (io.out, b);
  elif b.get == char '"' do
    b&.next;
    while not b.atend and b.get != char '"' perform b&.next;
    b&.next;
    write (io.out, b);
  done
  goto nextnident;
}

chip felix_identifier_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  device x = BaseChips::tryall_list
    ([
      flxident_matcher,
      texident_matcher,
      flx_n_ident_matcher
    ])
  ;
  circuit
    wire io.inp to x.inp
    wire io.out to x.out
  endcircuit
}

Integer matcher.

For plain identifiers.

//[recogniser_base.flx]
device decimal_integer_matcher = match_regex (RE2 "[0-9]+");

Felix integer matcher.

With radix prefix, and allows embedded underscores. Will recognise repeated underscores and trailing underscores even though these are not allowed. I mean, what should we do if we find them?

//[recogniser_base.flx]

chip felix_integer_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
nexttry:>
  var b = read io.inp;
//println$ "Felix integer matcher "+b.str;
  var ch = b.get;
  if ch not in "0123456789" goto bad;

  if ch == char "0" do
    b&.next;
    ch = b.get;
//println$ "felix_integer got leading 0, next char " + ch;
    if ch in "bB" goto nextbinary;
    if ch in "oO" goto nextoctal;
    if ch in "dD0123456789_" goto nextdecimal;
    if ch in "xX" goto nexthex;
//println$ "Bad radix";
    goto bad;
  done
  goto decimal;

nextbinary:>
  b&.next;
binary:>
  ch = b.get;
  if ch in "_01234567" goto nextbinary;
  goto suffix;

nextoctal:>
  b&.next;
octal:>
  ch = b.get;
  if ch in "_01234567" goto nextoctal;
  goto suffix;


nextdecimal:>
  b&.next;
decimal:>
  ch = b.get;
  if ch in "_0123456789" goto nextdecimal;
  goto suffix;

nexthex:>
  b&.next;
hex:>
  ch = b.get;
  if ch in "_0123456789ABCDEFabcdef" goto nexthex;
  goto suffix;

suffix:>
  // 3 char suffix
  if "" + toupper (b.get) + toupper (b.lookahead 1) + toupper (b.lookahead 2) in
    ([
      "I16", "I32","I64",
      "U16", "U32","U64"
    ])
  do
    b&.next;
    b&.next;
    b&.next;

  // 2 char suffix
  elif "" + toupper (b.get) + toupper (b.lookahead 1) in
    ([
      "LL","I8","U8",
      "UT","US","UD","UL","UV","UZ","UJ",
      "TU","SU","DU","LU","VU","ZU","JU"
    ])
  do
    b&.next;
    b&.next;

  // one char suffix
  elif "" + toupper (b.get) in
    ([
      'T', // tiny
      'S', // short
      'I', // int
      'L', // long
      'V', // long long
      "Z", // size
      "J", // intmax
      "P", // intptr
      "D"  // ptrdiff
    ])
  do
    b&.next;
  done
  goto ok;

ok:>
//println$ "Felix integer ok";
  write (io.out,b);
  goto nexttry;

bad:>
//println$ "Felix integer bad";
  goto nexttry;
}

Felix float matcher.

//$ Follows ISO C89, except that we allow underscores; //$ AND we require both leading and trailing digits so that //$ x.0 works for tuple projections and 0.f is a function //$ application

//[recogniser_base.flx]
chip felix_float_literal_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
nexttry:>
  var b = read io.inp;
  var ch = b.get;
  if ch == char "0" do
    b&.next;
    ch = b.get;
//println$ "felix_integer got leading 0, next char " + ch;
    if ch in "dD0123456789_" goto nextdecimal;
    if ch in "xX" goto nexthex;
//println$ "Bad radix";
    goto bad;
  done
  goto decimal;


nextdecimal:>
  b&.next;
decimal:>
  ch = b.get;
  if ch in "_0123456789" goto nextdecimal;
  if b.get != char "." goto bad;
  b&.next;
  if b.get not in "0123456789" goto bad;
  b&.next;

nextdecimalfrac:>
  b&.next;
decimalfrac:>
  ch = b.get;
  if ch in "_0123456789" goto nexthexfrac;
  if ch not in "Ee" goto ok;
  b&.next;
  if b.get == char "-" perform b&.next;
  if b.get not in "0123456789" goto bad;
nextdecexp:>
  b&.next;
  if b.get not in "0123456789" goto suffix;
  goto nextdecexp;

nexthex:>
  b&.next;
hex:>
  ch = b.get;
  if ch in "_0123456789ABCDEFabcdef" goto nexthex;
  if b.get != char "." goto bad;
  b&.next;
  if b.get not in "0123456789ABCDEFabcdef" goto bad;
  b&.next;

nexthexfrac:>
  b&.next;
hexfrac:>
  ch = b.get;
  if ch in "_0123456789ABCDEFabcdef" goto nexthexfrac;
  if ch not in "Pp" goto ok;
  b&.next;
  if b.get == char "-" perform b&.next;
  if b.get not in "0123456789" goto bad;
nexthexexp:>
  b&.next;
  if b.get not in "0123456789" goto suffix;
  goto nexthexexp;

suffix:>
  if b.get in "fFlL" perform b&.next;

ok:>
//println$ "Felix float ok";
  write (io.out,b);
  goto nexttry;

bad:>
//println$ "Felix integer bad";
  goto nexttry;
}

String Literal matcher.

One shot. Simple, matches single or double quoted string not spanning lines, with no escape codes,

//[recogniser_base.flx]
chip match_string_literal
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
restart:>
  var b = read io.inp;
  if b.atend goto restart; // end of data
  var leadin = b.get;
//println$ "string literal matcher got char " + leadin.str;
  if not (leadin in (char '"', char "'")) goto restart;
//println$ "Got valid string start .. ";
  b&.next;
  if b.atend goto restart;
  var ch = b.get;
  while ch != leadin do
    b&.next;
    if b.atend goto restart;
    ch = b.get;
    if ch == char "\n" goto restart; // end of line
  done
  b&.next;
  io.out `(write) b;
  goto restart;
}

chip match_string_literal_backquote
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
restart:>
  var b = read io.inp;
  if b.atend goto restart; // end of data
  var leadin = b.get;
//println$ "string literal matcher got char " + leadin.str;
  if leadin != char '`' goto restart;
//println$ "Got valid string start .. ";
  b&.next;
  if b.atend goto restart;
  var ch = b.get;
  while ch != leadin do
    b&.next;
    if b.atend goto restart;
    ch = b.get;
    if ch == char "\n" goto restart; // end of line
  done
  b&.next;
  io.out `(write) b;
  goto restart;
}

chip felix_string_literal_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
restart:>
  var b = read io.inp;
  var triple = false; // single quoted
  var escape = char ""; // no escape

  // r: raw string, f: function, c: C string
  // add others here

  // check for raw prefix r
  if b.get in "r" do
    if b.lookahead 1 != char '"' goto bad;
    b&.next;
    goto strlit;
  done

  // check for other prefixen
  if b.get in "cf" do
    if b.lookahead 1 != char '"' goto bad;
    b&.next;
  done

  // normal escaping on
  escape = char "\\";

strlit:>
  if b.get not in "'\"" goto bad;
  var first_leadin = b.get;
  b&.next;
  if b.get == first_leadin and b.lookahead 1 == first_leadin do
    triple = true;
    b&.next;
    b&.next;
  done

//println$ "Leadin=" + first_leadin + ", triple=" + triple.str + ", escape=" + escape.str;

eatup:>
//println$ "Eatup " + b.get;

  if b.get == escape goto doescape;
  if not triple and b.get == "\n"  goto bad; // newline in string
  if not triple and b.get == first_leadin do
    b&.next;
    goto ok;
  done

  if triple and
    b.get == first_leadin and
    b.lookahead 1 == first_leadin and
    b.lookahead 2 == first_leadin
  do
    b&.next;
    b&.next;
    b&.next;
    goto ok;
  done

  b&.next;
  goto eatup;


doescape:>
//println$ "Escape";
  b&.next;
  b&.next;
  goto eatup;

ok:>
  write (io.out, b);
  goto restart;

bad:>
  goto restart;
}

End of string matcher

//[recogniser_base.flx]
chip eos_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  while true do
    var x = read io.inp;
    if x.atend perform write (io.out,x);
  done
}

Longest match

//[recogniser_base.flx]
chip longest_match (a: list[recog_t])
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  var x = read io.inp;
  var results = None[Buffer];
  proc storemax[T with Tord [T]] (p: &opt[T]) (a:T) {
    match *p with
    | None => p <- Some a;
    | Some v => if a > v perform p <- Some a;
    endmatch;
  }
  for r in a call
    run (x.value |-> r |-> (storemax &results).procedure)
  ;
  match results with
  | None => ;
  | Some answer => write (io.out, answer);
  endmatch;
}

Match to eos

Equivalent to .* but faster.

//[recogniser_base.flx]
chip toeos_matcher
  connector io
    pin inp: %<Buffer
    pin out: %>Buffer
{
  while true do
    var x = read io.inp;
    write (io.out,x.stl_end);
  done
}
}

Lazy Syntactic form

//[recognisers.flx]
// this is a function, so it cannot construct pipeline
// chips, because they actually spawn the components internally
// and functions can't do service calls.
//
// So instead we just return a function 1->recog_t which does the
// job on invocation.
include "std/strings/recogniser_base";
include "std/strings/grammars";

class Recognisers
{
inherit RecogniserBase;
open BaseChips;

open Grammars;

typedef ntdef_t = string * recog_t;

fun find (v:varray[ntdef_t]) (nt:string) : size =
{
  for i in 0uz ..< v.len do
    if v.i.0 == nt return i;
  done
  assert false;
}


fun render_prod
  (lib:gramlib_t,v:varray[ntdef_t])
  (p:prod_t)
: recog_t =>
  match p with
  | `Terminal (s,r) => r
  | `Epsilon =>  epsilon[Buffer]
  | `Seq ps =>  pipeline_list (
      map (fun (p:prod_t) => render_prod (lib,v) p) ps)
  | `Alt ps =>   tryall_list (
      map (fun (p:prod_t) => render_prod (lib,v) p) ps)
  | `Nonterminal nt =>
    let idx = find v nt in
    let pslot = -(v.stl_begin + idx) in
    let pchip = pslot . 1 in
    BaseChips::deref_first_read pchip
  endmatch
;

fun recogniser
  (start:string, lib:gramlib_t) : recog_t =
{
    var cl = closure (start,lib);

    // allocate a varray with a slot for each nonterminal
    var n = cl.len;
    var v = varray[string * recog_t] n;

    // populate the varray with the terminal names and a dummy chip
    for nt in cl call // initialise array
      push_back (v,(nt,BaseChips::epsilon[Buffer]))
    ;

    // now assign the real recogniser_base to the array
    var index = 0uz;
    for nt in cl do
      match find lib nt with
      | None => assert false;
      | Some prod =>
        // get wrapped recogniser
        var entry = render_prod (lib, v) prod;

        // address of the slot
        var pentry : &recog_t = (-(v.stl_begin+index)).1;

        // overwrite dummy value
        pentry <- entry;
      endmatch;
      ++index;
    done
    return v.(find v start).1;
}

fun in (s:string) (g:grammar_t) =
{
  chip false_if_got (pr: &bool)
     connector io
       pin inp: %<Buffer
  {
    C_hack::ignore$ read io.inp;
    pr <- true;
  }
  var r = recogniser g;
  var result = false;
  run (s.Buffer.value |-> r |-> eos_matcher |-> false_if_got &result);
  return result;
}

} // Recognisers