Package: src/packages/parsers.fdoc

key file
parsers.flx share/lib/std/strings/parsers.flx
parser_synlib.flx share/lib/std/strings/parser_synlib.flx

Parsing

Chips to providing parsing functions.

//[parsers.flx]
include "std/control/chips";
include "std/strings/recognisers";
include "std/strings/parser_synlib";
class Parsers
{
  open Recognisers;
  open Grammars;

  variant action_t =
  | Reduce of string * int
  | Scroll of int
  | Unscroll of int
  | Pack of int
  | Unpack
  | Drop of int
  | Swap
  | Sequence of list[action_t]
  ;

  instance Str[action_t] {
    fun str: action_t -> string =
    | Reduce (s,n) => "Reduce(" + s + ","+n.str+")"
    | Scroll n => "Scroll " + n.str
    | Unscroll n => "Unscroll " + n.str
    | Pack n => "Pack " + n.str
    | Drop n => "Drop " + n.str
    | Swap => "Swap"
    | Sequence aa =>
      "Seq(" + catmap "," (str of action_t) aa + ")"
    ;
  }

  typedef open_pgram_t[T] =
  (
    | `Action of action_t
    | open_prod_t[T]
  )
  ;

  instance[T with Str[T]] Str[open_pgram_t[T]]
  {
    fun str: open_pgram_t[T] -> string =
    | `Action a => "{" + a.str + "}"
    | open_prod_t[T] :>> r => r.str
    ;
  }


  typedef pgram_t = open_pgram_t[pgram_t];

  typedef open_pgramentry_t[T] = string * open_pgram_t[T];
  typedef open_pgramlib_t[T] = list[open_pgramentry_t[T]];
  typedef open_pgrammar_t[T] = string * open_pgramlib_t[T];

  typedef pgramentry_t = open_pgramentry_t[pgram_t];
  typedef pgramlib_t = open_pgramlib_t[pgram_t];
  typedef pgrammar_t = open_pgrammar_t[pgram_t];

  typedef lexeme = (start:Buffer, finish:Buffer);

  variant stack_node_t =
  | RTerminal of string * lexeme
  | RNonterminal of string * list[stack_node_t]
  ;

  instance Str[stack_node_t] {
    fun str: stack_node_t -> string =
    | RTerminal (s,x) => s+"("+string (x.start,x.finish)+")"
    | RNonterminal (s,xs) =>
      s + "(" + catmap "," (str of stack_node_t) xs + ")"
    ;
  }

  typedef parser_stack_t = list[stack_node_t];

  instance Str[parser_stack_t] {
    fun str (x:parser_stack_t) =>
      catmap "; " (str of stack_node_t) x
    ;
  }

  typedef parser_state_t =
  (
    pos: Buffer,
    stack: parser_stack_t
  );

  instance Str[parser_state_t] {
    fun str (x:parser_state_t) =>
      x.pos.str + ":  " + x.stack.str
    ;
  }

  instance Str[pgramlib_t] {
    fun str (lib: pgramlib_t) : string =
    {
      var s = "";
      match nt,ex in lib do
        s += nt + ":\n";
        s += "  " + ex.str+"\n";
      done
      return s;
    }
  }

  typedef parser_t = BaseChips::iochip_t[parser_state_t,parser_state_t];

  chip ActionShift (label:string) (r: recog_t)
    connector io
      pin inp: %<parser_state_t
      pin out: %>parser_state_t
   {
     // We need to use a secondary chip so that if the recogniser
     // writes no output, this chip will block on it and die
     // without killing off the ActionShift chip.
     chip handler
       connector inner
         pin inp: %<parser_state_t
     {
       var inp = read inner.inp;

       var ri,wi = #mk_ioschannel_pair[Buffer];
       var ro,wo = #mk_ioschannel_pair[Buffer];
       circuit
         wire ri to r.inp
         wire wo to r.out
       endcircuit

       var ipos = inp.pos;
       write (wi, ipos);
       var opos = read ro;
       var entry = RTerminal (label, (start = ipos, finish = opos));
       //println$ "ActionShift " + label + " write " + io.out.address.str;
       write (io.out, (pos = opos, stack = entry ! inp.stack));
     }

     while true do
       var inp = read io.inp;
       var ri,wi = #mk_ioschannel_pair[parser_state_t];
      circuit
         wire wi to handler.inp
       endcircuit
       write (wi, inp);
     done
   }

  chip ActionSecond (label:string) (r1: recog_t) (r2: recog_t)
    connector io
      pin inp: %<parser_state_t
      pin out: %>parser_state_t
   {

     chip handler
       connector inner
         pin inp: %<parser_state_t
     {
       var inp = read inner.inp;

       var ri1,wi1 = #mk_ioschannel_pair[Buffer];
       var ro1,wo1 = #mk_ioschannel_pair[Buffer];
       var ri2,wi2 = #mk_ioschannel_pair[Buffer];
       var ro2,wo2 = #mk_ioschannel_pair[Buffer];
       circuit
         wire ri1 to r1.inp
         wire wo1 to r1.out
         wire ri2 to r2.inp
         wire wo2 to r2.out
       endcircuit

       // whitespace
       var pos1 = inp.pos;
       write (wi1, pos1);
       var pos2 = read ro1;

       // terminal
       write (wi2, pos2);
       var pos3 = read ro2;

       var entry = RTerminal (label, (start = pos2, finish = pos3));
       //println$ "ActionSecond " + label + " write " + io.out.address.str;
       write (io.out, (pos = pos3, stack = entry ! inp.stack));
     }

     while true do
       var inp = read io.inp;
       var ri,wi = #mk_ioschannel_pair[parser_state_t];
       circuit
         wire wi to handler.inp
       endcircuit
       write (wi, inp);
     done
   }

  fun doaction (aux: parser_stack_t,s:parser_stack_t) (a:action_t) =>
    match a with
    | Reduce (label,n) =>
      let revhead,tail = revsplit n s in
      aux,RNonterminal (label,revhead) ! tail

    | Drop n => aux,drop n s

    | Swap => aux,
      match s with
      | e1 ! e2 ! tail => e2 ! e1 ! tail
      | _ => s
      endmatch

    | Scroll n => let s,a = scroll (s,aux) n in a,s
    | Unscroll n => scroll (aux,s) n

    | Pack n =>
      let revhead,tail = revsplit n s in
      aux,RNonterminal ("_Tuple",revhead) ! tail

    | Unpack =>
      match s with
      | RNonterminal (_,ss) ! tail => aux, ss + tail
      | _ => aux,s
      endmatch

    | Sequence actions =>
      fold_left (fun (aux:parser_stack_t,s:parser_stack_t) (a:action_t) =>
        doaction (aux,s) a)
        (aux,s)
        actions

    endmatch
  ;

  fun doaction (s:parser_stack_t) (a:action_t) =>
    let _,s = doaction (Empty[stack_node_t], s) a in
    s
  ;

  chip ActionGeneral (a:action_t)
    connector io
      pin inp: %<parser_state_t
      pin out: %>parser_state_t
  {
    while true do
      var i = read io.inp;
      var pos = i.pos;
      var stack = doaction i.stack a;
      //println$ "ActionGeneral ["+a.str+"] write " + io.out.address.str;
      write (io.out, (pos=pos, stack=stack));
    done
  }

  typedef pntdef_t = string * parser_t;

  fun find (v:varray[pntdef_t]) (nt:string) : size =
  {
    for i in 0uz ..< v.len do
      if v.i.0 == nt return i;
    done
    assert false;
  }

  fun render_pgram
    (lib:pgramlib_t,v:varray[pntdef_t])
    (white:recog_t)
    (p:pgram_t)
  : parser_t =>
    match p with
    | `Terminal (s,r) => ActionSecond s white r
    | `Epsilon => BaseChips::epsilon[parser_state_t]
    | `Seq ps => BaseChips::pipeline_list (
          unbox (map (fun (p:pgram_t) => render_pgram (lib,v) white p) ps))
    | `Alt ps =>  BaseChips::tryall_list (
          unbox (map (fun (p:pgram_t) => render_pgram (lib,v) white p) ps))
    | `Nonterminal nt =>
         let idx : size = find v nt in
         let pslot : &pntdef_t = -(v.stl_begin + idx) in
         let pchip : &parser_t = pslot . 1 in
         BaseChips::deref_each_read pchip
    | `Action a => ActionGeneral a
  ;

  fun open_add_pgram[T]
    (aux: list[string] -> T -> list[string])
    (acc:list[string]) (p: open_pgram_t[T])
  : list[string] =>
    match p with
    | `Action a => acc
    | open_prod_t[T] :>> r => open_add_prod[T] aux acc r
    endmatch
  ;

  fun add_pgram (acc:list[string]) (p:pgram_t) : list[string] =>
    fix open_add_pgram[pgram_t] acc p
  ;

  fun closure (g:pgrammar_t): list[string] =>
    generic_closure[pgram_t] add_pgram g
  ;

  chip make_parser_from_grammar (white:recog_t)
    connector io
      pin inp: %<pgrammar_t
      pin out: %>parser_t
  {

    while true do
      // read in the grammar
      var start, lib = read io.inp;

      // calculate the transitive closure of nonterminals
      // from the start symbol
      var cl = closure (start,lib);

      // allocate a varray with a slot for each nonterminal
      var n = cl.len;
      var v = varray[string * parser_t] n;

      // populate the varray with the terminal names and a dummy chip
      for nt in cl call // initialise array
        push_back (v,(nt,BaseChips::epsilon[parser_state_t]))
      ;

      // now assign the real recognisers to the array
      var index = 0uz;
      for nt in cl do
        match find lib nt with
        | None => assert false;
        | Some prod =>
          // get wrapped parser
          var entry = render_pgram (lib, v) white prod;

          // address of the slot
          var pentry : &parser_t = (-(v.stl_begin+index)).1;

          // overwrite dummy value
          pentry <- entry;
        endmatch;
        ++index;
      done
      write (io.out, (v.(find v start).1));
    done
  }

  gen make_parser_from_grammar (g:pgrammar_t) (white:recog_t) : parser_t =
  {
    var parsr: parser_t;
    var sched = #fibre_scheduler;
    spawn_fthread sched {
      var gri,gwi = mk_ioschannel_pair[pgrammar_t]();
      var gro,gwo = mk_ioschannel_pair[parser_t]();
      spawn_fthread (make_parser_from_grammar white (inp=gri,out=gwo));
      write (gwi, g);
      parsr = read gro;
    };
    sched.run;
    return parsr;
  }

  gen run_parser_on_string (parsr:parser_t) (s:string) : list[parser_state_t] =
  {
    var results = Empty[parser_state_t];
    var b = Buffer s;
    var ps : parser_state_t = (pos=b, stack=Empty[stack_node_t]);
    var sched = #fibre_scheduler;
    spawn_fthread sched {
      var ri,wi = mk_ioschannel_pair[parser_state_t]();
      var ro,wo = mk_ioschannel_pair[parser_state_t]();
      spawn_fthread (parsr (inp=ri, out=wo));
      write (wi,ps);
      while true do
        var result = read ro;
        results = result ! results;
        //println$ "Test1: End pos (should be 14)=" + result.str;
      done
    };
    sched.run;
    return results;
  }

  // replace internal sub-expressions with fresh nonterminals
  fun unpack (fresh:1->string) (head:string, p:pgram_t) : pgramlib_t =
  {
   var out = Empty[pgramentry_t];
   match p with
   | `Action a => out = ([head,p]);
   | `Epsilon => out = ([head,p]);
   | `Terminal _ => out = ([head,(`Seq ([p]):>>pgram_t)]);
   | `Nonterminal s => out= ([head,(`Seq ([p]):>>pgram_t)]);

   | `Seq ps =>
     var newseq = Empty[pgram_t];
     for term in ps do
       match term with
       | `Action _ => newseq = term ! newseq;
       | `Epsilon => ;
       | `Nonterminal _ => newseq = term ! newseq;
       | `Terminal _ => newseq = term ! newseq;
       | _ =>
         var newhead = fresh();
         newseq = (`Nonterminal newhead :>>pgram_t) ! newseq;
         out = unpack fresh (newhead,term);
       endmatch;
     done

     match newseq with
     | Empty => out = (head,(#`Epsilon:>> pgram_t)) ! out;
     | _ => out = (head,(`Seq(unbox (rev newseq)):>>pgram_t)) ! out;
     endmatch;

   | `Alt ps =>
     iter (proc (p:pgram_t) { out = unpack fresh (head,p) + out; }) ps;
   endmatch;
   return out;
  }

  // expand internal sub-expressions, return a list of symbol sequences
  // the outer list are the alternatives and the inner ones sequences
  // IN REVERSE ORDER!
  fun expand_aux (p:pgram_t) : list[list[pgram_t]] =
  {
   var out = ([Empty[pgram_t]]);
   match p with
   // add symbol to each alternative
   | `Epsilon => ;
   | `Action a
   | `Terminal _
   | `Nonterminal s =>
     out = unbox (map (fun (ss: list[pgram_t]) => Cons (p,ss)) out);

   // A sequence is unpacked by successively unpacking each
   // symbol. The result is then prepended to each alternative.
   | `Seq ps =>
     for term in ps do
       var tmp = expand_aux term;
       var out2 = Empty[list[pgram_t]];
       for left in tmp perform
         for right in out perform
           out2 += left + right;
       out = out2;
     done

   | `Alt ps =>
     var alts = cat (unbox (map expand_aux ps));
     out2 = Empty[list[pgram_t]];
     for left in alts perform
       for right in out perform
         out2 += left + right;
     out = out2;

   endmatch;
   return out;
  }

  fun expand (p:pgram_t) : pgram_t =>
    let ps = expand_aux p in
    (`Alt (unbox (map (fun (seqs: list[pgram_t]) => `Seq(unbox (rev seqs)):>>pgram_t) ps))) :>> pgram_t
  ;

  // in p replace nonterminal name with value (where q=name,value)
  fun substitute (q:pgramentry_t) (p:pgram_t)=>
    let name,value = q in
    match p with
    | `Nonterminal s when name == s => value
    | `Seq ls => `Seq (unbox (map (substitute q) ls)) :>> pgram_t
    | `Alt ls => `Alt (unbox (map (substitute q) ls)) :>> pgram_t
    | _ => p
  ;

// direct left recursion eliminator
// assumes A = A alpha | beta form
// outputs
// A = beta A'
// A' = alpha A' | Eps
//
// BETTER
//
// A = beta | beta A'
// A' = alpha A' | alpha
//
// since this is Epsilon free

  fun direct_left_recursion_elimination
   (fresh:1->string)
   (lib:pgramlib_t)
  =
  {
   var outgram = Empty[pgramentry_t];
   for ntdef in lib do
     var nt,expr = ntdef;
     var alphas = Empty[list[pgram_t]];
     var betas = Empty[list[pgram_t]];
  // where does Epsilon go??
     match expr with
     | `Alt alts =>
       for alt in alts do
         match alt with
         | (`Seq (Cons ((`Nonterminal $(nt)),tail))) => alphas = tail ! alphas;
         | (`Seq b) => betas = b ! betas;
         | x => betas = ([x]) ! betas;

         //| x => println$ "EDLR, unexpected alternative " + x.str; assert false;
         endmatch;
       done
     | x => betas = ([x]) ! betas;

     //| x => println$ "EDLR, unexpected expr " + x.str; assert false;
     endmatch;
     if alphas.len == 0uz do
       outgram = (nt,expr) ! outgram;
     else
       var newntname = fresh();
       var newnt = `Nonterminal newntname :>> pgram_t;
       var alts = unbox (map (fun (b:list[pgram_t]) => `Seq (b + newnt):>>pgram_t) betas);
       outgram =  (nt, (`Alt alts :>>pgram_t)) !  outgram ;
       alts = unbox (map (fun (a:list[pgram_t]) => (`Seq (a + newnt):>>pgram_t)) alphas) + (#`Epsilon:>>pgram_t);
       outgram = (newntname, (`Alt alts:>>pgram_t)) ! outgram;
     done
   done
   return outgram;
  }

  gen fresh_sym () : string = {
    var n = 1;
  next:>
    yield "_"+n.str;
    ++n;
    goto next;
  }
  // this needs to be global so the algo can be re-applied to the same
  // grammar library
  var fresh = fresh_sym;

  fun direct_left_recursion_elimination (lib:pgramlib_t) =
  {
    return direct_left_recursion_elimination fresh lib;
  }

  fun make_seq (a:pgram_t) (b:list[pgram_t]) =>
    match a with
    | (`Seq a) => `Seq (a + b) :>> pgram_t
    | _ => `Seq (a ! b) :>> pgram_t
  ;

  // requires one entry per non-terminal, sorted for performance
  // must be in form Alt (Seq (nt, ...)) or Seq (nt, ...) or sym
  // right is the original grammar which i scans thru
  // left is the modified grammar for j = 1 to n -1
  // each recursion advances i one step

  fun left_recursion_elimination_step
    (fresh:1->string)
    (var left:pgramlib_t)
    (var right:pgramlib_t)
  =
  {
     match right with
     | Empty => return left;
     | (rnt,rdfn) ! tail => // A_i
println$ "left_recursion_elimination considering nonterminal A_i=" +rnt;
       var rprods =
         match rdfn with
         | `Alt alts => alts
         | _ => ([rdfn])
       ;

       var toremove = Empty[int];
       var toadd = Empty[pgram_t];
       match lnt,ldfn in left do // A_j = 1 to i - 1
println$ "  left_recursion_elimination considering nonterminal A_j=" +lnt;
         var lprods =
           match ldfn with
           | `Alt alts => alts
           | _ => ([ldfn])
         ;
         var counter = -1;
         for rprod in rprods do // A_i = A_j alpha
println$ "    checking if " + rnt + " = " + rprod.str + " has left corner A_j=" + lnt;
           ++counter;
           match rprod with
           | `Seq ((`Nonterminal s) ! alpha) =>
             if s == lnt do
println$ "      YES: replace";
               toremove = counter ! toremove;
               for beta in lprods perform
                 toadd  = make_seq beta alpha ! toadd;
             else // not of form A_i = A_j alpha
println$ "      NO: keep";
             done
           | `Nonterminal s => // alpha = Epsilon
             if s == lnt do
println$ "      YES: replace";
               toremove = counter ! toremove;
               for beta in lprods perform
                 toadd  = beta ! toadd;
             else
println$ "      NO: keep";
             done
           | _ =>
println$ "      NO: keep";
           endmatch;
         done // all A_i of form A_J alpha
       done
       // strip replaced productions out, add the others
       counter = -1;
       for elt in rprods do
         ++counter;
         if not (counter in toremove) perform
           toadd = elt ! toadd;
       done
       var newa_i = direct_left_recursion_elimination fresh ([rnt, (`Alt toadd :>> pgram_t)]);
       return left_recursion_elimination_step fresh (newa_i + left) tail;
     endmatch;
  }

  fun left_recursion_elimination
    (fresh:1->string)
    (var right:pgramlib_t)
  => left_recursion_elimination_step fresh Empty[pgramentry_t] right;

} // class
//[parser_synlib.flx]
include "std/strings/parsers";

class Parser_synlib
{
  open Parsers;
  open Grammars;
  fun NT (s:string) => `Nonterminal  s :>> pgram_t ;
  fun TERM (s:string, r:Recognisers::recog_t) => `Terminal (s,r) :>> pgram_t;
  fun STR (s:string) => (`Terminal (s, (Recognisers::match_string s)));
  fun REDUCE (s:string, n:int) => `Action (Reduce (s,n)) :>> pgram_t;
  fun BINOP(s:string) => `Action (Sequence ([Swap, Drop 1, (Reduce (s,2))])):>>pgram_t;
  fun SWAP () => `Action (Swap) :>> pgram_t;
  fun DROP (n:int) => `Action (Drop n) :>> pgram_t;
  fun ALT (ls: list[pgram_t]) => `Alt ls :>> pgram_t;
  fun SEQ (ls: list[pgram_t]) => `Seq ls :>> pgram_t;
  fun EPS () => (#`Epsilon) :>> pgram_t;
}