Package: src/packages/cparse.fdoc
C Parsing Utils¶
key | file |
---|---|
ctok.flx | share/lib/std/c/ctok.flx |
C Tokeniser¶
This is a crude C tokeniser that is intended for parsing header files only, i.e. interfaces.
//[ctok.flx]
// TODO: strings, chars
// NOTE: preprocesor directives start with # and end with newline
// NOTE: trailing slosh is elided, but also ends a token, not really correct
class Ctok
{
fun c_tok(x:string): list[string] =
{
var result = Empty[string];
var token = "";
var n = x.len.int;
var inmacro = false;
var i = 0;
while i < n do
// 3 char tokens
if x.[i.+3] in ("<<=", ">>=","...") do
result -= x.[i.+3];
i += 3;
elif x.[i.+2] == "\\\n" do
i += 2;
// C++ comment
elif x.[i.+2] == "//" do
i += 2;
while i <= n and x.[i] != "\n" do
++i;
done
if inmacro do
result -= "\\n";
inmacro = false;
done
++i;
// C comment
elif x.[i.+2] == "/*" do
i += 2;
while i <= n and x.[i.+2] != "*/" do
++i;
done
i += 2;
elif x.[i] == "\n" do
if inmacro do
result -= "\n";
inmacro = false;
done
++i;
// whitespace
elif ord x.[i] <= 32 do
++i;
// 2 char tokens
elif x.[i.+2] in ("::",
"<<", ">>", "<=", ">=", "==","!=". "&&","||",
"++","--", "+=","==","!=","&=","^=","|=")
do
result -= x.[i.+2];;
i += 2;
elif x.[i] == "#" do
inmacro = true;
result -= "#";
++i;
// 1 char tokens
elif x.[i] in "<>^!+-*/%(){}[],:;&|?=" do
result -= x.[i].string;
++i;
// char
elif x.[i] == "'" do
++i;
token = x.[i].string;
while i < n and x.[i] != "'" do
if x.[i] == "\\" do
token += "\\";
++i;
done
token += x.[i];
++i;
done
result -= token + "'";
++i;
// string
elif x.[i] == '"' do
token = x.[i].string;
++i;
while i < n and x.[i] != '"' do
if x.[i] == "\\" do
token += "\\";
++i;
done
token += x.[i];
++i;
done
result -= token + '"';
++i;
// identifier
elif iscidstart x.[i] do
token = x.[i].string;
++i;
while i <= n and iscidcont x.[i] do
token += string x.[i];
++i;
done
result -= token;
// number
elif x.[i] in "0123456789" do
token = x.[i].string;
++i;
while x.[i] in "0123456789xabcdefXABCDEF.pP+-uUlL" do
token += x.[i].string;
++i;
done
result -= token;
// other
else
result -= x.[i].string;
++i;
done // conditional
done //loop
return rev result;
}
proc dump_tokens_formatted (x:list[string]) {
var nest = 0;
proc newline() { println$ ""; print$ " " * ( 2 * nest); }
println$ "Tokens = ";
for token in tokens do
if token == ";" do
print$ ";"; newline;
elif token == "{" do
newline;
print$ "{";
++nest;
newline;
elif token == "}" do
--nest;
newline;
print$ "}";
newline;
elif token == "#" do
newline;
print$ "#";
elif token == "\n" do
print "#endmacro";
newline;
else
print$ token.str+ " ";
done
done
} // dump
fun iscident(x:string) : bool = {
var result = iscidstart x.[0];
for ch in x.[1..] perform
result &= iscidcont ch;
return result;
}
typedef parameter_t = ctyp_t * string;
typedef field_t = ctyp_t * string;
// either or both the tag string is present of the field list is present
typedef struct_t = string * opt[list[field_t]];
typedef enumconst_t = string * int;
typedef enum_t = string * list[enumconst_t];
variant ctyp_t =
| Base of string
| Ptr of ctyp_t
| ConstPtr of ctyp_t
| Array of ctyp_t * int
| Bitfield of string * int
| Function of ctyp_t * list[parameter_t]
| Struct of struct_t
| Union of struct_t
| Enum of enum_t
;
instance Eq[ctyp_t] {
fun == (a:ctyp_t, b:ctyp_t) =>
match a,b with
| Base x, Base y => x == y
| Ptr x, Ptr y => x == y
| ConstPtr x, ConstPtr y => x == y
| Array (x,l1), Array (y,l2) => x == y and l1 == l2
| Bitfield (x,l1), Bitfield (y,l2) => x == y and l1 == l2
| Struct x, Struct y => x == y
| Union x, Union y => x == y
| Enum x, Enum y => x == y
| _ => false
endmatch
;
}
inherit Eq[ctyp_t];
fun strbody (tag:string, fields:opt[list[field_t]]) : string =>
if tag != "" then tag + " " else "" endif +
match fields with
| Some flds =>
"{ " + fold_left (fun (acc:string) (fld:field_t) => acc + " " + fld.str + ";") "" flds + " }"
| None => ""
endmatch
;
instance Str[ctyp_t] {
fun str : ctyp_t -> string =
| Base b => b
| Ptr x => x.str + "*"
| ConstPtr x => x.str + " const*"
| Array (x,l) => "array(" + x.str + "," + l.str + ")"
| Bitfield (b,l) => b + ":" + l.str
| Function (r,ps) =>
r.str + "(" + catmap "," str of parameter_t ps + ")"
| Struct s => "struct " + strbody s
| Union s => "union " + strbody s
| Enum b => "enum " + b.str
;
}
instance Str[parameter_t] {
fun str (t:ctyp_t,n:string) => t.str + if n!="" then " " else "" endif + n;
}
fun discard_to_rbrace (level:int) (tokens: list[string]) =>
match tokens with
| "{" ! tokens => discard_to_rbrace (level + 1) tokens
| "}" ! tokens => if level > 1 then discard_to_rbrace (level - 1) tokens else tokens
| _ ! tokens => discard_to_rbrace level tokens
| Empty => tokens
endmatch
;
// discard everything up to the next top level }
fun discard_to_rbrace (tokens: list[string]) =>
discard_to_rbrace 0 tokens
;
// discard everything up to the next top level ;
// won't work for nested code because of ; in for statements
fun discard_to_semi (tokens: list[string]) =>
match tokens with
| "{" ! tokens => discard_to_semi (discard_to_rbrace 1 tokens)
| ";" ! tokens => tokens
| _ ! tokens => discard_to_semi tokens
| Empty => tokens
endmatch
;
fun rev_grab_to_rbrace (out:list[string]) (level:int) (tokens: list[string]) =>
match tokens with
| "{" ! tokens => rev_grab_to_rbrace ("{" ! out) (level + 1) tokens
| "}" ! tokens => if level > 1 then rev_grab_to_rbrace ("}" ! out) (level - 1) tokens else out, tokens
| token ! tokens => rev_grab_to_rbrace (token ! out) level tokens
| Empty => out, tokens
endmatch
;
// grab everything up to the next top level }
fun grab_to_rbrace (tokens: list[string]) =>
rev (rev_grab_to_rbrace Empty[string] 0 tokens)
;
// grab everything up to the next top level ;
// won't work for nested code because of ; in for statements
fun rev_grab_to_semi (out:list[string]) (tokens: list[string]) =>
match tokens with
| "{" ! tokens =>
let grabbed, trailing = (rev_grab_to_rbrace out 1 tokens) in
rev_grab_to_semi grabbed trailing
| ";" ! tokens => out,tokens
| _ ! tokens => rev_grab_to_semi out tokens
| Empty => out,tokens
endmatch
;
fun grab_to_semi (tokens: list[string]) =>
rev (rev_grab_to_semi Empty[string] tokens)
;
// ***********************************************
// input a token list
// parse the LHS of a C type, the base type, eg int
// return that and the rest of the tokens
// ***********************************************
fun parse_type_base (tokens:list[string]) : opt[ctyp_t * bool] * list[string] =
{
var isconst = false;
next:>
if is_empty tokens return None[ctyp_t * bool], tokens;
var token = head tokens;
tokens = tail tokens;
if token == "const" do
isconst = true;
goto next;
elif token == "struct" do
// set tag if present
var tag = "";
if tokens.head.iscident do
tag = head tokens;
tokens = tail tokens;
done
// set field list if present
var fields, toks = parse_struct_body tokens;
var ctyp = Struct (tag, fields);
tokens = toks;
elif token == "union" do
tag = "";
if tokens.head.iscident do
tag = head tokens;
tokens = tail tokens;
done
fields, toks = parse_struct_body tokens;
ctyp = Union (tag, fields);
tokens = toks;
elif token == "enum" do
token = head tokens;
tokens = tail tokens;
ctyp = Enum (token, Empty[enumconst_t]);
else
ctyp = Base token; // base type
done
return Some (ctyp, isconst), tokens;
}
// ***********************************************
// input a token list and a base type and const flag
// parse the RHS of a C type, combine with base type
// return that and the rest of the tokens
// ***********************************************
fun parse_tail_type
(
var ctyp:ctyp_t,
isconst:bool,
var tokens:list[string]
)
: ctyp_t * list[string] =
{
next:>
if is_empty tokens return ctyp, tokens;
var token = head tokens;
if token == "*" do
if isconst do
ctyp = ConstPtr (ctyp);
isconst = false;
else
ctyp = Ptr ctyp;
done
elif token == "long" and ctyp == Base "long" do
ctyp = Base "long long";
else
return ctyp, tokens;
done
tokens = tail tokens;
goto next;
}
// ***********************************************
// input a token list
// parse a C type
// return type and trailing tokens
// ***********************************************
fun parse_ctype (var tokens:list[string]) : opt[ctyp_t] * list[string] = {
match parse_type_base tokens with
| Some (cty, isconst),toks =>
var ctyp, trailing = parse_tail_type (cty, isconst, toks);
return Some ctyp, trailing;
| None,toks => return None[ctyp_t],toks;
endmatch;
}
// ***********************************************
// parse array suffix if present
// recursive, assumes length is a single token or missing
// ***********************************************
fun parse_array (ctyp:ctyp_t, tokens: list[string]) : ctyp_t * list[string] = {
match tokens with
| "[" ! v ! "]" ! rest => return Array (ctyp, v.int), rest;
| "[" ! "]" ! rest => return Array (ctyp, 0), rest;
| _ => println$ "Failed array"; return ctyp, tokens;
endmatch;
}
// ***********************************************
// Input a token list
// Parse a function parameter
// retun C type and parameter name (could be empty string)
// bugs out if trailing more than one token
// ***********************************************
fun parse_parameter(var tokens: list[string]) : opt[ctyp_t * string] * list[string] =
{
match parse_ctype tokens with
| None,toks =>
return None [ctyp_t * string], toks;
| Some cty, toks =>
var ctyp = cty;
tokens = toks;
return parse_field_tail (ctyp, tokens);
endmatch;
}
fun parse_field_tail (var ctyp:ctyp_t, var tokens: list[string])
: opt[ctyp_t * string] * list[string]
=
{
again:>
match tokens with
// no parameter name
| Empty => return Some (ctyp,""),tokens;
// array type, no parameter name
| "[" ! toks =>
tokens = "" ! tokens;
goto again; // push dummy name onto input
// array type, parameter name
| param ! "[" ! toks when iscident param =>
var l1 = tokens.len;
ctyp, tokens= parse_array (ctyp, tail tokens);
var l2 = tokens.len;
if l2 < l1 do // stop infinite loop
tokens = param + tokens; // put param name back in input
goto again;
else
return Some (ctyp, param), toks;
done
// function type, function name
// named: i.e like int f(int,int)
| param ! "(" ! toks when iscident param =>
l1 = tokens.len;
ctyp, tokens= parse_function_decl (ctyp, tail tokens);
l2 = tokens.len;
if l2 < l1 do // stop infinite loop
tokens = param + tokens; // put param name back in input
goto again;
else
return Some (ctyp, param), toks;
done
// function pointer type, name
// named: i.e like int (*f)(int,int)
| "(" ! "*" ! param ! toks when iscident param =>
l1 = tokens.len;
ctyp, tokens= parse_function_decl (ctyp, "(" ! "*" ! toks);
l2 = tokens.len;
if l2 < l1 do // stop infinite loop
tokens = param + tokens; // put param name back in input
goto again;
else
return Some (ctyp, param), toks;
done
// simple type, parameter name
| param ! toks when iscident param =>
return Some (ctyp,param),toks;
// simple type, no parameter name
| _ =>
return Some (ctyp,""), tokens;
endmatch;
}
// comma separated list
fun parse_parameter_list(var tokens: list[string]) : list[ctyp_t * string] * list[string] =
{
var parameters = Empty[ctyp_t * string];
again:>
match parse_parameter tokens with
| None,toks =>
return rev parameters, toks;
| Some (ctyp,name), toks =>
tokens = toks;
parameters -= (ctyp,name);
if is_empty tokens return rev parameters,tokens;
match tokens with
| ","! rest =>
tokens = rest;
goto again;
| _ =>
return rev parameters, tokens;
endmatch;
endmatch;
}
// excludes trailing ; cause it isn't a ; in a typedef!
fun parse_field_list (var tokens: list[string]) : list[ctyp_t * string] * list[string] =
{
var fields = Empty[ctyp_t * string];
match parse_type_base tokens with
| None, toks => return fields, tokens; // no fields
| Some (base_ctyp,isconst), toks =>
var ctyp,toks2 = parse_tail_type (base_ctyp,isconst,toks);
again:>
match parse_field_tail (ctyp, toks2) with
| None, toks => return fields, toks;
| Some fld, toks =>
fields -= fld;
if head toks == "," do
ctyp,toks2 = parse_tail_type (base_ctyp,isconst,tail toks);
goto again;
else
return rev fields, toks;
done
endmatch;
endmatch;
}
// includes { and }
// returns None if no { present
fun parse_struct_body (var tokens: list[string]) : opt[list[field_t]] * list[string] =
{
var fields = Empty[field_t];
if head tokens != "{" return None[list[field_t]], tokens;
tokens = tail tokens;
again:>
if head tokens == "}" return Some fields, tail tokens;
var flds, toks = parse_field_list tokens;
fields += flds;
if head toks != ";" return Some fields,toks;
tokens = tail toks;
goto again;
}
// called by parse_function_decl only
private fun parse_function_ptr_decl (ret:ctyp_t, var tokens:list[string]) : ctyp_t * list[string] = {
match tokens with
| "*" ! ")" ! "(" ! trailing =>
def var params, tokens = parse_parameter_list trailing;
assert head tokens== ")";
return Ptr (Function (ret, params)), tail tokens;
| _ =>
println$ "Error, parse_function_ptr_decl, tokens=" + tokens.str;
assert false;
endmatch;
}
fun parse_function_decl (ret:ctyp_t, var tokens:list[string]) : ctyp_t * list[string] = {
assert head tokens == "(";
tokens = tail tokens; // head must be "(" to get here
if head tokens == "*" return parse_function_ptr_decl (ret, tokens);
var params, trailing= parse_parameter_list tokens;
assert head trailing== ")";
return Function (ret,params), tail trailing;
}
}