(* ========================================================================= *)
(* Lexical analyzer, type and preterm parsers.                               *)
(*                                                                           *)
(*       John Harrison, University of Cambridge Computer Laboratory          *)
(*                                                                           *)
(*            (c) Copyright, University of Cambridge 1998                    *)
(*              (c) Copyright, John Harrison 1998-2007                       *)
(* ========================================================================= *)

needs "preterm.ml";;

(* ------------------------------------------------------------------------- *)
(* Need to have this now for set enums, since "," isn't a reserved word.     *)
(* ------------------------------------------------------------------------- *)

parse_as_infix (",",(14,"right"));;

(* ------------------------------------------------------------------------- *)
(* Basic parser combinators.                                                 *)
(* ------------------------------------------------------------------------- *)

exception Noparse;;

let (||) parser1 parser2 input =
  try parser1 input
  with Noparse -> parser2 input;;

let (++) parser1 parser2 input =
  let result1,rest1 = parser1 input in
  let result2,rest2 = parser2 rest1 in
  (result1,result2),rest2;;

let rec many prs input =
  try let result,next = prs input in
      let results,rest = many prs next in
      (result::results),rest
  with Noparse -> [],input;;

let (>>) prs treatment input =
  let result,rest = prs input in
  treatment(result),rest;;

let fix err prs input =
  try prs input
  with Noparse -> failwith (err ^ " expected");;

let rec listof prs sep err =
  prs ++ many (sep ++ fix err prs >> snd) >> (fun (h,t) -> h::t);;

let nothing input = [],input;;

let elistof prs sep err =
  listof prs sep err || nothing;;

let leftbin prs sep cons err =
  prs ++ many (sep ++ fix err prs) >>
  (fun (x,opxs) -> let ops,xs = unzip opxs in
                   itlist2 (fun op y x -> cons op x y) (rev ops) (rev xs) x);;

let rightbin prs sep cons err =
  prs ++ many (sep ++ fix err prs) >>
  (fun (x,opxs) -> if opxs = [] then x else
                   let ops,xs = unzip opxs in
                   itlist2 cons ops (x::butlast xs) (last xs));;

let possibly prs input =
  try let x,rest = prs input in [x],rest
  with Noparse -> [],input;;

let some p =
  function
      [] -> raise Noparse
    | (h::t) -> if p h then (h,t) else raise Noparse;;

let a tok = some (fun item -> item = tok);;

let rec atleast n prs i =
  (if n <= 0 then many prs
   else prs ++ atleast (n - 1) prs >> (fun (h,t) -> h::t)) i;;

let finished input =
  if input = [] then 0,input else failwith "Unparsed input";;

(* ------------------------------------------------------------------------- *)
(* The basic lexical classes: identifiers, strings and reserved words.       *)
(* ------------------------------------------------------------------------- *)

type lexcode = Ident of string
             | Resword of string;;

(* ------------------------------------------------------------------------- *)
(* Lexical analyzer. Apart from some special bracket symbols, each           *)
(* identifier is made up of the longest string of alphanumerics or           *)
(* the longest string of symbolics.                                          *)
(* ------------------------------------------------------------------------- *)

reserve_words ["//"];;

let comment_token = ref (Resword "//");;

let lex =
  let collect (h,t) = end_itlist (^) (h::t) in
  let reserve =
    function
        (Ident n as tok) ->
            if is_reserved_word n then Resword(n) else tok
      | t -> t in
  let stringof p = atleast 1 p >> end_itlist (^) in
  let simple_ident = stringof(some isalnum) || stringof(some issymb) in
  let undertail = stringof (a "_") ++ possibly simple_ident >> collect in
  let ident = (undertail || simple_ident) ++ many undertail >> collect in
  let septok = stringof(some issep) in
  let escapecode i =
    match i with
      "\\"::rst -> "\\",rst
    | "\""::rst -> "\"",rst
    | "\'"::rst -> "\'",rst
    | "n"::rst -> "\n",rst
    | "r"::rst -> "\r",rst
    | "t"::rst -> "\t",rst
    | "b"::rst -> "\b",rst
    | " "::rst -> " ",rst
    | "x"::h::l::rst ->
         String.make 1 (Char.chr(int_of_string("0x"^h^l))),rst
    | a::b::c::rst when forall isnum [a;b;c] ->
         String.make 1 (Char.chr(int_of_string(a^b^c))),rst
    | _ -> failwith "lex:unrecognized OCaml-style escape in string" in
  let stringchar =
      some (fun i -> i <> "\\" & i <> "\"")
  || (a "\\" ++ escapecode >> snd) in
  let string = a "\"" ++ many stringchar ++ a "\"" >>
        (fun ((_,s),_) -> "\""^implode s^"\"") in
  let rawtoken = (string || some isbra || septok || ident) >>
    (fun x -> Ident x) in
  let simptoken = many (some isspace) ++ rawtoken >> (reserve o snd) in
  let rec tokens i =
    try let (t,rst) = simptoken i in
        if t = !comment_token then
          (many (fun i -> if i <> [] & hd i <> "\n" then 1,tl i
                          else raise Noparse) ++ tokens >> snd) rst
        else
          let toks,rst1 = tokens rst in t::toks,rst1
    with Noparse -> [],i in
  fst o (tokens ++ many (some isspace) ++ finished >> (fst o fst));;

(* ------------------------------------------------------------------------- *)
(* Parser for pretypes. Concrete syntax:                                     *)
(*                                                                           *)
(* TYPE        :: SUMTYPE -> TYPE                                            *)
(*              | SUMTYPE                                                    *)
(*                                                                           *)
(* SUMTYPE     :: PRODTYPE + SUMTYPE                                         *)
(*              | PRODTYPE                                                   *)
(*                                                                           *)
(* PRODTYPE    :: POWTYPE # PRODTYPE                                         *)
(*              | POWTYPE                                                    *)
(*                                                                           *)
(* POWTYPE     :: APPTYPE ^ POWTYPE                                          *)
(*              | APPTYPE                                                    *)
(*                                                                           *)
(* APPTYPE     :: ATOMICTYPES type-constructor  [Provided arity matches]     *)
(*              | ATOMICTYPES                   [Provided only 1 ATOMICTYPE] *)
(*                                                                           *)
(* ATOMICTYPES :: type-constructor              [Provided arity zero]        *)
(*              | type-variable                                              *)
(*              | ( TYPE )                                                   *)
(*              | ( TYPE LIST )                                              *)
(*                                                                           *)
(* TYPELIST    :: TYPE , TYPELIST                                            *)
(*              | TYPE                                                       *)
(*                                                                           *)
(* Two features make this different from previous HOL type syntax:           *)
(*                                                                           *)
(*  o Any identifier not in use as a type constant will be parsed as a       *)
(*    type variable; a ' is not needed and a * is not allowed.               *)
(*                                                                           *)
(*  o Antiquotation is not supported.                                        *)
(* ------------------------------------------------------------------------- *)

let parse_pretype =
  let btyop n n' x y = Ptycon(n,[x;y])
  and mk_apptype =
    function
        ([s],[]) -> s
      | (tys,[c]) -> Ptycon(c,tys)
      | _ -> failwith "Bad type construction"
  and type_atom input =
    match input with
      (Ident s)::rest ->
          (try pretype_of_type(assoc s (type_abbrevs())) with Failure _ ->
           if try get_type_arity s = 0 with Failure _ -> false
           then Ptycon(s,[]) else Utv(s)),rest
    | _ -> raise Noparse
  and type_constructor input =
    match input with
      (Ident s)::rest -> if try get_type_arity s > 0 with Failure _ -> false
                         then s,rest else raise Noparse
    | _ -> raise Noparse in
  let rec pretype i = rightbin sumtype (a (Resword "->")) (btyop "fun") "type" i
  and sumtype i = rightbin prodtype (a (Ident "+")) (btyop "sum") "type" i
  and prodtype i = rightbin carttype (a (Ident "#")) (btyop "prod") "type" i
  and carttype i = leftbin apptype (a (Ident "^")) (btyop "cart") "type" i
  and apptype i = (atomictypes ++ (type_constructor >> (fun x -> [x])
                                || nothing) >> mk_apptype) i
  and atomictypes i =
        (((a (Resword "(")) ++ typelist ++ a (Resword ")") >> (snd o fst))
      || type_atom >> (fun x -> [x])) i
  and typelist i = listof pretype (a (Ident ",")) "type" i in
  pretype;;

(* ------------------------------------------------------------------------- *)
(* Hook to allow installation of user parsers.                               *)
(* ------------------------------------------------------------------------- *)

let install_parser,delete_parser,installed_parsers,try_user_parser =
  let rec try_parsers ps i =
    if ps = [] then raise Noparse else
    try snd(hd ps) i with Noparse -> try_parsers (tl ps) i in
  let parser_list =
    ref([]:(string*(lexcode list -> preterm * lexcode list))list) in
  (fun dat -> parser_list := dat::(!parser_list)),
  (fun key -> try parser_list := snd (remove (fun (key',_) -> key = key')
                                                 (!parser_list))
                  with Failure _ -> ()),
  (fun () -> !parser_list),
  (fun i -> try_parsers (!parser_list) i);;

(* ------------------------------------------------------------------------- *)
(* Initial preterm parsing. This uses binder and precedence/associativity/   *)
(* prefix status to guide parsing and preterm construction, but treats all   *)
(* identifiers as variables.                                                 *)
(*                                                                           *)
(* PRETERM            :: APPL_PRETERM binop APPL_PRETERM                     *)
(*                     | APPL_PRETERM                                        *)
(*                                                                           *)
(* APPL_PRETERM       :: APPL_PRETERM : type                                 *)
(*                     | APPL_PRETERM BINDER_PRETERM                         *)
(*                     | BINDER_PRETERM                                      *)
(*                                                                           *)
(* BINDER_PRETERM     :: binder VARSTRUCT_PRETERMS . PRETERM                 *)
(*                     | let PRETERM and ... and PRETERM in PRETERM          *)
(*                     | ATOMIC_PRETERM                                      *)
(*                                                                           *)
(* VARSTRUCT_PRETERMS :: TYPED_PRETERM VARSTRUCT_PRETERMS                    *)
(*                     | TYPED_PRETERM                                       *)
(*                                                                           *)
(* TYPED_PRETERM      :: TYPED_PRETERM : type                                *)
(*                     | ATOMIC_PRETERM                                      *)
(*                                                                           *)
(* ATOMIC_PRETERM     :: ( PRETERM )                                         *)
(*                     | if PRETERM then PRETERM else PRETERM                *)
(*                     | [ PRETERM; .. ; PRETERM ]                           *)
(*                     | { PRETERM, .. , PRETERM }                           *)
(*                     | { PRETERM | PRETERM }                               *)
(*                     | identifier                                          *)
(*                                                                           *)
(* Note that arbitrary preterms are allowed as varstructs. This allows       *)
(* more general forms of matching and considerably regularizes the syntax.   *)
(* ------------------------------------------------------------------------- *)

let parse_preterm =
  let rec pairwise r l =
    match l with
      [] -> true
    | h::t -> forall (r h) t & pairwise r t in
  let rec pfrees ptm acc =
    match ptm with
      Varp(v,pty) ->
        if v = "" & pty = dpty then acc
        else if can get_const_type v or can num_of_string v
                or exists (fun (w,_) -> v = w) (!the_interface) then acc
        else insert ptm acc
    | Constp(_,_) -> acc
    | Combp(p1,p2) -> pfrees p1 (pfrees p2 acc)
    | Absp(p1,p2) -> subtract (pfrees p2 acc) (pfrees p1 [])
    | Typing(p,_) -> pfrees p acc in
  let pdest_eq (Combp(Combp(Varp(("="|"<=>"),_),l),r)) = l,r in
  let pmk_let (letbindings,body) =
    let vars,tms = unzip (map pdest_eq letbindings) in
    let _ = warn(not
     (pairwise (fun s t -> intersect(pfrees s []) (pfrees t []) = []) vars))
     "duplicate names on left of let-binding: latest is used" in
    let lend = Combp(Varp("LET_END",dpty),body) in
    let abs = itlist (fun v t -> Absp(v,t)) vars lend in
    let labs = Combp(Varp("LET",dpty),abs) in
    rev_itlist (fun x f -> Combp(f,x)) tms labs in
  let pmk_vbinder(n,v,bod) =
    if n = "\\" then Absp(v,bod)
    else Combp(Varp(n,dpty),Absp(v,bod)) in
  let pmk_binder(n,vs,bod) =
    itlist (fun v b -> pmk_vbinder(n,v,b)) vs bod in
  let pmk_set_enum ptms =
    itlist (fun x t -> Combp(Combp(Varp("INSERT",dpty),x),t)) ptms
           (Varp("EMPTY",dpty)) in
  let pgenvar =
    let gcounter = ref 0 in
    fun () -> let count = !gcounter in
              (gcounter := count + 1;
               Varp("GEN%PVAR%"^(string_of_int count),dpty)) in
  let pmk_exists(v,ptm) = Combp(Varp("?",dpty),Absp(v,ptm)) in
  let pmk_list els =
    itlist (fun x y -> Combp(Combp(Varp("CONS",dpty),x),y))
           els (Varp("NIL",dpty)) in
  let pmk_bool =
    let tt = Varp("T",dpty) and ff = Varp("F",dpty) in
    fun b -> if b then tt else ff in
  let pmk_char c =
    let lis = map (fun i -> pmk_bool((c / (1 lsl i)) mod 2 = 1)) (0--7) in
    itlist (fun x y -> Combp(y,x)) lis (Varp("ASCII",dpty)) in
  let pmk_string s =
    let ns = map (fun i -> Char.code(String.get s i))
                 (0--(String.length s - 1)) in
    pmk_list(map pmk_char ns) in
  let pmk_setcompr (fabs,bvs,babs) =
    let v = pgenvar() in
    let bod = itlist (curry pmk_exists) bvs
                     (Combp(Combp(Combp(Varp("SETSPEC",dpty),v),babs),fabs)) in
    Combp(Varp("GSPEC",dpty),Absp(v,bod)) in
  let pmk_setabs (fabs,babs) =
    let evs =
      let fvs = pfrees fabs []
      and bvs = pfrees babs [] in
      if length fvs <= 1 or bvs = [] then fvs
      else intersect fvs bvs in
    pmk_setcompr (fabs,evs,babs) in
  let rec mk_precedence infxs prs inp =
    match infxs with
      (s,(p,at))::_ ->
          let topins,rest = partition (fun (s',pat') -> pat' = (p,at)) infxs in
          (if at = "right" then rightbin else leftbin)
          (mk_precedence rest prs)
          (end_itlist (||) (map (fun (s,_) -> a (Ident s)) topins))
          (fun (Ident op) x y -> Combp(Combp(Varp(op,dpty),x),y))
          ("term after binary operator")
          inp
    | _ -> prs inp in
  let pmk_geq s t = Combp(Combp(Varp("GEQ",dpty),s),t) in
  let pmk_pattern ((pat,guards),res) =
    let x = pgenvar() and y = pgenvar() in
    let vs = pfrees pat []
    and bod =
     if guards = [] then
       Combp(Combp(Varp("_UNGUARDED_PATTERN",dpty),pmk_geq pat x),
             pmk_geq res y)
     else
       Combp(Combp(Combp(Varp("_GUARDED_PATTERN",dpty),pmk_geq pat x),
                   hd guards),
             pmk_geq res y) in
    Absp(x,Absp(y,itlist (curry pmk_exists) vs bod)) in
  let pretype = parse_pretype
  and string inp =
    match inp with
      Ident s::rst when String.length s >= 2 &
                        String.sub s 0 1 = "\"" &
                        String.sub s (String.length s - 1) 1 = "\""
       -> String.sub s 1 (String.length s - 2),rst
    | _ -> raise Noparse
  and singleton1 x = [x]
  and lmk_ite (((((_,b),_),l),_),r) =
          Combp(Combp(Combp(Varp("COND",dpty),b),l),r)
  and lmk_typed =
    function (p,[]) -> p | (p,[ty]) -> Typing(p,ty) | _ -> fail()
  and lmk_let (((_,bnds),_),ptm) = pmk_let (bnds,ptm)
  and lmk_binder ((((s,h),t),_),p) = pmk_binder(s,h::t,p)
  and lmk_setenum(l,_) = pmk_set_enum l
  and lmk_setabs(((l,_),r),_) = pmk_setabs(l,r)
  and lmk_setcompr(((((f,_),vs),_),b),_) =
     pmk_setcompr(f,pfrees vs [],b)
  and lmk_decimal ((_,l0),ropt) =
    let l,r = if ropt = [] then l0,"1" else
              let r0 = hd ropt in
              let n_l = num_of_string l0
              and n_r = num_of_string r0 in
              let n_d = power_num (Int 10) (Int (String.length r0)) in
              let n_n = n_l */ n_d +/ n_r in
              string_of_num n_n,string_of_num n_d in
     Combp(Combp(Varp("DECIMAL",dpty),Varp(l,dpty)),Varp(r,dpty))
  and lmk_univ((_,pty),_) =
    Typing(Varp("UNIV",dpty),Ptycon("fun",[pty;Ptycon("bool",[])]))
  and any_identifier =
    function
        ((Ident s):: rest) -> s,rest
      | _ -> raise Noparse
  and identifier =
    function
       ((Ident s):: rest) ->
        if can get_infix_status s or is_prefix s or parses_as_binder s
        then raise Noparse else s,rest
      | _ -> raise Noparse
  and binder =
    function
        ((Ident s):: rest) ->
        if parses_as_binder s then s,rest else raise Noparse
      | _ -> raise Noparse
  and pre_fix =
    function
        ((Ident s):: rest) ->
        if is_prefix s then s,rest else raise Noparse
      | _ -> raise Noparse in
  let rec preterm i =
    mk_precedence (infixes()) typed_appl_preterm i
  and nocommapreterm i =
    let infs = filter (fun (s,_) -> s <> ",") (infixes()) in
    mk_precedence infs typed_appl_preterm i
  and typed_appl_preterm i =
    (appl_preterm ++
     possibly (a (Resword ":") ++ pretype >> snd)
    >> lmk_typed) i
  and appl_preterm i =
    (pre_fix ++ appl_preterm
    >> (fun (x,y) -> Combp(Varp(x,dpty),y))
  || binder_preterm ++ many binder_preterm >>
        (fun (h,t) -> itlist (fun x y -> Combp(y,x)) (rev t) h)) i
  and binder_preterm i =
    (a (Resword "let") ++
     leftbin (preterm >> singleton1) (a (Resword "and")) (K (@)) "binding" ++
     a (Resword "in") ++
     preterm
    >> lmk_let
  || binder ++
     typed_apreterm ++
     many typed_apreterm ++
     a (Resword ".") ++
     preterm
    >> lmk_binder
  || atomic_preterm) i
  and typed_apreterm i =
    (atomic_preterm ++
     possibly (a (Resword ":") ++ pretype >> snd)
    >> lmk_typed) i
  and atomic_preterm i =
    (try_user_parser
  || (a (Resword "(") ++ a (Resword ":")) ++ pretype ++ a (Resword ")")
    >> lmk_univ
  || string
     >> pmk_string
  || a (Resword "(") ++
     (any_identifier >> (fun s -> Varp(s,dpty))) ++
     a (Resword ")")
    >> (snd o fst)
  || a (Resword "(") ++
     preterm ++
     a (Resword ")")
    >> (snd o fst)
  || a (Resword "if") ++
     preterm ++
     a (Resword "then") ++
     preterm ++
     a (Resword "else") ++
     preterm
     >> lmk_ite
  || a (Resword "[") ++
     elistof preterm (a (Resword ";")) "term" ++
     a (Resword "]")
    >> (pmk_list o snd o fst)
  || a (Resword "{") ++
     (elistof nocommapreterm (a (Ident ",")) "term" ++  a (Resword "}")
            >> lmk_setenum
      || preterm ++ a (Resword "|") ++ preterm ++ a (Resword "}")
            >> lmk_setabs
      || preterm ++ a (Resword "|") ++ preterm ++
         a (Resword "|") ++ preterm ++ a (Resword "}")
            >> lmk_setcompr)
    >> snd
  || a (Resword "match") ++ preterm ++ a (Resword "with") ++ clauses
     >> (fun (((_,e),_),c) -> Combp(Combp(Varp("_MATCH",dpty),e),c))
  || a (Resword "function") ++ clauses
     >> (fun (_,c) -> Combp(Varp("_FUNCTION",dpty),c))
  || a (Ident "#") ++ identifier ++
     possibly (a (Resword ".") ++ identifier >> snd)
    >> lmk_decimal
  || identifier
    >> (fun s -> Varp(s,dpty))) i
  and pattern i =
    (preterm ++ possibly (a (Resword "when") ++ preterm >> snd)) i
  and clause i =
   ((pattern ++ (a (Resword "->") ++ preterm >> snd)) >> pmk_pattern) i
  and clauses i =
   ((possibly (a (Resword "|")) ++
     listof clause (a (Resword "|")) "pattern-match clause" >> snd)
    >> end_itlist (fun s t -> Combp(Combp(Varp("_SEQPATTERN",dpty),s),t))) i in
  (fun inp ->
    match inp with
      [Ident s] -> Varp(s,dpty),[]
    | _ -> preterm inp);;

(* ------------------------------------------------------------------------- *)
(* Type and term parsers.                                                    *)
(* ------------------------------------------------------------------------- *)

let parse_type s =
  let pty,l = (parse_pretype o lex o explode) s in
  if l = [] then type_of_pretype pty
  else failwith "Unparsed input following type";;

let parse_term s =
  let ptm,l = (parse_preterm o lex o explode) s in
  if l = [] then
   (term_of_preterm o (retypecheck [])) ptm
  else failwith "Unparsed input following term";;