(**********************************************************************)
(* (c) Greg Morrisett, Neal Glew, Dan Grossman                        *)
(*     June 1998, all rights reserved.                                *)
(**********************************************************************)

(* tallex.mll
 *
 * A basic lexer for tal format MASM files.
 *
 * TODO: really should make a more efficient symbol table as the three hash
 *       table lookups with exception handlers ain't great!
 *)

{
open Numtypes;;
open Tal;;
open Talparser;;

let err s lexbuf =
  let seg =
    Gcdfec.seg_of_abs
      (Lexing.lexeme_start lexbuf) (Lexing.lexeme_end lexbuf) in
  Gcdfec.post_error (Gcdfec.mk_err_lex seg s)
;;

(* Unfortunately we need to scan some filenames and they look too much like
 * identifiers.  The only way around this is to use context information to
 * know when to look for them.  The rw_filename table contains keywords that
 * require a filename on the same line.
 *)

let rw_filename = Hashtbl.create 13;;

let _ =
  List.iter 
    (fun (s,f) -> 
      Hashtbl.add rw_filename s f;
      Hashtbl.add rw_filename (String.uppercase s) f)
    ["include",(fun s->Tinclude s); "tal_export",(fun s->Ttal_export s);
      "tal_import",(fun s->Ttal_import s)]
;;

(* MASM reserved words are not case sensitive, we use two tables for reserved
 * words: those that are case sensitive (some of the TAL keywords), and those
 * that aren't (some TAL keywords, all MASM reserved words)
 *)

let rw_cis = Hashtbl.create 157;; (* Replace size with something better *)
let rw_cs = rw_cis (* Hashtbl.create 157;;*)

let _ =
  List.iter
    (fun (s,t) -> 
      Hashtbl.add rw_cis s t; 
      Hashtbl.add rw_cis (String.uppercase s) t)
    ["adc",Tadc; "add",Tadd; "ah",Tah; "al",Tal;
      "and",Tand; "asub",Tasub;
      "aupd",Taupd; "ax",Tax; "bh",Tbh; "bl",Tbl; "bp",Tbp;
      "bswap",Tbswap; "btagi",Tbtagi; "btagvar",Tbtagvar; "bx",Tbx;
      "call",Tcall; "cbw",Tcbw; "cdq",Tcdq; "ch",Tch; "cl",Tcl;
      "clc",Tclc; "cmc",Tcmc;
      "cmova",Tcmov Above; "cmovae",Tcmov AboveEq; "cmovb",Tcmov Below;
      "cmovbe",Tcmov BelowEq; "cmovc",Tcmov Below; "cmove",Tcmov Eq;
      "cmovg",Tcmov Greater; "cmovge",Tcmov GreaterEq; "cmovl",Tcmov Less;
      "cmovle",Tcmov LessEq; "cmovna",Tcmov BelowEq; "cmovnae",Tcmov Below;
      "cmovnb",Tcmov AboveEq; "cmovnbe",Tcmov Above; "cmovnc",Tcmov AboveEq;
      "cmovne",Tcmov NotEq; "cmovng",Tcmov LessEq; "cmovnge",Tcmov Less;
      "cmovnl",Tcmov GreaterEq; "cmovnle",Tcmov Greater;
      "cmovno",Tcmov NotOverflow; "cmovnp",Tcmov ParityOdd;
      "cmovns",Tcmov NotSign; "cmovnz",Tcmov NotEq; "cmovo",Tcmov Overflow;
      "cmovp",Tcmov ParityEven; "cmovpe",Tcmov ParityEven;
      "cmovpo",Tcmov ParityOdd; "cmovs",Tcmov Sign; "cmovz",Tcmov Eq;
      "cmp",Tcmp; "code",Tcode; "coerce",Tcoerce;
      "cwd",Tcwd; "cwde",Tcwde; "cx",Tcx; 
      "data",Tdata; "db",Tdb; "dd",Tdd;
      "dec",Tdec; "dh",Tdh; "di",Tdi;"div",Tdiv;
      "dl",Tdl; "dw",Tdw; "dword",Tdword; "dx",Tdx; 
      "eax",Teax; "ebp",Tebp; "ebx",Tebx; "ecx",Tecx; "edi",Tedi; "edx",Tedx;
      "end",Tend; "esi",Tesi; "esp",Tesp; "fallthru",Tfallthru; 
      "gettla",Tgettla;
      "idiv",Tidiv; "imul",Timul; "inc",Tinc; "int",Tint;
      "into",Tinto; 
      "ja",Tj Above; "jae",Tj AboveEq; "jb",Tj Below; "jbe",Tj BelowEq;
      "jc",Tj Below; "je",Tj Eq; "jecxz",Tjecxz; "jg",Tj Greater;
      "jge",Tj GreaterEq; "jl",Tj Less; "jle",Tj LessEq; "jmp",Tjmp;
      "jna",Tj BelowEq; "jnae",Tj Below; "jnb",Tj AboveEq; "jnbe",Tj Above;
      "jnc",Tj AboveEq; "jne",Tj NotEq; "jng",Tj LessEq; "jnge",Tj Less;
      "jnl",Tj GreaterEq; "jnle",Tj Greater; "jno",Tj NotOverflow;
      "jnp",Tj ParityOdd; "jns",Tj NotSign; "jnz",Tj NotEq; "jo",Tj Overflow;
      "jp",Tj ParityEven; "jpe",Tj ParityEven; "jpo",Tj ParityOdd;
      "js",Tj Sign; "jz",Tj Eq;
      "labeltype",Tlabeltype; "lahf",Tlahf; "lea",Tlea;
      "loopd",Tloopd; "looped",Tlooped; "loopned",Tloopned;
      "malloc",Tmalloc; "mov",Tmov; "movsx",Tmovsx; "movzx",Tmovzx;
      "mul",Tmul; "neg",Tneg; "nop",Tnop; "not",Tnot; "or",Tor; "pop",Tpop;
      "popad",Tpopad; "popfd",Tpopfd; "ptr",Tptr;
      "push",Tpush; "pushad",Tpushad;
      "pushfd",Tpushfd; "rcl",Trcl; "rcr",Trcr; "retn",Tretn; "rol",Trol;
      "ror",Tror; "sahf",Tsahf; "sal",Tsal; 
      "sar",Tsar; "sbb",Tsbb;
      "seta",Tset Above; "setae",Tset AboveEq; "setb",Tset Below;
      "setbe",Tset BelowEq; "setc",Tset Below;
      "sete",Tset Eq; "setg",Tset Greater; "setge",Tset GreaterEq;
      "setl",Tset Less; "setle",Tset LessEq; "setna",Tset BelowEq;
      "setnae",Tset Below; "setnb",Tset AboveEq; "setnbe",Tset Above;
      "setnc",Tset AboveEq; "setne",Tset NotEq; "setng",Tset LessEq;
      "setnge",Tset Less; "setnl",Tset GreaterEq; "setnle",Tset Greater;
      "setno",Tset NotOverflow; "setnp",Tset ParityOdd; "setns",Tset NotSign;
      "setnz",Tset NotEq; "seto",Tset Overflow; "setp",Tset ParityEven;
      "setpe",Tset ParityEven; "setpo",Tset ParityOdd; 
      "settla",Tsettla; "sets",Tset Sign; "setz",Tset Eq;
      "shl",Tshl; "shld",Tshld; "shr",Tshr; "shrd",Tshrd; "si",Tsi; "sp",Tsp;
      "stc",Tstc; "sub",Tsub;
      "tal_struct",Ttal_struct; "tal_ends",Ttal_ends; "test",Ttest;
      "tla",Ttla;
      "type",Ttype; "unpack",Tunpack; "val",Tval; "xchg",Txchg; "xor",Txor
    ]
;;

let _ =
  List.iter
    (fun (s,t) -> Hashtbl.add rw_cs s t) 
    ["?", Tquestion; "_begin_TAL",T_begin_TAL; "_end_TAL",T_end_TAL;
      "All",TAll;
      "array",Tarray; "B1",TB Byte1; "B2",TB Byte2; "B4",TB Byte4;
      "B8",TB Byte8; "Exist",TExist;
      "fn",Tfn; "junk",Tjunk; "pack",Tpack;
      "R",TR; "R16",TR16; "rec",Trec; "RH",TRH; "RL",TRL; "roll",Troll;
      "rollsum",Trollsum;
      "S",TS; "se",Tse; "Sint",TSint; "slot",Tslot; "sptr",Tsptr;
      "subsume",Tsubsume; "sum",Tsum;
      "T",TT; "T1",TT1; "T2",TT2; "T4",TT4; "T8",TT8; "tapp",Ttapp; "Tm",TTm;
      "Ts",TTs; "unroll",Tunroll
    ] 
;;

(* Cyclone *)
let _ =
  List.iter
    (fun (s,t) -> 
      Hashtbl.add rw_cis s t;
      Hashtbl.add rw_cis (String.uppercase s) t)
    [ "cgstart",Tcgstart; "cgdump",Tcgdump;
      "cgend",Tcgend; "cgabort",Tcgabort; "cgforget",Tcgforget;
      "cgfill",Tcgfill; "cgfillbtag",Tcgfillbtag;
      "cgfilljmp",Tcgfilljmp; "cgfilljcc",Tcgfilljcc;
      "cghole",Tcghole;
      "cgholebtagi",Tcgholebtagi; "cgholebtagvar",Tcgholebtagvar;
      "cgholejmp",Tcgholejmp;
      "cgholejcc",Tcgholejcc;
      "template_start",Ttemplate_start; "template_end",Ttemplate_end;
      "tmpl",Ttmpl; "cgregion",Tcgregion; "ecg",Tecg
    ]
;;
let _ =
  List.iter
    (fun (s,t) -> Hashtbl.add rw_cs s t) 
    [ "_begin_CYCLONE",T_begin_CYCLONE;
      "te",Tte; "Tt",TTt; "tptr",Ttptr; ]
;;
(* End Cyclone *)

let process_identifier s filename lexbuf =
(*   let lower = String.lowercase s in *)
  try Hashtbl.find rw_cis  s (* lower *)
  with Not_found ->
(*    try Hashtbl.find rw_cs s
    with Not_found -> *)
      try
	let t = (Hashtbl.find rw_filename s (* lower*) ) in
	t (filename lexbuf)
      with Not_found ->
      	Tident s
;;

let process_number s =
  let l = String.length s in
  if l=0 then invalid_arg "Tallex.process_number";
  match s.[l-1] with
    'b' | 'y' | 'o' | 'q' | 'h' ->
      failwith "Tallex.process_number - nondecimal unimplemented"
  | 'd' | 't' -> Tnumber (int32_of_string (String.sub s 0 (l-1)))
  | '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ->
      Tnumber (int32_of_string s)
  | _ -> invalid_arg "Tallex.process_number"
;;

let process_string s =
  Tstring (String.sub s 1 (String.length s - 2))
;;
} 

rule main = parse
(* Whitespace, Comments *)
  [' ' '\009' '\011' '\012']+ {main lexbuf}
| '\010' | '\013' | "\010\013" | "\013\010"
    {Gcdfec.new_line lexbuf; Teol}
| ';' {comment lexbuf; Teol}
(* Identifiers *)
| ['a'-'z' 'A'-'Z' '_' '$' '?']['a'-'z' 'A'-'Z' '_' '$' '?' '0'-'9']*
    {process_identifier (Lexing.lexeme lexbuf) filename lexbuf}
(* Literals *)
| '-'? ['0'-'9']+['d' 't']?
| ['0'-'9']+['b' 'y' 'o' 'q']
| ['0'-'9']['0'-'9' 'a'-'f' 'A'-'F']*'h'
    {process_number (Lexing.lexeme lexbuf)}
| '\'' [' '-'&' '\040'-'~']* '\''
| '\034' ['\032' '\033' '\035'-'~']* '\034'
    {process_string (Lexing.lexeme lexbuf)}
(* Punctuation *)
| '<' {Tlab}            | '^' {Tcaret}
| '>' {Trab}            | '@' {Tat}
| '(' {Tlb}             | ',' {Tcomma}
| ')' {Trb}		| '.' {Tdot}  
| '[' {Tlsb}		| '*' {Tstar} 
| ']' {Trsb}		| '+' {Tplus} 
| '{' {Tlcb}		| ':' {Tcolon}
| '}' {Trcb}		| '=' {Tequal}
| '|' {Tbar}            | '`' {Tbackquote}
(* Special *)
| "-!>" {Tarrow}
| "::" {Tcons}
| "<=" {Tleq}
| "^r" {Tvar Read}
| "^w" {Tvar Write}
| "^rw" {Tvar ReadWrite}
| "^u" {Tvar Uninit}
| eof {Teof}
(* Cyclone *)
| ":::" {Ttcons}
(* End Cyclone *)
(* Ununsed *)
| ['~' '\\' '/' '-' '&' '%' '#' '!' '\'']
    {err "Current unused character" lexbuf; main lexbuf}
| ['\000'-'\008' '\014'-'\031' '\127'-'\255']
    {err "Illegal Character" lexbuf; raise Gcdfec.Exit}
| _ {failwith ("should not be here! "^
	       (Char.escaped (Lexing.lexeme_char lexbuf 0)))}
(* Comments *)
and comment = parse
  '\010' | '\013' | "\010\013" | "\013\010" {Gcdfec.new_line lexbuf}
| [^ '\010' '\013' ]+ {comment lexbuf}

(* Filenames *)
and filename = parse
  [' ' '\009' '\011' '\012']+ {filename lexbuf}
| ['\033'-'\126']+ {Lexing.lexeme lexbuf}
| _ {err "Illegal Character in filename" lexbuf; raise Gcdfec.Exit}

(* EOF: tallex.mll *)
