(**********************************************************************)
(* (c) Greg Morrisett, Neal Glew, Richard Samuels                     *)
(*     September 1998, all rights reserved.                           *)
(**********************************************************************)

(* Talasm
 * Generic TAL assembler, x86 stuff is in Talasmx86
 *)

(* Changes:

   RLS 11/15/98:
     Added the DEBUG_ASSEMBLER flag seen below. By default it is true for now.
     Added assemble_instr, which checks for instruction length errors when
     DEBUG_ASSEMBLER is true. If you see "Assembler assertion: instruction has
     grown", then there's something wrong with length_of_instr for that
     instruction in talasmx86.ml.

   RLS 11/10/98:
     update_symbol_size now prints an error message if a block has grown.
     If you see "Assembler assertion: block has grown", then there's something
     wrong with length_of_operand in talasmx86.ml.


   To do:
   - Is it better to store addends in relocations and put them in later in the
   linking pass (done now), or to write the addends in the second pass, and
   read them and add the symbol offsets to them in the linking pass? The first
   way stores addends along with the relocations (I thought this was good)
   and is better for ELF "rela" relocations; the second is better for COFF
   relocations and ELF "rel" relocations.
*)

open Numtypes;;
open Identifier;;
open Tal;;
open Objfile;;
open Talasmx86;;
open Talpp;;

let debug_assembler = ref true;;

(***** asm_env *****)

(* make_asm_env
 * Returns a new asm_env. This function SHOULD NOT CHANGE between
 * implementations.
 *)

let make_asm_env schan symdict pos_offset r_reloclist sec_num = {
   a_schan = schan;
   a_symdict = symdict;
   a_secnum = sec_num;
   a_pos_offset = pos_offset;
   a_r_reloclist = r_reloclist
 }
;;

(********************************** SYMBOLS **********************************)

(***** Imports and exports *****)

(* make_import_symbols
   Creates an import symbol for each identifier in the imports list.
   Returns: a list of import symbols.
   Side effects: all symbols are inserted into the symbol dict.
     Any special symbols are inserted without identifier numbers. *)

let rec make_import_symbols r_symdict imports =
   match imports with
      [] -> []
    | ((import, _) :: tail) ->
	 let import_symbol = (make_external_symbol import) in
	 (insert_symbol_in_dict r_symdict import_symbol);
	 import_symbol :: (make_import_symbols r_symdict tail)
;;

(* export_symbol
   Side effects:
     - Looks up the symbol with identifier ident, and makes it global. *)

let export_symbol symdict (ident, _) =
   let symbol = (lookup_symbol symdict ident) in
   symbol.sym_scope <- Global
;;

(* update_symbol_pos
   Looks up the symbol with identifier ident in the symbol dictionary,
   and changes its position to newpos.

   Returns:
     - The difference between the real position and the expected
       position. E.g., if expected position is 12 and real position is 10,
       returns 2.
   Side effects:
     - Changes the position of the symbol to newpos. *)

let update_symbol_pos symdict ident newpos =
   let symbol = (Dict.lookup symdict ident) in
   if symbol.sym_section = 0 then
      ((failwith "update_symbol_pos: symbol is external.");
	 0)
   else
      let pos_offset = symbol.sym_offset - newpos in
      symbol.sym_offset <- newpos;
      pos_offset
;;

(* update_symbol_pos
   Looks up the symbol with identifier ident in the symbol dictionary,
   and changes its position to newpos.

   Side effects:
     - Changes the size of the symbol to newsize. *)

let update_symbol_size symdict ident newsize =
   let symbol = (Dict.lookup symdict ident) in

   (* Prints an error message if block has grown. --RLS 11/10/98*)
   if newsize > symbol.sym_size then
      Printf.eprintf
       "Assembler assertion: block has grown.\n\t %s was %d bytes, now is %d bytes.\n"
       (id_to_string ident) symbol.sym_size newsize;

   symbol.sym_size <- newsize;
;;

(****************************** CODE SECTIONS ********************************)

(***** Code preparsing *****)

(* length_of_instrs
   Returns: The length of all the instructions in the vector. *)

let rec length_of_instrs instrs =
   let r_length = ref 0 in
   for count = 0 to (Array.length instrs) - 1 do
      try
	 r_length := !r_length + (length_of_instr instrs.(count));
      with
	 Invalid_argument s ->
	    (invalid_arg (Printf.sprintf "instruction %d: %s" count s))
   done;
   !r_length
;;

(* preparse_code
   Preparses each of the code blocks, finding its length, and creating
   a symbol for the code block.

   Returns: Total code length.
   Side effects: A symbol is created in r_symlist and inserted in r_symdict
     for every code block. *)

let preparse_code r_symdict r_symlist sec_num startpos blocks =
   let r_offset = ref startpos in
   for count = 0 to (Array.length blocks) - 1 do
      match blocks.(count) with
	 (ident, _, instrs) ->
	    try
	       let block_size = (length_of_instrs instrs) in
	       let block_symbol =
		  (make_internal_symbol ident sec_num !r_offset block_size) in
	       (insert_symbol r_symdict r_symlist block_symbol);
	       r_offset := !r_offset + block_size;
	    with
	       Invalid_argument s ->
		  (invalid_arg (Printf.sprintf
				     "Preparsing error, label %s, %s"
				     (id_to_string ident) s))
   done;
   !r_offset				(* Return total code length. *)
;;

(***** Code assembly *****)

(* assemble_instr
   Assemble a single instruction. If in debug mode, check to see that
   the real length does not exceed the predicted length. *)

let assemble_instr env instr =
   if !debug_assembler then
      begin
	 let start_pos = (Stringchan.get_mark env.a_schan) in
	 let result = (write_instr env instr) in
	 let actual_length = ((Stringchan.get_mark env.a_schan) - start_pos) in
	 let predicted_length = (length_of_instr instr) in
	 if actual_length > predicted_length then
	    begin
	       Printf.printf "Assembler assertion: instruction has grown.\n";
	       (Talpp.print_instruction Format.std_formatter
		     {style = MASM; kinds = false; cons = false} instr);
	       (Printf.printf "\n\tPredicted %d bytes, actually %d bytes.\n"
		     predicted_length actual_length);
	    end
      end
   else
      (write_instr env instr)
;;

(* assemble_code
   Assemble each code block, updating the block's reference in the symbol
   dict to point to the actual position of the instruction.

   Preconditions:
     - schan must be a string channel at least as long as we'll need.
   Side effects:
     - Symbols in symdict are updated to their new positions.
     - Relocations are inserted into r_reloclist.
     - The assembled code is written into schan. *)

let assemble_code schan r_reloclist sec_num symdict blocks =
   let env = (make_asm_env schan symdict 0 r_reloclist sec_num) in
   for count = 0 to (Array.length blocks) - 1 do
      match blocks.(count) with
	 (ident, _, instrs) ->
	    let realpos = (Stringchan.get_mark schan) in
	    env.a_pos_offset <- (update_symbol_pos symdict ident realpos);
	    for instrcount = 0 to (Array.length instrs) - 1 do
	       try
		 assemble_instr env instrs.(instrcount);
	       with
		  Invalid_argument s ->
		    invalid_arg (Printf.sprintf
				   "Assembly error, label %s, instr %d: %s"
				   (id_to_string ident) instrcount s)
	    done;
	    (update_symbol_size symdict ident
		  ((Stringchan.get_mark schan) - realpos));
   done
;;

(* make_code_section
   Assembles the code from a tal_imp, and returns a new code section.

   Preconditions: The code must already have been preparsed, and a symbol
       must exist for every identifier in the section.

   Returns:  A ".text" section with data, relocs, and symbol fields set.
   Side effects:  - symbols in symdict are updated to be in their proper
       positions. *)

let make_code_section sec_num symdict code_symbols code_length imp =
   let r_code_relocs = ref [] in
   let code_data = (Stringchan.create code_length) in
   (assemble_code code_data r_code_relocs sec_num symdict imp.code_blocks);
   (make_section ".text" code_data
	 (List.rev !r_code_relocs)	(* Need to reverse lists. *)
	 (List.rev code_symbols))
;;

(****************************** DATA SECTIONS  *******************************)

(***** Data preparsing *****)

(* length_of_data_items
   Returns the sum of the lengths of all of the data items in the list. *)

let rec length_of_data_items ditems =
   match ditems with
      [] -> 0            
    | (ditem :: tail) -> 
	(* Alignment fix by Dan *)
	let this_one = length_of_data_item ditem in
	this_one + ((4-(this_one mod 4)) mod 4) + (length_of_data_items tail)
;;

(* preparse_data
   Preparses each of the data blocks, finding its length, and creating
   a symbol for it.

   Returns: total data length.
   Side effects: A symbol is created in r_symlist and inserted into r_symdict
     for each data symbol. *)

let preparse_data r_symdict r_symlist sec_num startpos blocks =
   let r_offset = ref startpos in
   for count = 0 to (Array.length blocks) - 1 do
      match blocks.(count) with
	 (ident, _, (ditems, _)) ->
	    let data_size = (length_of_data_items ditems) in
	    let data_symbol = (make_internal_symbol ident sec_num !r_offset
				    data_size) in
	    (insert_symbol r_symdict r_symlist data_symbol);
	    r_offset := !r_offset + data_size;
   done;
   !r_offset				(* Return total data length. *)
;;

(***** Data assembly *****)

(* assemble_data
   Assemble each data block.

   Preconditions:
     - schan must be a string at least as long as we'll need.
   Side effects:
     - Relocations are inserted into r_reloclist.
     - The assembled data is written into schan. *)

let assemble_data schan r_reloclist sec_num symdict blocks =
   let env = (make_asm_env schan symdict 0 r_reloclist sec_num) in
   for count = 0 to (Array.length blocks) - 1 do
     (* Alignment fix by Dan *)
     (* This is not a fix if indirect pointers can be at bad alignment *)
     let rec iter n =
       match n with
	 0 -> ()
       | _ -> Stringchan.put_byte schan 0; iter (n-1) in
     iter ((4 - ((Stringchan.get_mark schan)mod 4)) mod 4);
     (* end fix *)
     match blocks.(count) with
       (ident, _, (data, _)) ->
	 (List.iter (write_data_item env) data)
   done
;;

(* make_data_section
   Takes a TAL implementation, and returns a data section.

   Preconditions: The data section must already have been preparsed,
      and symdict must contain its symbols.

   Returns: A ".data" section with data, relocs, and symbol fields. *)

let make_data_section sec_num symdict data_symbols data_length imp =
   let r_data_relocs = ref [] in
   let data_chan = (Stringchan.create data_length) in
   (assemble_data data_chan r_data_relocs sec_num symdict imp.data_blocks);
   (make_section ".data" data_chan
	 (List.rev !r_data_relocs)
	 (List.rev data_symbols))
;;

(******************************** LINKING *********************************)

(***** Linker *****)

(* is_reloc_internal
   Returns TRUE if the relocation is internal to the section. *)

let is_reloc_internal sec_num reloc =
   ((reloc.rel_symbol.sym_section = sec_num) && (not reloc.rel_force_public))
;;

(* reloc_dest
   Returns the destination of a relocation, as either an absolute address
   or an offset from a given position. *)

let reloc_dest pos reloc =
   match reloc.rel_relativity with
      Absolute ->
 	(reloc.rel_symbol.sym_offset + (int32_to_int reloc.rel_addend))
    | Relative ->
 	((reloc.rel_symbol.sym_offset + (int32_to_int reloc.rel_addend)) - pos)
;;

(* put_reloc_dest
   Side effects:
     - Writes the relocation destination at the given position. *)

let put_reloc_dest data pos reloc =
   (Stringchan.set_mark data pos);
   match reloc.rel_scale with
      Byte1 -> (Stringchan.put_byte data (reloc_dest (pos + 1) reloc))
    | Byte2 -> (Stringchan.put_2bytes data (reloc_dest (pos + 2) reloc))
    | Byte4 -> (Stringchan.put_4bytes data (reloc_dest (pos + 4) reloc))
    | _ -> failwith "Linker: put_reloc_dest: Bad relocation scale."
;;

(* link_relocs
   Links all section-internal relocations in the given list of relocations.
   Returns:
     - A list of relocations with the linked internal relocations removed.
   Side effects:
     - Identifiers in the section's data that used internal relocations now
       have the proper addresses. *)

let rec link_relocs sec_num data relocs =
   match relocs with
      [] -> []
    | (reloc :: tail) ->
	 if (is_reloc_internal sec_num reloc) then
	    begin
	       (put_reloc_dest data reloc.rel_pos reloc);
	       (link_relocs sec_num data tail)(* Delete the used relocation. *)
	    end
	 else
	    (reloc :: (link_relocs sec_num data tail))
;;

(* link_sections
   Links the internal relocations in each section.
   Side effects:
     - All internal relocations are removed from each section's relocation
       list.
     - Identifiers in the section's data that used internal relocations are
       linked to the proper addresses. *)

let rec link_sections sec_num secs =
   match secs with
      [] -> ()
    | (sec :: tail) ->
	 let savemark = (Stringchan.get_mark sec.s_data) in
	 sec.s_relocs <- (link_relocs sec_num sec.s_data sec.s_relocs);
	 (Stringchan.set_mark sec.s_data savemark);
	 (link_sections (sec_num + 1) tail)
;;

(* link_file
   Performs internal linking for every section in the file.
   Side effects: Internal relocations are linked and removed. *)

let link_file objfile =
   (link_sections 1 objfile.o_secs)
;;

(***************************** ENTRY FUNCTION *****************************)

(***** Assembly *****)

(* assemble
   Assembles a Tal implementation into an object file.
   Returns: The assembled object file. *)

let assemble imp (imports_intt, exports_intt) =
   let r_symdict = ref (empty_symbol_table()) in
   let r_code_symbols = ref [] in
   let r_data_symbols = ref [] in

					(* Preparse & gather symbols. *)
   let import_symbols = (make_import_symbols r_symdict
			      (special_labels @ imports_intt.it_vals)) in
   let code_length = (preparse_code r_symdict r_code_symbols 1 0
			   imp.code_blocks) in
   let data_length = (preparse_data r_symdict r_data_symbols 2 0
			   imp.data_blocks) in

					(* Declare export symbols public. *)
   (List.iter (export_symbol !r_symdict) exports_intt.it_vals);

					(* Assemble code and data. *)
   let code_section = (make_code_section 1 !r_symdict !r_code_symbols
			    code_length imp) in
   let data_section = (make_data_section 2 !r_symdict !r_data_symbols
			    data_length imp) in

					(* Create an object file structure. *)
   let objfile = (make_objfile [code_section; data_section]
		       import_symbols !r_symdict) in

   (link_file objfile);			(* Link all local symbols. *)

   objfile
;;

(* EOF: talasm.ml *)
