/compiler.pas
unit Compiler;
interface

uses Opcode, SymTable;

const COMPILE_BUFFER_SIZE: Word = 65535;

type
SegArray = array[0..32767] of Byte;
PSegArray = ^SegArray;
CompileBuffer = object
    data: PSegArray;
    data_len: Word;
    constructor Init;
    destructor Done;
    procedure Add(const bytes: Pointer; const len: Word);
    procedure AddByte(const b: Byte);
    procedure AddWord(const w: Word);
    procedure Increment(const len: Word);
    procedure Write(var f: File);
end;
CompiledObject = record
    buffer: CompileBuffer;
    name: string;
    segment_name: string;
    symbols: SymbolTable;
    public_symbols: SymbolTable;
end;
OpArray = array[1..3] of Operand;

var currentObj: ^CompiledObject;

procedure Compile(const filename: string; var obj: CompiledObject);

implementation

uses Parse, Util;

type PartsKind = (partsNone, partsInstruction, partsDirective);
Parts = record
    label_or_name: string[32];
    mnemonic: string[7];
    case Kind: PartsKind of
        partsInstruction: (
            op1: string[32];
            op2: string[32];
            op3: string[32];
        );
        partsDirective: (
            args: string[128];
        )
end;
DataDefType = (DataDB, DataDW, DataDD);

procedure PartsClear(var pts: Parts);
begin
    pts.Kind := partsNone;
    pts.label_or_name := '';
    pts.mnemonic := '';
    pts.op1 := '';
    pts.op2 := '';
    pts.op3 := '';
    pts.args := '';

end;

procedure PartsDump(const pts: Parts);
begin
    if pts.kind = partsNone then
        Writeln('=== NONE')
    else if pts.kind = partsInstruction then begin
        Writeln('=== Instruction label: ', pts.label_or_name, '; ',
                'mnemonic: ', pts.mnemonic, '; ',
                'op1: ', pts.op1, '; ',
                'op2: ', pts.op2, '; ',
                'op3: ', pts.op3, ';');
    end
    else if pts.kind = partsDirective then begin
        Writeln('=== Directive label:', pts.label_or_name, '; ',
                'directive: ', pts.mnemonic, '; ',
                'args: ', pts.args, '; ');
    end;
end;

constructor CompileBuffer.Init;
begin
    New(data);
end;

destructor CompileBuffer.Done;
begin
    Dispose(data);
end;

procedure CompileBuffer.Add(const bytes: Pointer; const len: Word);
var i: Integer;
    b: ^Byte;
begin
    b := bytes;
    for i := 0 to len - 1 do begin
        data^[data_len + i] := b^;
        Inc(b);
    end;
    data_len := data_len + len;
end;

procedure CompileBuffer.AddByte(const b: Byte);
begin
    data^[data_len] := b;
    Inc(data_len);
end;

procedure CompileBuffer.AddWord(const w: Word);
begin
    data^[data_len] := Lo(w);
    Inc(data_len);
    data^[data_len] := Hi(w);
    Inc(data_len);
end;

procedure CompileBuffer.Increment(const len: Word);
begin
    data_len := data_len + len;
end;

procedure CompileBuffer.Write(var f: File);
begin
    BlockWrite(f, data, data_len);
end;

procedure PickParts(const s: String; var pts: Parts);
var i, j: Byte;
    a, b: String[32];
begin
    i := 1;
    ConsumeWhitespace(s, i);
    a := ToUpCase(ConsumeIdentifier(s, i));
    if s[i] = ':' then begin { this is a label }
        pts.label_or_name := a;
        Inc(i);
        ConsumeWhitespace(s, i);
        if i > Length(s) then
            Exit;

        a := ToUpCase(ConsumeIdentifier(s, i));
    end;
    ConsumeWhitespace(s, i);
    j := i;  { save position }
    b := ToUpCase(ConsumeIdentifier(s, j));
    { EQU uniquely and annoyingly has a name before and an arg after }
    if b = 'EQU' then begin
        pts.Kind := partsDirective;
        pts.label_or_name := a;
        pts.mnemonic := b;
        ConsumeWhitespace(s, j);
        pts.args := Copy(s, j, Length(s) - j + 1);
        Exit;
    end
    { Things which have a name but no other args }
    else if (b = 'SEGMENT') or (b = 'STRUC') or (b = 'ENDS') then begin
        pts.Kind := partsDirective;
        pts.label_or_name := a;
        pts.mnemonic := b;
        Exit;
    end
    { Then everything that is just a directive and arguments }
    else if (a = 'DB') or (a = 'DW') or (a = 'DD') or (a = 'DS') or
            (a = 'ORG') or (a = 'NAME') or (a = 'PUBLIC') or
            (a = 'EXTRN') then begin
        pts.Kind := partsDirective;
        pts.mnemonic := a;
        ConsumeWhitespace(s, i);
        pts.args := Copy(s, i, Length(s) - i + 1);
        Exit;
    end
    { And finally, standalone directives }
    else if (a = 'EVEN') or (a = 'END') then begin
        pts.Kind := partsDirective;
        pts.mnemonic := a;
        Exit;
    end;

    { Everything left must be an instruction }
    pts.Kind := partsInstruction;
    pts.mnemonic := a;
    ConsumeWhitespace(s, i);
    pts.op1 := ConsumeUntilComma(s, i);
    Trim(pts.op1);
    ConsumeWhitespaceAndCommas(s, i);
    if i > Length(s) then
        Exit;
    pts.op2 := ConsumeUntilComma(s, i);
    Trim(pts.op2);
    ConsumeWhitespaceAndCommas(s, i);
    if i > Length(s) then
        Exit;
    pts.op3 := ConsumeUntilComma(s, i);
    Trim(pts.op3);
end;

procedure AddPublic(const s: String);
var i: Byte;
    n: String[32];
begin
    i := 1;
    repeat
        n := ConsumeUntilComma(s, i);
        if Length(n) > 0 then begin
            DefineSymbol(currentObj^.public_symbols, ToUpCase(Trim(n)), 0);
            ConsumeWhitespaceAndCommas(s, i);
        end;
    until Length(n) = 0;
end;

procedure DefineData(const s: String; width: DataDefType);
var i, j: Byte;
    n: Word;
    def: String[32];
begin
    i := 1;
    repeat
        def := ConsumeUntilComma(s, i);
        if Length(def) > 0 then begin
            Trim(def);
            if def[1] = '''' then
                case width of
                    dataDB: begin
                        for j := 2 to Length(def) do begin
                            if def[j] = '''' then
                                Break;
                            currentObj^.buffer.AddByte(Byte(def[j]));
                        end;
                        if j < Length(def) then
                            DieBadOperand('Trailing garbage after terminal '' in string');
                    end;
                    DataDW: begin
                        if Length(def) = 3 then
                            n := Word(def[2])
                        else if Length(def) = 4 then
                            n := (Word(def[2]) shl 8) or Word(def[3])
                        else
                            DieBadOperand('Malformed string in DW definition');
                        currentObj^.buffer.AddWord(n);
                    end;
                    DataDD: DieBadOperand('Cannot use strings in DD definition');
                end
            else begin
                if not ParseNumericExpression(def, n) then
                    DieBadOperand(def);
                case width of
                    DataDB: currentObj^.buffer.AddByte(Lo(n));
                    DataDW: currentObj^.buffer.AddWord(n);
                    DataDD: begin
                        { Uhhh punting here as I don't really understand what
                          values are allowed in a DD. This only allows 16-bit
                          values and pads them to 32-bit. }
                        currentObj^.buffer.AddWord(n);
                        currentObj^.buffer.AddWord(0);
                    end;
                end;
            end;
            Inc(i); { skip comma }
            ConsumeWhitespace(s, i);
        end;
    until Length(def) = 0;
end;

procedure FindInstruction(const pts: Parts; var m: Mnemonic; var op: OpArray; var ie: PInstructionEncoding);
var i, lo_enc, hi_enc: Integer;
begin
    m := ParseMnemonic(pts.mnemonic);
    if m = mInvalid then begin
        ErrorAtLine('Invalid mnemonic: ' + pts.mnemonic);
        Halt(1);
    end;
    GetMnemonicEncodingBounds(m, lo_enc, hi_enc);
    if lo_enc = High(IEnc) then begin
        ErrorAtLine('Internal error: mnemonic not found');
        Halt(128);
    end;
    { iterate over possible encodings }
    for i := lo_enc to hi_enc do begin
        op[1].Kind := OpAbsent;
        op[2].Kind := OpAbsent;
        op[3].Kind := OpAbsent;
        ie := @IEnc[i];
        { Writeln('  ie ', i, ': ', m, ' ', ie^.o1, ' ', ie^.o2, ' ', ie^.o3); }
        if (ie^.o1 <> OpAbsent) and not ParseOperand(pts.op1, op[1], ie^.o1) then
            Continue;
        if (ie^.o2 <> OpAbsent) and not ParseOperand(pts.op2, op[2], ie^.o2) then
            Continue;
        if (ie^.o3 <> OpAbsent) and not ParseOperand(pts.op3, op[3], ie^.o3) then
            Continue;
        Exit;
    end;
    { We didn't find anything }
    ErrorAtLine('Invalid operands for ' + MnemonicStr(m));
    Writeln('Possible valid options:');
    for i := lo_enc to hi_enc do begin
        Writeln('    ', MnemonicStr(m), ' ', ShortOperandStr(IEnc[i].o1),
                                        ' ', ShortOperandStr(IEnc[i].o2),
                                        ' ', ShortOperandStr(IEnc[i].o3));
    end;
    Halt(1);
end;

procedure CompileInstruction(const pts: Parts);
var i: Integer;
    instr: array[0..5] of Byte;
    instr_len: Byte;
    m: Mnemonic;
    op: OpArray;
    ie: PInstructionEncoding;
    rbp: Byte;
    disp: Integer;
begin
    instr_len := 2;
    if pts.mnemonic = '' then
        Exit;
    FindInstruction(pts, m, op, ie);
    instr[1] := Byte(ie^.op);
    instr[0] := Byte(ie^.w);
    if ie^.wb <> wLocation then
        instr[0] := instr[0] OR (Byte(ie^.wb) SHL 3);
    if ((m = mMOV) or (m = mMOVB)) AND (ie^.o1 = OpMemory) AND (ie^.o2 = OpMemory) then begin
        { mem-mem MOV is a special case that emits two instructions back-to-back. Note that the
          source (the second operand) is first. }
        instr[1] := instr[1] OR Byte(op[2].mm);
        instr[0] := instr[0] OR (Byte(op[2].aa) SHL 1);
        if op[2].aa = BaseOffset then begin
            instr[2] := Lo(op[2].offset);
            Inc(instr_len);
        end;
        currentObj^.buffer.Add(@instr, instr_len);
        instr_len := 2;
        instr[1] := $CC OR Byte(op[1].mm);
        instr[0] := Byte(ie^.w) OR (Byte(op[1].aa) SHL 1);
        if op[1].aa = BaseOffset then begin
            instr[2] := Lo(op[1].offset);
            Inc(instr_len);
        end;
        currentObj^.buffer.Add(@instr, instr_len);
        Exit;
    end
    else begin
        for i := 1 to 3 do begin
            case op[i].Kind of
                OpMemory: begin
                    instr[1] := instr[1] OR Byte(op[i].mm);
                    instr[0] := instr[0] OR (Byte(op[i].aa) SHL 1);
                    if op[i].aa = BaseOffset then begin
                        instr[2] := Lo(op[i].offset);
                        Inc(instr_len);
                    end;
                end;
                OpRegister:
                    rbp := Byte(op[i].reg);
                OpPointer:
                    rbp := Byte(op[i].preg);
                OpImmediate: begin
                    case ie^.wb of
                        wOne: begin
                            if op[i].immed > 255 then
                                WarnAtLine('byte immediate is > 255');
                            instr[instr_len] := Lo(op[i].immed);
                            Inc(instr_len);
                        end;
                        wTwo: begin
                            instr[instr_len] := Lo(op[i].immed);
                            Inc(instr_len);
                            instr[instr_len] := Hi(op[i].immed);
                            Inc(instr_len);
                        end;
                    end;
                end;
                OpLocation: begin
                    { The displacement is added to TP at the time of execution, which annoyingly
                      is already advanced to the instruction after the jump. So where we calculate
                      from depends on whether it's a one-byte or two-byte displacement, and we
                      don't know whether we need a two-byte displacement until we calculate the
                      displacement. :|

                      I *think* this errs on the side of making a two-byte displacement for a case
                      that would only need one, and not the other way around. But the proof will be
                      in the testing. }
                    disp := Integer(op[i].loc) - Integer(currentObj^.buffer.data_len + instr_len + 1);
                    if (ie^.wb = wTwo) OR (disp > 128) OR (disp < -127) then begin
                        Dec(disp); { displacement origin moves one byte forward with two-byte displacement }
                        instr[instr_len] := Lo(Word(disp));
                        Inc(instr_len);
                        instr[instr_len] := Hi(Word(disp));
                        Inc(instr_len);
                        { Not totally sure if it's necessary to also adjust W, but here it is }
                        instr[0] := instr[0] OR (Byte(wTwo) SHL 3) OR 1;
                    end
                    else begin
                        instr[instr_len] := Byte(Shortint(disp));
                        Inc(instr_len);
                        instr[0] := instr[0] OR (Byte(wOne) SHL 3);
                    end;
                end;
                OpBit:
                    rbp := Byte(op[i].bit);
                OpWidth:
                    if op[i].width = widByte then
                        rbp := 0
                    else
                        rbp := 1;
            end;
        end;
    end;
    case ie^.rbp of
        rbpReg, rbpBit, rbpPtrReg: instr[0] := instr[0] OR (rbp SHL 5);
        { rbpNone does nothing }
        rbpHlt: instr[0] := instr[0] OR $20;
        rbpSintr: instr[0] := instr[0] OR $40;
        rbpJmp: instr[0] := instr[0] OR $80;
        rbpWid: instr[0] := instr[0] OR (1 SHL 7)
                                     OR (Byte(op[1].width) SHL 6)
                                     OR (Byte(op[2].width) SHL 5);
        rbpXfer: instr[0] := instr[0] OR $60;
    end;
    currentObj^.buffer.Add(@instr, instr_len);
    { Write('mnemonic: ', m, '; op1: ');
    DumpOperand(op[1]);
    Write('; op2: ');
    DumpOperand(op[2]);
    Writeln(''); }
end;

procedure CompileDirective(const pts: Parts);
var op: Operand;
begin
    if pts.mnemonic = 'EQU' then begin
        if not ParseOperand(pts.args, op, OpImmediate) then
            DieBadOperand(pts.args);
        DefineSymbol(currentObj^.symbols, pts.label_or_name, op.immed);
    end
    else if pts.mnemonic = 'DB' then begin
        DefineData(pts.args, DataDB);
    end
    else if pts.mnemonic = 'DW' then begin
        DefineData(pts.args, DataDW);
    end
    else if pts.mnemonic = 'DD' then
    begin
        DefineData(pts.args, DataDD);
    end
    else if pts.mnemonic = 'DS' then begin
        if not ParseOperand(pts.args, op, OpImmediate) then
            DieBadOperand(pts.args);
        currentObj^.buffer.Increment(op.immed);
    end
    else if pts.mnemonic = 'NAME' then begin
        currentObj^.name := pts.op1;
    end
    else if pts.mnemonic = 'SEGMENT' then begin
        if currentObj^.segment_name = '' then
            currentObj^.segment_name := pts.label_or_name
        else
            DieBadOperand('Segment has already been defined');
    end
    else if pts.mnemonic = 'ENDS' then begin
        if pts.label_or_name <> currentObj^.segment_name then
            DieBadOperand('Segment end name is not the same as the beginning');
        { TODO: make sure no instructions appear outside of the segment bounds }
    end
    else if pts.mnemonic = 'PUBLIC' then begin
        AddPublic(pts.args);
    end
    else if pts.mnemonic = 'EXTRN' then begin
        ErrorAtLine('EXTRN not yet implemented');
        Halt(1);
    end
    else if pts.mnemonic = 'END' then begin
        { Not sure what to do with this }
    end;
end;

procedure Compile(const filename: string; var obj: CompiledObject);
var f: Text;
    pts: Parts;
begin
    currentObj := @obj;  { poor man's `this` pointer }
    obj.buffer.Init;
    Assign(f, filename);
    Reset(f);
    currentLine := 0;
    while not Eof(f) do begin
        Readln(f, linebuf);
        Inc(currentLine);
        { Writeln('*** ', linebuf); }
        StripComments(linebuf);
        if Length(linebuf) = 0 then { nothing left on the line }
            Continue;
        PartsClear(pts);
        PickParts(linebuf, pts);
        { PartsDump(pts); }

        if (pts.label_or_name <> '') and
            not ((pts.mnemonic = 'EQU') or (pts.mnemonic = 'STRUC') or
                 (pts.mnemonic = 'SEGMENT') or (pts.mnemonic = 'ENDS')) then
            DefineSymbol(obj.symbols, pts.label_or_name, obj.buffer.data_len);

        case pts.Kind of
            partsDirective: CompileDirective(pts);
            partsInstruction: CompileInstruction(pts);
        end;
    end;
    { DumpSymbols; }
end;

end.