// src/video/gen.rs
use cortex_m::asm;
use cortex_m::peripheral::NVIC;
use atsam3xa::{interrupt, TC0};
use embedded_hal::digital::OutputPin;
use atsam3xa_hal::prelude::*;
use atsam3xa_hal::pmc::PMC;
use atsam3xa_hal::pio::{self, PIOB};

use crate::video::mem::{ROWS, COLS, VIDEO_RAM};
use crate::video::font::FONT;

/// One phase of the video signal, as dispatched by `VideoGenerator::execute`.
// HalfLineTop and HalfLineBottom are only named in the commented-out
// second-field entries of OPERATIONS, hence the dead_code allowance.
#[allow(dead_code)]
enum VideoStage {
    /// Programs the equalizing pulse width and switches the timer to a
    /// half-line period.
    EqualizingHead,
    /// Programs the field-sync pulse width (period is left unchanged).
    FieldSync,
    /// Programs the equalizing pulse width (period is left unchanged).
    Equalizing,
    /// Programs the normal line-sync pulse width and switches the timer back
    /// to a full-line period.
    BlankLine,
    /// Renders one visible scanline and shifts it out on the luma pin.
    Line,
    /// Switches the timer to a half-line period without touching the pulse
    /// width.
    HalfLineTop,
    /// Switches the timer to a full-line period without touching the pulse
    /// width.
    HalfLineBottom,
}

/// One entry of the `OPERATIONS` schedule walked by the TC0 interrupt handler.
struct GenOp {
    /// Stage handed to `VideoGenerator::execute`.
    stage: VideoStage,
    /// Number of TC0 interrupts spent on this entry before the handler moves
    /// on to the next one.
    count: usize,
    /// `execute` is called only on interrupts whose index within this entry is
    /// a multiple of this value, so `call_every == count` means "first
    /// interrupt only" and `1` means "every interrupt".
    call_every: usize,
}

/// Schedule of signal stages, cycled endlessly by the TC0 interrupt handler.
///
/// Only the first field is active; the second (interlaced) field is kept
/// below as commented-out reference. As written, one pass is 18 half-line
/// periods followed by 11 + 242 full-line periods.
const OPERATIONS: [GenOp; 5] = [
    // EqualizingHead sets the TC period to half a line for the vblank period
    GenOp { stage: VideoStage::EqualizingHead, count: 6, call_every: 6 },
    GenOp { stage: VideoStage::FieldSync, count: 6, call_every: 6 },
    GenOp { stage: VideoStage::Equalizing, count: 6, call_every: 6 },
    // BlankLine sets the TC period back to a full line
    GenOp { stage: VideoStage::BlankLine, count: 11, call_every: 11 },
    // 242 line periods; the Line stage itself only draws the first
    // VRES * VSCALE of them and leaves the rest blank.
    GenOp { stage: VideoStage::Line, count: 242, call_every: 1 },
    /*
    // HalfLineTop sets TC period back to half a line before entering vblank on second field
    GenOp { stage: VideoStage::HalfLineTop, count: 1, call_every: 1 },
    GenOp { stage: VideoStage::Equalizing, count: 6, call_every: 6 },
    GenOp { stage: VideoStage::FieldSync, count: 6, call_every: 6 },
    GenOp { stage: VideoStage::Equalizing, count: 5, call_every: 5 },
    // HalfLineBottom is the last equalizing cycle, but sets the period back to a full line
    GenOp { stage: VideoStage::HalfLineBottom, count: 1, call_every: 1 },
    // Another set of blank lines (this also resets the cycle, but it doesn't matter)
    GenOp { stage: VideoStage::BlankLine, count: 11, call_every: 11 },
    // And 242 lines on the bottom field
    GenOp { stage: VideoStage::Line, count: 232, call_every: 1 },
    GenOp { stage: VideoStage::BlankLine, count: 9, call_every: 10 },
    */
];

/// Timer ticks per second. `start` selects timer clock 1, which its comment
/// describes as MCLK/2.
const TICKS_PER_SECOND: u32 = 42_000_000;
/// Frame rate the line timing is derived from.
const FRAMES_PER_SECOND: u32 = 30;
/// Total line periods in one frame (both fields).
const LINES_PER_FRAME: u32 = 525;

/// Converts a duration in microseconds to whole timer ticks.
///
/// The fractional part of the tick count is discarded (truncation, not
/// rounding).
const fn us_to_ticks(us: f64) -> u32 {
    let seconds = us * 1e-6;
    let ticks = TICKS_PER_SECOND as f64 * seconds;
    ticks as u32
}

// Periods, in timer ticks. Integer division truncates at every step.
const TICKS_PER_FRAME: u32 = TICKS_PER_SECOND / FRAMES_PER_SECOND;
const TICKS_PER_LINE: u32 = TICKS_PER_FRAME / LINES_PER_FRAME;
const TICKS_PER_HALF_LINE: u32 = TICKS_PER_LINE / 2;

// Sync pulse widths, in timer ticks.
/// Equalizing pulse width.
const PRE_EQ_PULSE: u32 = us_to_ticks(2.3);
/// Normal line-sync pulse width.
const SYNC_PULSE: u32 = us_to_ticks(4.7);
/// Field-sync pulse width: a half line minus one normal sync pulse.
const FIELD_SYNC_PULSE: u32 = TICKS_PER_HALF_LINE - SYNC_PULSE;
/// Ticks from the start of a line until pixels may be emitted.
const BLANK_TIME: u32 = us_to_ticks(9.2);

/// Extra delay after `BLANK_TIME` before the first pixel.
const LEFT_MARGIN: u32 = us_to_ticks(3.1);

/// Horizontal resolution in pixels.
const HRES: usize = 320;
/// Vertical resolution in pixel rows.
const VRES: usize = 240;
/// Scanlines emitted per pixel row.
const VSCALE: usize = 240 / VRES;

/// Owns the hardware used to produce the video signal.
struct VideoGenerator {
    /// Timer/counter block; channel 0's RA/RC registers set the sync pulse
    /// width and line period, and its counter paces the pixel output.
    tc: TC0,
    // luma isn't used directly because the pixel loop is assembly
    #[allow(dead_code)]
    luma: pio::Pin,
}

// Assembly routines defined in the `global_asm!` block in this file.
extern "C" {
    /// Shifts the 40-byte scanline starting at `mem` out on the luma pin, one
    /// bit per pixel, then drives the pin low.
    fn pixel_pusher(mem: *const u8);
    /// Renders one scanline of text into `linedata` from a row of character
    /// codes (`char_mem`) and a font pointer already offset to the wanted
    /// glyph row (`font`).
    ///
    /// NOTE: the routine stores 40 bytes through `linedata` even though the
    /// parameter is declared `*const u8`; callers must pass a pointer to
    /// writable memory.
    fn write_scanline(linedata: *const u8, char_mem: *const u8, font: *const u8);
}

// Hand-written Thumb-2 routines for the per-scanline hot path. The sequences
// below appear to be cycle-tuned (note the nop padding and the remark near the
// end of write_scanline), so treat the asm text as timing-critical and
// re-check the picture after any change.
//
// pixel_pusher(mem):
//   Shifts out 10 words (40 bytes = 320 pixels) starting at `mem`. Each
//   `ror r3, r3, 31` is a rotate-left by one, bringing the next
//   most-significant bit into bit 0 before the word is stored to the address
//   in r1, so pixels leave MSB first. The asm comments identify r1 as the
//   bit-band alias of PIOB ODSR bit 26 (presumably only bit 0 of the stored
//   value matters there -- confirm against the Cortex-M3 bit-band rules).
//   Eight nops pad every pixel except the last of each word, which is
//   followed by the loop overhead instead. The output is written low after
//   the final word.
//
// write_scanline(linedata, char_mem, font):
//   Walks byte offsets 36, 32, ..., 0. For each offset it loads four
//   character codes from `char_mem`, fetches one byte per character from
//   `font[code * 8]` (the caller pre-offsets `font` by the glyph row), and
//   packs them into a word with the lowest-addressed character in bits 31:24
//   (assuming a little-endian load), i.e. the byte pixel_pusher emits first.
//   The word is stored to `linedata` at the same offset. r4-r6 are saved and
//   restored around the loop.
//
// NOTE(review): both routines hard-code a 40-byte line; this presumably
// requires COLS == 40 and character rows of at least 40 bytes -- confirm
// against crate::video::mem.
global_asm!(r#"
    .type pixel_pusher,function
pixel_pusher:
    // ARGUMENTS
    // r0 = pointer to scanline data
    // WORK REGISTERS
    // r1 = PIOB.26 ODSR bit band address (0x43c20768)
    // r2 = end address
    // r3 = work byte
    mov r1, 0x0768
    movt r1, 0x43c2
    add r2, r0, #40
0:
    ldr r3, [r0], #4

.rept 31
    ror r3, r3, 31
    str r3, [r1]
    nop
    nop
    nop
    nop
    nop
    nop
    nop
    nop
.endr
    ror r3, r3, 31
    str r3, [r1]

    cmp r0, r2
    bne 0b

    // Set output to low at end of line
    mov r3, #0
    str r3, [r1]

    bx lr

    .type write_scanline,function
write_scanline:
    // ARGUMENTS
    // r0 = scanline data address
    // r1 = character memory address
    // r2 = font memory address (modulo pixel row)
    // WORK REGISTERS
    // r3 = byte counter
    // r4 = 4x character
    // r5 = output word
    // r6 = temporary char
    push {r4-r6}
    mov r3, 36
0:
    mov r5, 0
    ldr r4, [r1, r3]

    ubfx r6, r4, #0, #8
    ldrb r6, [r2, r6, lsl #3]
    orr r5, r5, r6, lsl #24

    ubfx r6, r4, #8, #8
    ldrb r6, [r2, r6, lsl #3]
    orr r5, r5, r6, lsl #16

    ubfx r6, r4, #16, #8
    ldrb r6, [r2, r6, lsl #3]
    orr r5, r5, r6, lsl #8

    ubfx r6, r4, #24, #8
    ldrb r6, [r2, r6, lsl #3]
    orr r5, r5, r6

    str r5, [r0, r3]
    cbz r3, 1f
    sub r3, r3, 4
    b 0b

1:
    // These nops stabilize the pixel timing somehow.  May require adjustment
    // if the above changes.
    nop
    nop
    pop {r4-r6}
    bx lr
"#);

impl VideoGenerator {
    fn new(tc: TC0, luma: pio::Pin) -> VideoGenerator {
        let mut c: u8 = 0;
        for y in 0..ROWS {
            for x in 0..COLS {
                unsafe { VIDEO_RAM.set_cell(x, y, c) };
                c = c + 1;
            }
        }

        VideoGenerator {
            tc,
            luma,
        }
    }

    fn execute(&mut self, stage: &VideoStage, count: usize) {
        match stage {
            VideoStage::EqualizingHead => {
                self.pre_eq();
                self.half_line_period();
            },
            VideoStage::Equalizing => {
                self.pre_eq();
            },
            VideoStage::FieldSync => {
                self.field_sync();
            },
            VideoStage::BlankLine => {
                self.line_sync();
                self.full_line_period();
            },
            VideoStage::Line => {
                if count >= VRES * VSCALE {
                    return;
                }

                let pixel_line = count / VSCALE;
                let char_row = pixel_line / 8;
                let char_line = pixel_line % 8;
                let mut linedata : [u8; COLS] = unsafe { core::mem::uninitialized() };

                unsafe {
                    let char_mem = VIDEO_RAM.row_address(char_row);
                    // We add the char_line offset here since it's constant for every character
                    let font_mem = (&FONT[0] as *const u8).offset(char_line as isize);
                    write_scanline(&linedata as *const u8, char_mem, font_mem);
                }

                /*
                let tf = self.tc.cv0.read().cv().bits();
                if count == 0 {
                    use crate::console::ConsoleOutput;
                    crate::util::print_dec(tf as u32);
                    crate::serial_console::get_global_console().write("\n");
                }
                */

                // wait for the sync pulse to finish
                while self.tc.cv0.read().cv().bits() < BLANK_TIME + LEFT_MARGIN {}

                unsafe {
                    pixel_pusher(&linedata as *const u8);
                }
            },
            VideoStage::HalfLineTop => {
                self.half_line_period();
            },
            VideoStage::HalfLineBottom => {
                self.full_line_period();
            }
        };
    }

    fn full_line_period(&self) {
        self.tc.rc0.write(|w| unsafe { w.bits(TICKS_PER_LINE) });
    }

    fn half_line_period(&self) {
        self.tc.rc0.write(|w| unsafe { w.bits(TICKS_PER_HALF_LINE) });
    }

    fn pre_eq(&self) {
        // PRE_EQ_PULSE cycles low, then high
        self.tc.ra0.write(|w| unsafe { w.bits(PRE_EQ_PULSE) });
    }

    fn field_sync(&self) {
        // FIELD_SYNC_PULSE cycles low, then high
        self.tc.ra0.write(|w| unsafe { w.bits(FIELD_SYNC_PULSE) });
    }

    fn line_sync(&self) {
        // SYNC_PULSE cycles low, then high
        self.tc.ra0.write(|w| unsafe { w.bits(SYNC_PULSE) });
    }
}

// Global generator instance shared between `start` (which stores it) and the
// TC0 interrupt handler (which unwraps it on every interrupt). It stays `None`
// until `start` has run.
static mut VIDEO_GENERATOR: Option<VideoGenerator> = None;

/// TC0 interrupt: runs once per timer period and steps through `OPERATIONS`.
#[interrupt]
fn TC0() {
    // Index of the current schedule entry, and of the interrupt within it.
    static mut OP: usize = 0;
    static mut COUNT: usize = 0;

    let generator = unsafe { VIDEO_GENERATOR.as_mut().unwrap() };
    // Reading the status register is what clears the interrupt; the value
    // itself is not needed.
    generator.tc.sr0.read().bits();

    let current = &OPERATIONS[*OP];
    let step = *COUNT;
    if step % current.call_every == 0 {
        generator.execute(&current.stage, step);
    }

    // Advance within the entry, or roll over to the next entry (wrapping back
    // to the first after the last).
    let next_step = step + 1;
    if next_step == current.count {
        *COUNT = 0;
        *OP = (*OP + 1) % OPERATIONS.len();
    } else {
        *COUNT = next_step;
    }
}

/// Configures TC0 channel 0 and the output pins, then starts video generation.
///
/// * `tc` - timer/counter block; ownership moves into the global generator.
/// * `nvic` - used to set the TC0 interrupt priority and unmask it.
/// * `pmc` - used to enable the TC0 peripheral clock.
/// * `pio_b` - port providing pin 25 (sync, TIOA0) and pin 26 (luma).
///
/// The TC0 interrupt is unmasked only after the global generator has been
/// stored, because the handler unconditionally unwraps it.
pub fn start(tc: TC0, nvic: &mut NVIC, pmc: &PMC, pio_b: &PIOB) {
    // Enable peripheral clock
    pmc.enable_peripheral_clock(PeripheralID::TC0);

    // Set NVIC priority for TC0 interrupt
    unsafe { nvic.set_priority(atsam3xa::Interrupt::TC0, 0) };

    // Configure SYNC output pin on TIOA0
    let tioa0 = pio_b.get_pin(25);
    tioa0.peripheral_mode(PeripheralMultiplex::B);

    // set wave mode, up RC mode, and use timer clock 1 (MCLK/2).
    // TIOA is set on RA compare and cleared on RC compare, so each period
    // starts low for RA ticks and is high for the remainder.
    unsafe {
        tc.cmr0.cmr0_wave_eq_1.write(|w| {
            w.wave().set_bit()
             .wavsel().up_rc()
             .acpa().set()
             .acpc().clear()
             .tcclks().timer_clock1()
        });
    }
    // set RC value (wait one line period before starting for no particular reason)
    tc.rc0.write(|w| unsafe { w.rc().bits(TICKS_PER_LINE) });
    // enable interrupt on RC compare
    tc.ier0.write(|w| w.cpcs().set_bit());

    // Get the digital pin for B/W output
    let mut luma = pio_b.get_pin(26);
    luma.output_mode();
    luma.set_synchronous_mode(true);
    luma.set_low();

    unsafe {
        // Store the global VideoGenerator object
        VIDEO_GENERATOR = Some(VideoGenerator::new(tc, luma));
    }

    // Make sure the store above has completed before the handler can run.
    asm::dsb();

    // Enable TC0 interrupt. Done only now that VIDEO_GENERATOR is populated,
    // so the handler can never observe `None`.
    nvic.enable(atsam3xa::Interrupt::TC0);

    // enable TC0 and start!
    unsafe {
        VIDEO_GENERATOR.as_mut().unwrap().tc.ccr0.write(|w| {
            w.clken().set_bit()
             .swtrg().set_bit()
        });
    }
}