#include "palmcardProto.h"
#include "palmcardComms.h"
#include "machSpecific.h"
#include "printf.h"
#include "pinout.h"
#include "cpu.h"

////PIO config
	#define SIDE_SET_HAS_ENABLE_BIT 	0
	#define SIDE_SET_NUM_BITS			0
	#define DEFINE_PIO_INSTRS
	#include "pioAsm.h"
////PIO config


//One DMA "control block" as used by the greyscale screen chain.
//palmcardCommsPrvStartScreenDma() has DMA ch3 copy these, four words at a
//time, into ch2's registers starting at al1_ctrl, so the field order and the
//total size (4 x 32 bits) must not be changed.
struct DmaConfig {
	uint32_t ctrl;		//value for the channel's control register
	uint32_t read;		//source address
	uint32_t write;		//destination address
	uint32_t xferCt;	//number of transfers
};


//buffer that DMA ch1 fills with the words received from the remote (see palmcardCommsPrvQueueTx)
static volatile uint16_t mRXedData[66];
//current framebuffer and its size. The pointer variable itself is read by DMA in 1bpp/2bpp mode
static const void *mScreenData;
static uint32_t mScreenDataNumBytes;
static uint8_t mMySm, mMyPc;	//on pio1
static uint8_t mScreenSendLoc1, mScreenSendLoc2, mScreenSendJmpDest;	//for allowing changing the delay
//set by the IRQ handler once the remote has asked for the main code download
static volatile bool mRemoteBooted = false;
//last hardware flags reported by the remote (PCC_REQ_DEVICE_HWR_FLAGS)
static volatile uint16_t mHwrFlags = PCC_HW_INVALID_RESERVED_BIT;

//both of these live inside the mIrqSta reply packet below, so that a PCC_REQ_IRQ_STA reply always carries the current values
#define mDesiredState	(*(volatile uint16_t*)&mIrqSta[3])
#define mIrqBits		(*(volatile uint16_t*)&mIrqSta[2])

//reply packets start with 0xFACE followed by the number of payload words (compare with mRemoteMainCode below)
#ifdef PALMCARD_SUPPORT_SAMPLED_AUDIO
	static uint16_t mAudioData[258] = {0xface, 256, };	//512 samples
#else
	static uint16_t mSimpleAudioReq[] = {0xface, 2, 0, 0};	//payload: freq, amp (see palmcardCommsSendSimpleSoundReq)
#endif

static uint16_t mIrqSta[] = {0xFACE, 2, 0, 0};	//payload: irq bits, desired state (see macros above)
static uint16_t mAck[] = {0xFACE, 0x0000};		//empty reply


//exists only so that sizeof() can give us the number of words in "remoteMain.inc" for the header of mRemoteMainCode below
static const uint16_t __attribute__((aligned(4))) mRemoteMainCodeForSize[] = {
	#include "remoteMain.inc"
};

//the main code image as sent to the remote: 0xFACE, payload length in words, then the payload itself
static const uint16_t __attribute__((aligned(4))) mRemoteMainCode[] = {		//all of these are in ROM which means we need the remote to boot before ROMRAM is used, as this will be DMA-ed from...
	
	0xFACE,
	sizeof(mRemoteMainCodeForSize) / sizeof(uint16_t),
	#include "remoteMain.inc"
};

//the bootstrap image, queued as-is (no header) by palmcardCommsInit()
static const uint16_t __attribute__((aligned(4))) mRemoteBootstrapCode[] = {
	
	#include "remoteBootstrap.inc"
};

//Empty the FIFOs of state machine "index" of PIO "unit" by flipping its FIFO
//join configuration (RX-joined, then TX-joined) and then restoring the
//original SHIFTCTRL value. Callers rely on this to flush all buffers.
static void palmcardPrvPioClrBuffers(pio_hw_t *unit, uint_fast8_t index)
{
	const uint32_t savedShiftCtrl = unit->sm[index].shiftctrl;
	uint32_t val;
	
	//join as RX-only
	val = unit->sm[index].shiftctrl;
	val &=~ PIO_SM0_SHIFTCTRL_FJOIN_TX_BITS;
	val |= PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
	unit->sm[index].shiftctrl = val;
	
	//join as TX-only
	val = unit->sm[index].shiftctrl;
	val &=~ PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
	val |= PIO_SM0_SHIFTCTRL_FJOIN_TX_BITS;
	unit->sm[index].shiftctrl = val;
	
	//back to how it was
	unit->sm[index].shiftctrl = savedShiftCtrl;
}

//(Re)configure our state machine on PIO1 (mMySm, program at mMyPc) for screen data preprocessing.
// bpp == 4:  loads a 3-instruction program that pulls pixels from the TX fifo 4 bits at a time
//            (2 bits into X, 2 bits into Y) and shifts 2 of them into the ISR, auto-pushing every
//            16 bits. The third instruction (at mMyPc + 2) gets rewritten by DMA at runtime to
//            select X or Y (see palmcardCommsPrvStartScreenDma).
// other bpp: the state machine is stopped, restarted, flushed, and left disabled.
static void palmcardScreenProcessingPioSetup(uint8_t bpp)
{
	uint_fast8_t pc = mMyPc, startPC, wrapToPC, wrapFromPC;
	
	//stop and reset our SM. Both accesses must be read-modify-write: PIO1 is shared, and a plain
	//write of the restart bit would also write zeroes to the SM_ENABLE bits of every other SM
	pio1_hw->ctrl &=~ ((1 << PIO_CTRL_SM_ENABLE_LSB) << mMySm);
	pio1_hw->ctrl |= (1 << PIO_CTRL_SM_RESTART_LSB) << mMySm;
	
	//this definitely flushes all buffers
	palmcardPrvPioClrBuffers(pio1_hw, mMySm);
	
	if (bpp == 4) {
		
		//osr shifts left
		//isr shifts left
		startPC = wrapToPC = pc;
		pio1_hw->instr_mem[pc++] = I_OUT(0, 0, OUT_DST_X, 2);		//hi2
		pio1_hw->instr_mem[pc++] = I_OUT(0, 0, OUT_DST_Y, 2);		//lo2
		pio1_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_X, 2);			//gets replaced with in from Y sometimes
		wrapFromPC = pc - 1;
		
		//full speed clock, wrap around our 3 instructions, autopull (threshold field zeroed), autopush at 16 bits, no pins
		pio1_hw->sm[mMySm].clkdiv = (pio1_hw->sm[mMySm].clkdiv &~ (PIO_SM0_CLKDIV_FRAC_BITS | PIO_SM0_CLKDIV_INT_BITS)) | (1 << PIO_SM0_CLKDIV_INT_LSB);
		pio1_hw->sm[mMySm].execctrl = (pio1_hw->sm[mMySm].execctrl &~ (PIO_SM0_EXECCTRL_WRAP_TOP_BITS | PIO_SM0_EXECCTRL_WRAP_BOTTOM_BITS | PIO_SM2_EXECCTRL_SIDE_EN_BITS | PIO_SM0_EXECCTRL_JMP_PIN_BITS)) | (wrapFromPC << PIO_SM0_EXECCTRL_WRAP_TOP_LSB) | (wrapToPC << PIO_SM0_EXECCTRL_WRAP_BOTTOM_LSB) | (SIDE_SET_HAS_ENABLE_BIT ? PIO_SM2_EXECCTRL_SIDE_EN_BITS : 0);
		pio1_hw->sm[mMySm].shiftctrl = (pio1_hw->sm[mMySm].shiftctrl &~ (PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS | PIO_SM0_SHIFTCTRL_FJOIN_TX_BITS | PIO_SM0_SHIFTCTRL_PULL_THRESH_BITS | PIO_SM0_SHIFTCTRL_PUSH_THRESH_BITS | PIO_SM0_SHIFTCTRL_OUT_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_IN_SHIFTDIR_BITS)) | PIO_SM0_SHIFTCTRL_AUTOPULL_BITS | PIO_SM0_SHIFTCTRL_AUTOPUSH_BITS | (16 << PIO_SM0_SHIFTCTRL_PUSH_THRESH_LSB);
		pio1_hw->sm[mMySm].pinctrl = 0;
		
		pio1_hw->sm[mMySm].instr = I_JMP(0, 0, JMP_ALWAYS, startPC);	//start at the start
		pio1_hw->ctrl |= ((0x01 << PIO_CTRL_SM_ENABLE_LSB) << mMySm);
	}
}

//Patch the two "jump to send_word" slots of the SM3 program on PIO0 with the
//desired delay: 7 cycles when fastClock is set, 31 otherwise. The slot
//addresses and the jump target were recorded by palmcardCommsPrvSetup().
static void palmcardCommsPrvSetProperScreenDelay(bool fastClock)
{
	uint_fast8_t delayCycles;
	
	if (fastClock)
		delayCycles = 7;
	else
		delayCycles = 31;
	
	//both slots get the very same instruction
	pio0_hw->instr_mem[mScreenSendLoc1] = I_JMP(delayCycles, 0, JMP_ALWAYS, mScreenSendJmpDest);
	pio0_hw->instr_mem[mScreenSendLoc2] = I_JMP(delayCycles, 0, JMP_ALWAYS, mScreenSendJmpDest);
}

//One-time setup of the bus interface: writes the programs for all four PIO0 state machines
//into instruction memory (SM3's program first, as it must sit at address 0; then the code
//shared by SM0 and SM2; then SM1's), configures and starts the state machines, enables the
//"SM2 RX fifo not empty" interrupt source, and pre-configures DMA ch0 (memory -> SM1 TX fifo)
//and ch1 (SM0 RX fifo -> memory). The DMA channels are not started here, see palmcardCommsPrvQueueTx()
static void palmcardCommsPrvSetup(void)
{
	uint_fast8_t startPC0, restartPC0, endPC0, startPC1, restartPC1, endPC1, startPC3, restartPC3, endPC3, pc = 0, jmpDest;
	

	//stop SMs
	pio0_hw->ctrl &=~ (0x0f << PIO_CTRL_SM_ENABLE_LSB);
	
	//reset SMs
	pio0_hw->ctrl = (0x0f << PIO_CTRL_SM_RESTART_LSB);

		
	/*		ISR shifts left, jump pin is CSA2, IN uses one pin which is A1, out uses all DQ pins, register Y needs to be prepopulated with the value "1"
			due to how the bus works, some delay after seeing A1 change is advisable (since CSA might go up). delays seen in real captures are 30ns
			no wraparound is needed for this SM
			
			//we encode "was" value in reverse to what it was, thus "1" represents the value was 0, and "0" represents it was 1
			//this shortens the code for FIRST entry by one instr 
	
		//code prefixed with numbers here shows its absolute address that MUST be carefully maintained
	
	0:	start_with_0:
	0:	was_1_is_0:
			mov Y -> ISR (set ISR to 1)
			goto send_word, delay 3 (see note above about changes right before CSA goes up)
		
	2:	was_1_is_1:
			mov NULL -> ISR (set ISR to 0)
			goto wait_more, no delay since no pin change was noted
	
	4:	was_0_is_0:
			mov Y -> ISR (set ISR to 1)
			goto wait_more, no delay since no pin change was noted
		
	6:	was_0_is_1:
			mov NULL -> ISR (set ISR to 0)
			goto send_word, delay 3 (see note above about changes right before CSA goes up)
		
	8:	start_with_1:
			jump	was_0_is_1
		
		send_word:
			if CSA is high, goto irq_wait
			pull nonblocking
			out
		
		wait_more:	//yes we do not check CSA here, this is ok, even if it went up, eventually either it'll go back down or A1 will change and we'll check it above
			in 1 bit from pins
			in 1 bit from zero
			move ISR -> PC
		
		irq_wait:
			wait for irq
			mov 0 -> ISR (set ISR to 0)
			in 1 from pins
			in 3 from zero
			move ISR -> PC		//jump to 8 if pin is high, 0 if low	this instr does not exist - we use wraparound to wrap to the same instr above to save a code location
	
	
	*/
	
	//SM3 needs to be at address 0 since we use "MOV to PC" instr to dispatch
	uint_fast8_t lblWas0is1, jmpWaitMore1, jmpWaitMore2, jmpSendWord1, jmpSendWord2, jmpToIrqWait;
	
	if (pc != 0)
		fatal("SM3 not starting at 0\n");
	
	//was 1 is 0 case, also "enter with 0" case
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_Y);
	jmpSendWord1 = pc++;	//slot reserved here, filled in by palmcardCommsPrvSetProperScreenDelay()

	//was 1 is 1 case
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_ZEROES);
	jmpWaitMore1 = pc++;	//slot reserved here, filled in below once "waitMore" address is known
	
	//was 0 is 0 case
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_Y);
	jmpWaitMore2 = pc++;	//slot reserved here, filled in below once "waitMore" address is known
	
	//was 0 is 1 case
	lblWas0is1 = pc;
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_ZEROES);
	jmpSendWord2 = pc++;	//slot reserved here, filled in by palmcardCommsPrvSetProperScreenDelay()
	
	//startWith1:
	pio0_hw->instr_mem[pc++] = I_JMP(0, 0, JMP_ALWAYS, lblWas0is1);
	
	//sendWord label is where PC is now. fill in the jumps
	mScreenSendLoc1 = jmpSendWord1;
	mScreenSendLoc2 = jmpSendWord2;
	mScreenSendJmpDest = pc;
	
	palmcardCommsPrvSetProperScreenDelay(true);
	
	//now the code at sendWord:
	jmpToIrqWait = pc++;	//slot reserved here, filled in below once "irqWait" address is known
	pio0_hw->instr_mem[pc++] = I_PULL(0, 0, 0, 0);				//nonblocking pull
	pio0_hw->instr_mem[pc++] = I_OUT(3, 0, OUT_DST_PINS, 16);
	
	//waitMore label is where PC is now. fill in the jumps
	pio0_hw->instr_mem[jmpWaitMore1] = I_JMP(0, 0, JMP_ALWAYS, pc);
	pio0_hw->instr_mem[jmpWaitMore2] = I_JMP(0, 0, JMP_ALWAYS, pc);
	
	//now the code at waitMore:
	pio0_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_PINS, 1);
	pio0_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_ZEROES, 1);
	restartPC3 = pc;	//we use wraparound to jump here from the end of "irqWait" to save an instr
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_PC, MOV_OP_COPY, IN_SRC_ISR);
	
	//irqWait label is where PC is now. fill in the jump
	pio0_hw->instr_mem[jmpToIrqWait] = I_JMP(0, 0, JMP_PIN, pc);
	
	//now the code at irqWait:
	startPC3 = pc;
	pio0_hw->instr_mem[pc++] = I_WAIT(0, 0, 1, WAIT_FOR_IRQ, 6);
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_ZEROES);
	pio0_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_PINS, 1);
	pio0_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_ZEROES, 3);
	endPC3 = pc - 1;		//wrap to "move ISR -> PC" above
	
	//SM0 & 2: (need OSR set to zero on entry)
	// wait for CSA0 low
	// check OE
	// if clear: set pins as OUT, send irq, wait for CS over, reset pins as IN
	// if set: read pins, wait for CS over
	// loop
	//for code size reasons, initial entry is in the middle, restart location is also elsewhere in the middle
	//needs auto-push at 16 for 68k's data to be DMA-ed out from the SM to our RAM. "PIN" needs to be set to nOE pin
	// SM2 also provides a word on input, but it is not really any use - its existence signals an IRQ.
	//why? so we can use "wait for pin"  which waits for Nth pin mod 32 from the input pin mapping
	//by giving SM2 a different "IN" mapping, we can use the same code as SM0 for it!
	jmpDest = pc;		//code for CSA low, nOE high (Write)
	pio0_hw->instr_mem[pc++] = I_MOV(7, 0, MOV_DST_X, MOV_OP_COPY, MOV_SRC_X);	//NOP, necessary delay before data is valid
	pio0_hw->instr_mem[pc++] = I_IN(0, 0, IN_SRC_PINS, 16);
	restartPC0 = pc;
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_OSR, MOV_OP_COPY, MOV_SRC_ZEROES);
	pio0_hw->instr_mem[pc++] = I_WAIT(16, 0, 1, WAIT_FOR_PIN, PIN_CSA0 - PIN_DQ0);	//wait for pin uses input pin mapping. for SM0 DQ0 is base this delay is VERY needed. else we see a tiny glitch on nCS0 when nCS2 goes down and lose a word trying to output into the void
	pio0_hw->instr_mem[pc++] = I_OUT(0, 0, OUT_DST_PINDIRS, 16);
	startPC0 = pc;
	pio0_hw->instr_mem[pc++] = I_MOV(0, 0, MOV_DST_OSR, MOV_OP_INVERT, MOV_SRC_ZEROES);
	pio0_hw->instr_mem[pc++] = I_WAIT(0, 0, 0, WAIT_FOR_PIN, PIN_CSA0 - PIN_DQ0);
	pio0_hw->instr_mem[pc++] = I_JMP(0, 0, JMP_PIN, jmpDest);
	pio0_hw->instr_mem[pc++] = I_IRQ(0, 0, 0, 1, 0x14);		//irq4 raised by SM0, irq6 raised by SM2
	pio0_hw->instr_mem[pc++] = I_OUT(0, 0, OUT_DST_PINDIRS, 16);
	endPC0 = pc - 1;
	
	//SM1: wait for irq, set data out. X register must be pre-populated by zero
	startPC1 = restartPC1 = pc;
	pio0_hw->instr_mem[pc++] = I_PULL(0, 0, 0, 0);		//nonblocking pull
	pio0_hw->instr_mem[pc++] = I_WAIT(0, 0, 1, WAIT_FOR_IRQ, 4);
	pio0_hw->instr_mem[pc++] = I_OUT(0, 0, OUT_DST_PINS, 16);
	endPC1 = pc - 1;
	
	logi("total instr spaces used: %d\n", pc);
	
	
	//configure sm0
	pio0_hw->sm[0].clkdiv = (1 << PIO_SM0_CLKDIV_INT_LSB);	//full speed
	pio0_hw->sm[0].execctrl = (pio0_hw->sm[0].execctrl &~ (PIO_SM0_EXECCTRL_WRAP_TOP_BITS | PIO_SM0_EXECCTRL_WRAP_BOTTOM_BITS | PIO_SM2_EXECCTRL_SIDE_EN_BITS | PIO_SM2_EXECCTRL_JMP_PIN_BITS)) | (PIN_nOE << PIO_SM2_EXECCTRL_JMP_PIN_LSB) | (endPC0 << PIO_SM0_EXECCTRL_WRAP_TOP_LSB) | (restartPC0 << PIO_SM0_EXECCTRL_WRAP_BOTTOM_LSB) | (SIDE_SET_HAS_ENABLE_BIT ? PIO_SM2_EXECCTRL_SIDE_EN_BITS : 0);
	pio0_hw->sm[0].shiftctrl = (pio0_hw->sm[0].shiftctrl &~ (PIO_SM1_SHIFTCTRL_PULL_THRESH_BITS | PIO_SM1_SHIFTCTRL_PUSH_THRESH_BITS | PIO_SM0_SHIFTCTRL_IN_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_OUT_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_AUTOPULL_BITS)) | PIO_SM0_SHIFTCTRL_AUTOPUSH_BITS | (16 << PIO_SM1_SHIFTCTRL_PUSH_THRESH_LSB) | PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
	pio0_hw->sm[0].pinctrl = (SIDE_SET_BITS_USED << PIO_SM1_PINCTRL_SIDESET_COUNT_LSB) | (16 << PIO_SM1_PINCTRL_OUT_COUNT_LSB)| (PIN_DQ0 << PIO_SM1_PINCTRL_OUT_BASE_LSB) | (PIN_DQ0 << PIO_SM2_PINCTRL_IN_BASE_LSB);
	
	//configure sm1
	pio0_hw->sm[1].clkdiv = (1 << PIO_SM0_CLKDIV_INT_LSB);	//full speed
	pio0_hw->sm[1].execctrl = (pio0_hw->sm[1].execctrl &~ (PIO_SM0_EXECCTRL_WRAP_TOP_BITS | PIO_SM0_EXECCTRL_WRAP_BOTTOM_BITS | PIO_SM2_EXECCTRL_SIDE_EN_BITS | PIO_SM2_EXECCTRL_JMP_PIN_BITS)) | (endPC1 << PIO_SM0_EXECCTRL_WRAP_TOP_LSB) | (restartPC1 << PIO_SM0_EXECCTRL_WRAP_BOTTOM_LSB) | (SIDE_SET_HAS_ENABLE_BIT ? PIO_SM2_EXECCTRL_SIDE_EN_BITS : 0);
	pio0_hw->sm[1].shiftctrl = (pio0_hw->sm[1].shiftctrl &~ (PIO_SM1_SHIFTCTRL_PULL_THRESH_BITS | PIO_SM1_SHIFTCTRL_PUSH_THRESH_BITS | PIO_SM0_SHIFTCTRL_IN_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_OUT_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_AUTOPULL_BITS | PIO_SM0_SHIFTCTRL_AUTOPUSH_BITS)) | PIO_SM0_SHIFTCTRL_FJOIN_TX_BITS;
	pio0_hw->sm[1].pinctrl = (SIDE_SET_BITS_USED << PIO_SM1_PINCTRL_SIDESET_COUNT_LSB) | (16 << PIO_SM1_PINCTRL_OUT_COUNT_LSB)| (PIN_DQ0 << PIO_SM1_PINCTRL_OUT_BASE_LSB);
	
	//configure sm2 (same as SM0 except a different IN mapping)
	pio0_hw->sm[2].clkdiv = (1 << PIO_SM0_CLKDIV_INT_LSB);	//full speed
	pio0_hw->sm[2].execctrl = (pio0_hw->sm[2].execctrl &~ (PIO_SM0_EXECCTRL_WRAP_TOP_BITS | PIO_SM0_EXECCTRL_WRAP_BOTTOM_BITS | PIO_SM2_EXECCTRL_SIDE_EN_BITS | PIO_SM2_EXECCTRL_JMP_PIN_BITS)) | (PIN_nOE << PIO_SM2_EXECCTRL_JMP_PIN_LSB) | (endPC0 << PIO_SM0_EXECCTRL_WRAP_TOP_LSB) | (restartPC0 << PIO_SM0_EXECCTRL_WRAP_BOTTOM_LSB) | (SIDE_SET_HAS_ENABLE_BIT ? PIO_SM2_EXECCTRL_SIDE_EN_BITS : 0);
	pio0_hw->sm[2].shiftctrl = (pio0_hw->sm[2].shiftctrl &~ (PIO_SM1_SHIFTCTRL_PULL_THRESH_BITS | PIO_SM1_SHIFTCTRL_PUSH_THRESH_BITS | PIO_SM0_SHIFTCTRL_IN_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_OUT_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_AUTOPULL_BITS)) | PIO_SM0_SHIFTCTRL_AUTOPUSH_BITS | (16 << PIO_SM1_SHIFTCTRL_PUSH_THRESH_LSB) | PIO_SM0_SHIFTCTRL_FJOIN_RX_BITS;
	pio0_hw->sm[2].pinctrl = (SIDE_SET_BITS_USED << PIO_SM1_PINCTRL_SIDESET_COUNT_LSB) | (16 << PIO_SM1_PINCTRL_OUT_COUNT_LSB)| (PIN_DQ0 << PIO_SM1_PINCTRL_OUT_BASE_LSB) | ((PIN_DQ0 + PIN_CSA2 - PIN_CSA0) << PIO_SM2_PINCTRL_IN_BASE_LSB);
	
	//configure sm3
	pio0_hw->sm[3].clkdiv = (1 << PIO_SM0_CLKDIV_INT_LSB);	//full speed
	pio0_hw->sm[3].execctrl = (pio0_hw->sm[3].execctrl &~ (PIO_SM0_EXECCTRL_WRAP_TOP_BITS | PIO_SM0_EXECCTRL_WRAP_BOTTOM_BITS | PIO_SM2_EXECCTRL_SIDE_EN_BITS | PIO_SM2_EXECCTRL_JMP_PIN_BITS)) | (SIDE_SET_HAS_ENABLE_BIT ? PIO_SM2_EXECCTRL_SIDE_EN_BITS : 0) | (PIN_CSA2 << PIO_SM2_EXECCTRL_JMP_PIN_LSB) | (endPC3 << PIO_SM0_EXECCTRL_WRAP_TOP_LSB) | (restartPC3 << PIO_SM0_EXECCTRL_WRAP_BOTTOM_LSB);
	pio0_hw->sm[3].shiftctrl = (pio0_hw->sm[3].shiftctrl &~ (PIO_SM1_SHIFTCTRL_PULL_THRESH_BITS | PIO_SM1_SHIFTCTRL_PUSH_THRESH_BITS | PIO_SM0_SHIFTCTRL_IN_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_OUT_SHIFTDIR_BITS | PIO_SM0_SHIFTCTRL_AUTOPULL_BITS | PIO_SM0_SHIFTCTRL_AUTOPUSH_BITS));
	pio0_hw->sm[3].pinctrl = (SIDE_SET_BITS_USED << PIO_SM1_PINCTRL_SIDESET_COUNT_LSB) | (16 << PIO_SM1_PINCTRL_OUT_COUNT_LSB)| (PIN_DQ0 << PIO_SM1_PINCTRL_OUT_BASE_LSB) | (PIN_A1 << PIO_SM2_PINCTRL_IN_BASE_LSB);
	
	//set X to 0 as needed for SM1 & SM3 (nonblocking pull contents)
	pio0_hw->sm[1].instr = I_SET(0, 0, SET_DST_X, 0);
	pio0_hw->sm[3].instr = I_SET(0, 0, SET_DST_X, 0);
	
	//set Y to 1 as needed for SM3
	pio0_hw->sm[3].instr = I_SET(0, 0, SET_DST_Y, 1);
	
	//zero OSR for SM0 & 2, as needed
	//NOTE(review): the two instructions below write ISR, not OSR. The first instruction at startPC0 loads OSR itself,
	//so this may be intentional (or harmless) - confirm which register was meant
	pio0_hw->sm[0].instr = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_ZEROES);
	pio0_hw->sm[2].instr = I_MOV(0, 0, MOV_DST_ISR, MOV_OP_COPY, MOV_SRC_ZEROES);
	
	logi("starting SMs\n");
	//start SMs: force a jump to each one's entry point, then enable all four at once
	pio0_hw->sm[0].instr = I_JMP(0, 0, JMP_ALWAYS, startPC0);
	pio0_hw->sm[1].instr = I_JMP(0, 0, JMP_ALWAYS, startPC1);
	pio0_hw->sm[2].instr = I_JMP(0, 0, JMP_ALWAYS, startPC0);
	pio0_hw->sm[3].instr = I_JMP(0, 0, JMP_ALWAYS, startPC3);
	pio0_hw->ctrl |= (0x0f << PIO_CTRL_SM_ENABLE_LSB);
	
	//wire IRQ0 to PIO0.irq0
	pio0_hw->inte0 |= PIO_IRQ0_INTE_SM2_RXNEMPTY_BITS;
	
	//set up DMA
	//DMA ch0 feeds the reply words to PIO0.sm1 for TXing
	dma_hw->ch[0].write_addr = (uintptr_t)&pio0_hw->txf[1];
	dma_hw->ch[0].al1_ctrl = (DREQ_PIO0_TX1 << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (0 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_HALFWORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH0_CTRL_TRIG_INCR_READ_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
	
	//DMA ch1 gets data from PIO0.sm0 that was RXed
	dma_hw->ch[1].read_addr = (uintptr_t)&pio0_hw->rxf[0];
	dma_hw->ch[1].al1_ctrl = (DREQ_PIO0_RX0 << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (1 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_HALFWORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
}

//Stop the four screen DMA channels (2..5) and, if a framebuffer is set, restart them for the given depth.
// bpp == 4:      ch3 walks a 9-entry list of control blocks, copying each (4 words) into ch2's registers, which
//                triggers ch2. ch2 either streams the whole framebuffer into our PIO1 SM, or rewrites that SM's
//                third instruction to select which 2 of each pixel's 4 bits get passed on. ch4 moves the PIO1 SM's
//                output into PIO0.SM3's TX fifo, and ch5 exists only to re-trigger ch4.
// bpp == 1 or 2: ch4 streams the framebuffer directly into PIO0.SM3's TX fifo. ch5 reloads ch4's read address
//                from the "mScreenData" variable (and thus re-triggers it) after every frame.
// other bpp:     channels are left stopped.
static void palmcardCommsPrvStartScreenDma(uint8_t bpp)
{
	static uint32_t mGarbage;	//source and destination of ch5's dummy transfer in 4bpp mode
	uint_fast8_t i;
	
	//order matters here
	dma_hw->ch[3].al1_ctrl = 0;
	dma_hw->ch[5].al1_ctrl = 0;
	
	//abort all 4 of our screen DMAs safely
	dma_hw->abort = 0x0f << 2;
	
	for (i = 0; i < 4; i++) {
		
		//wait for the abort to complete and the channel to go idle, then disable it
		while (dma_hw->abort & (1 << (i + 2)));
		while (dma_hw->ch[i + 2].al1_ctrl & DMA_CH0_CTRL_TRIG_BUSY_BITS);
		dma_hw->ch[i + 2].al1_ctrl = 0;
	}
	
	if (!mScreenData || !mScreenDataNumBytes) {
		
		logi("FB NO data set\n");
		
		return;
	}
	
	if (bpp == 4) {
		
		//control blocks for ch2, in the order ch3 will load them. Entries 1..5 and 7 send the framebuffer,
		//entries 0 and 6 patch the PIO instruction, entry 8 restarts the list
		//presumably the 5:1 ratio of "hi2" to "lo2" passes is what approximates 16 grey levels - TODO confirm
		static volatile struct DmaConfig mGreyscaleChain[9];
		static volatile uint32_t mChainStartAddr = (uintptr_t)mGreyscaleChain;
		static const uint8_t mDataXferIdxs[] = {1, 2, 3, 4, 5, 7}, mInstrXferIdxs[] = {0, 6};
		static volatile uint32_t mInstrs[2];
		uint32_t nPixels;
		
		//despite the name, this is the number of 32-bit words in the framebuffer. The PIO1 SM produces one
		//16-bit output word per 32-bit input word, so it is also the number of halfwords ch4 needs to move
		nPixels = mScreenDataNumBytes / sizeof(uint32_t);
		
		mInstrs[0] = I_IN(0, 0, IN_SRC_X, 2);	//get hi2
		mInstrs[1] = I_IN(0, 0, IN_SRC_Y, 2);	//get lo2
		
		for (i = 0; i < sizeof(mDataXferIdxs) / sizeof(*mDataXferIdxs); i++){
			
			uint_fast8_t idx = mDataXferIdxs[i];
			
			//framebuffer -> PIO1 SM TX fifo, paced by that fifo's DREQ, chains to ch3 when done
			mGreyscaleChain[idx].ctrl = ((DREQ_PIO1_TX0 + mMySm) << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (3 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH0_CTRL_TRIG_INCR_READ_BITS | DMA_CH1_CTRL_TRIG_BSWAP_BITS | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
			mGreyscaleChain[idx].read = (uintptr_t)mScreenData;
			mGreyscaleChain[idx].write = (uintptr_t)&pio1_hw->txf[mMySm];
			mGreyscaleChain[idx].xferCt = nPixels;
		}
		
		for (i = 0; i < sizeof(mInstrXferIdxs) / sizeof(*mInstrXferIdxs); i++){
			
			uint_fast8_t idx = mInstrXferIdxs[i];
			
			//one unpaced word: replace the third instruction of the PIO1 program, chains to ch3 when done
			mGreyscaleChain[idx].ctrl = (0x3f << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (3 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
			mGreyscaleChain[idx].read = (uintptr_t)&mInstrs[i];
			mGreyscaleChain[idx].write = (uintptr_t)&pio1_hw->instr_mem[mMyPc + 2];
			mGreyscaleChain[idx].xferCt = 1;
		}
		
		//reset ch3: last entry points ch3's read address back at the start of the list and re-triggers it
		mGreyscaleChain[8].ctrl = (0x3f << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (2 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
		mGreyscaleChain[8].read = (uintptr_t)&mChainStartAddr;
		mGreyscaleChain[8].write = (uintptr_t)&dma_hw->ch[3].al3_read_addr_trig;
		mGreyscaleChain[8].xferCt = 1;
	
		palmcardScreenProcessingPioSetup(bpp);
		palmcardCommsPrvSetProperScreenDelay(true);
		
		//configure and enable
		
		//ch3: copy 4 words (one control block) into ch2's registers starting at al1_ctrl. The write address wraps in a 16-byte ring, the read address keeps advancing
		dma_hw->ch[3].read_addr = (uintptr_t)mChainStartAddr;
		dma_hw->ch[3].write_addr = (uintptr_t)&dma_hw->ch[2].al1_ctrl;
		dma_hw->ch[3].transfer_count = 4;
		dma_hw->ch[3].ctrl_trig = (0x3f << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (3 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH0_CTRL_TRIG_INCR_READ_BITS | DMA_CH0_CTRL_TRIG_INCR_WRITE_BITS | DMA_CH0_CTRL_TRIG_RING_SEL_BITS | (4 << DMA_CH0_CTRL_TRIG_RING_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
	
		//it'll fill its out buffer by the time this code continues, but we'll make sure
		while (((pio1_hw->flevel >> (mMySm * 8 + 4)) & 0x0f) != 4);
		
		palmcardPrvPioClrBuffers(pio0_hw, 3);
		
		//ch4: PIO1 SM output -> PIO0.SM3 TX fifo, one halfword at a time, chains to ch5 when done
		dma_hw->ch[4].read_addr = (uintptr_t)&pio1_hw->rxf[mMySm];
		dma_hw->ch[4].write_addr = (uintptr_t)&pio0_hw->txf[3];
		dma_hw->ch[4].transfer_count = nPixels;
		dma_hw->ch[4].al1_ctrl = (DREQ_PIO0_TX3 << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (5 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_HALFWORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
		
		//ch5: a dummy one-word copy whose only purpose is to chain back to ch4. Triggering it here starts the ch4 <-> ch5 loop
		dma_hw->ch[5].read_addr = (uintptr_t)&mGarbage;
		dma_hw->ch[5].write_addr = (uintptr_t)&mGarbage;
		dma_hw->ch[5].transfer_count = 1;
		dma_hw->ch[5].ctrl_trig = (0x3f << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (4 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
	}
	else if (bpp == 1 || bpp == 2) {
		
		palmcardScreenProcessingPioSetup(bpp);
		palmcardCommsPrvSetProperScreenDelay(false);
		
		//ch4: framebuffer -> PIO0.SM3 TX fifo, byte-swapped halfwords, chains to ch5 when done. Its read address is supplied by ch5 below
		dma_hw->ch[4].write_addr = (uintptr_t)&pio0_hw->txf[3];
		dma_hw->ch[4].transfer_count = mScreenDataNumBytes / sizeof(uint16_t);
		dma_hw->ch[4].al1_ctrl = (DREQ_PIO0_TX3 << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (5 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_HALFWORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH0_CTRL_TRIG_INCR_READ_BITS | DMA_CH1_CTRL_TRIG_BSWAP_BITS | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
		
		palmcardPrvPioClrBuffers(pio0_hw, 3);
		
		//ch5: copy the current value of the "mScreenData" pointer into ch4's read address (triggering ch4). This is why that variable must always hold a valid pointer
		dma_hw->ch[5].read_addr = (uintptr_t)&mScreenData;
		dma_hw->ch[5].write_addr = (uintptr_t)&dma_hw->ch[4].al3_read_addr_trig;
		dma_hw->ch[5].transfer_count = 1;
		dma_hw->ch[5].ctrl_trig = (0x3f << DMA_CH0_CTRL_TRIG_TREQ_SEL_LSB) | (5 << DMA_CH0_CTRL_TRIG_CHAIN_TO_LSB) | (DMA_CH0_CTRL_TRIG_DATA_SIZE_VALUE_SIZE_WORD << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB) | DMA_CH1_CTRL_TRIG_HIGH_PRIORITY_BITS | DMA_CH0_CTRL_TRIG_EN_BITS;
	}
	
	logi("FB data set in %ubpp mode\n", bpp);
}

//Point the screen-streaming machinery at a new framebuffer.
// data:      the framebuffer, must be 32-bit aligned (NULL is accepted and stops streaming)
// numPixels: number of pixels in it
// bpp:       bits per pixel
//Does not return if "data" is misaligned.
void palmcardCommsSetScreenDataPtr(const void *data, uint32_t numPixels, uint_fast8_t bpp)
{
	const uint32_t oldNumBytes = mScreenDataNumBytes;
	const uint32_t newNumBytes = numPixels * bpp / 8;
	const bool misaligned = (((uintptr_t)data) % sizeof(uint32_t)) != 0;
	
	if (data && misaligned)
		fatal("screen data not word aligned\n");
	
	//DMA runs asynchronously to us and reads the variable "mScreenData", so to avoid races
	//there must always be enough data behind that pointer. When growing, switch the pointer
	//before the size changes. In all cases set it (again) once DMA has been restarted
	if (newNumBytes > oldNumBytes)
		mScreenData = data;
	
	mScreenDataNumBytes = newNumBytes;
	palmcardCommsPrvStartScreenDma(bpp);
	mScreenData = data;
}

//Queue "nWords" 16-bit words at "buf" for transmission to the remote (DMA ch0) and
//re-arm reception into mRXedData from its start (DMA ch1). Any transfers in progress
//on those two channels are aborted first. "buf" is read by DMA after this returns,
//so it must stay valid and unchanged until it has been sent.
static void palmcardCommsPrvQueueTx(const uint16_t *buf, uint32_t nWords)	//will also ARM RX
{
	//abort current xfers... start new ones.. we cannot keep RX machinery waiting for too long....
	
	//request abort of both channels at once, but get RX (ch1) going again first
	dma_hw->abort = (1 << 0) | (1 << 1);
	while (dma_hw->abort & (1 << 1));
	while (dma_hw->ch[1].al1_ctrl & DMA_CH0_CTRL_TRIG_BUSY_BITS);
	dma_hw->ch[1].write_addr = (uintptr_t)&mRXedData;
	dma_hw->ch[1].al1_transfer_count_trig = sizeof(mRXedData) / sizeof(*mRXedData);	//writing the count starts the channel
	
	//now TX (ch0)
	while (dma_hw->abort & (1 << 0));
	while (dma_hw->ch[0].al1_ctrl & DMA_CH0_CTRL_TRIG_BUSY_BITS);
	dma_hw->ch[0].read_addr = (uintptr_t)buf;
	dma_hw->ch[0].al1_transfer_count_trig = nWords;	//writing the count starts the channel
}

//Bring up the link to the remote and wait for it to boot.
// firstFreeSmP, firstFreePioInstrP: in: first free state machine / instruction slot on PIO1,
//                                   out: advanced past what we claimed (1 SM, 3 instruction slots)
// firstFreeDmaChP:                  out: advanced by the 6 DMA channels we use
// nDmaCh, nPioSms, nPioInstrs:      currently unused
//Blocks until the remote has requested its main code.
void palmcardCommsInit(uint8_t *firstFreeSmP, uint8_t *firstFreePioInstrP, uint8_t *firstFreeDmaChP, uint8_t nDmaCh, uint8_t nPioSms, uint8_t nPioInstrs)
{
	(void)nDmaCh;
	(void)nPioSms;
	(void)nPioInstrs;
	
	//note what is ours...
	mMySm = *firstFreeSmP;
	mMyPc = *firstFreePioInstrP;
	
	//...and claim it
	*firstFreeDmaChP += 6;	//we use 6 DMA channels
	*firstFreeSmP += 1;
	*firstFreePioInstrP += 3;
	
	//hardware up, interrupt on, bootstrap code queued for the remote to fetch
	palmcardCommsPrvSetup();
	NVIC_ClearPendingIRQ(PIO0_0_IRQn);
	NVIC_EnableIRQ(PIO0_0_IRQn);
	palmcardCommsPrvQueueTx(mRemoteBootstrapCode, sizeof(mRemoteBootstrapCode) / sizeof(uint16_t));
	
	logi("Waiting for remote boot\n");
	machBusyWaitDelayMsec(50);
	
	//the IRQ handler sets this once the main code download was requested
	while (!mRemoteBooted) {
		//spin
	}
	
	machBusyWaitDelayMsec(50);
}

//Make the IRQ3 pin reflect whether any irq bits are pending:
//driven low while at least one bit is set, high otherwise.
static void palmcardCommsUpdateIrqPin(void)
{
	const bool nothingPending = !mIrqBits;
	
	if (nothingPending)
		sio_hw->gpio_set = 1 << PIN_IRQ3;
	else
		sio_hw->gpio_clr = 1 << PIN_IRQ3;
}

//Returns the irq bits that are currently pending (set and not yet acknowledged by the remote).
uint_fast16_t palmcardCommsGetPendingIrqs(void)
{
	const uint_fast16_t pending = mIrqBits;
	
	return pending;
}

//Clear, then set, the given irq bits and update the IRQ pin to match.
//A bit present in both masks ends up set.
//NOTE(review): this is a plain read-modify-write of a variable that PIO0_0_IRQHandler
//also modifies - confirm that callers cannot be interrupted by it in between.
void palmcardCommsUpdateIrqSta(uint_fast16_t mIrqsToSet, uint_fast16_t mIrqsToClear)
{
	uint_fast16_t bits = mIrqBits;
	
	bits &=~ mIrqsToClear;
	bits |= mIrqsToSet;
	mIrqBits = bits;
	
	palmcardCommsUpdateIrqPin();
}

//Returns the device state last set with palmcardCommsSetDeviceState().
uint_fast16_t palmcardCommsGetDeviceState(void)
{
	const uint_fast16_t state = mDesiredState;
	
	return state;
}

//Record the desired device state. It is stored in the irq status reply packet,
//so the remote gets it with its next PCC_REQ_IRQ_STA request. Only the low
//16 bits are kept.
void palmcardCommsSetDeviceState(uint_fast16_t newState)
{
	volatile uint16_t *const stateP = &mDesiredState;
	
	*stateP = newState;
}

//Interrupt handler for PIO0 irq 0, which is raised when SM2's RX fifo is not empty.
//Looks at the words DMA ch1 has received into mRXedData so far, and if they form a
//known, well-formed request (valid request word, expected payload length word and
//total word count), handles it and queues the reply (which also re-arms reception).
//Anything else is logged as an error.
//NOTE(review): on the error path nothing is queued, so reception is not re-armed
//and mRXedData keeps filling from where it was - confirm that this is intended.
void __attribute__((used)) PIO0_0_IRQHandler(void)
{
	uint32_t nWords;
	
	//the word itself is of no use, reading it is what clears the interrupt condition
	(void)pio0_hw->rxf[2];
		
	//words received so far = words requested - words DMA still has left to do
	nWords = sizeof(mRXedData) / sizeof(*mRXedData) - dma_hw->ch[1].transfer_count;
	
	if (nWords && PCC_ISVALID_REQ(mRXedData[0])) switch (PCC_UNWRAP_REQ(mRXedData[0])) {
		
		case PCC_REQ_DOWNLOAD_CODE:
			if (nWords == 2 && mRXedData[1] == 0x0000) {
				
				palmcardCommsPrvQueueTx(mRemoteMainCode, sizeof(mRemoteMainCode) / sizeof(uint16_t));
				//arguments are cast to match the "%lu" conversions exactly
				logi("PCC: sending main code (%lu words total (payload is %lu)\n", (unsigned long)(sizeof(mRemoteMainCode) / sizeof(*mRemoteMainCode)), (unsigned long)mRemoteMainCode[1]);
				mRemoteBooted = true;
				return;
			}
			break;
		
		case PCC_REQ_PUTCHAR:
			if (nWords == 3 && mRXedData[1] == 0x0001) {
				
				if (LOG_INFO)
					prPutchar(mRXedData[2]);
				palmcardCommsPrvQueueTx(mAck, sizeof(mAck) / sizeof(*mAck));
				return;
			}
			break;
		
		case PCC_REQ_IRQ_STA:
			if (nWords == 3 && mRXedData[1] == 0x0001) {
				
				//payload is the set of irq bits the remote acknowledges. Reply has the remaining ones
				mIrqBits &=~ mRXedData[2];
				palmcardCommsUpdateIrqPin();
				palmcardCommsPrvQueueTx(mIrqSta, sizeof(mIrqSta) / sizeof(*mIrqSta));
				return;
			}
			break;
		
		case PCC_REQ_BATT_REPORT:
			if (nWords == 4 && mRXedData[1] == 0x0002) {
			
				palmcardCommsExtBattReport(mRXedData[2], mRXedData[3]);
				palmcardCommsPrvQueueTx(mAck, sizeof(mAck) / sizeof(*mAck));
				return;
			}
			break;
		
		case PCC_REQ_TOUCH_REPORT:
			if (nWords == 4 && mRXedData[1] == 0x0002) {
				
				palmcardCommsExtPenReport(mRXedData[2], mRXedData[3]);
				palmcardCommsPrvQueueTx(mAck, sizeof(mAck) / sizeof(*mAck));
				return;
			}
			break;
		
		case PCC_REQ_BTN_REPORT:
			if (nWords == 3 && mRXedData[1] == 0x0001) {
				
				palmcardCommsExtBtnReport(mRXedData[2]);
				palmcardCommsPrvQueueTx(mAck, sizeof(mAck) / sizeof(*mAck));
				return;
			}
			break;

#ifdef PALMCARD_SUPPORT_SAMPLED_AUDIO
		case PCC_REQ_AUDIO_DATA:
			if (nWords == 2 && mRXedData[1] == 0x0000) {
				
				//have the samples written right after the 2-word packet header
				palmcardCommsExtNeedAudioData(mAudioData + 2);
				palmcardCommsPrvQueueTx(mAudioData, sizeof(mAudioData) / sizeof(*mAudioData));
				return;
			}
			break;
#else
		case PCC_REQ_SIMPLE_AUDIO_REQ:
			if (nWords == 2 && mRXedData[1] == 0x0000) {
				
				palmcardCommsPrvQueueTx(mSimpleAudioReq, sizeof(mSimpleAudioReq) / sizeof(*mSimpleAudioReq));
				return;
			}
			break;
#endif
	
		case PCC_REQ_DEVICE_HWR_FLAGS:
			if (nWords == 3 && mRXedData[1] == 0x0001) {
				
				mHwrFlags = mRXedData[2];
				palmcardCommsPrvQueueTx(mAck, sizeof(mAck) / sizeof(*mAck));
				return;
			}
			break;	//was missing. Harmless (fell into "default"), but would bite as soon as a case is added below

		default:
			break;
	}
	
	//arguments are cast to match the "%u" and "%x" conversions exactly
	loge("unknown %u-word req %04x %04x %04x ...\n", (unsigned)nWords, (unsigned)mRXedData[0], (unsigned)mRXedData[1], (unsigned)mRXedData[2]);
}

//Returns the hardware flags last reported by the remote, or
//PCC_HW_INVALID_RESERVED_BIT if it has not reported any yet.
uint16_t palmcardCommsGetHwFlags(void)
{
	const uint16_t flags = mHwrFlags;
	
	return flags;
}

#ifndef PALMCARD_SUPPORT_SAMPLED_AUDIO
	//Store a new simple sound request (frequency and amplitude) in the reply
	//packet for PCC_REQ_SIMPLE_AUDIO_REQ, then tell the remote, via the
	//irq status bits, that a new request is available.
	void palmcardCommsSendSimpleSoundReq(uint16_t freq, uint16_t amp)
	{
		enum { ReqIdxFreq = 2, ReqIdxAmp = 3 };	//payload words follow the 2-word packet header
		
		mSimpleAudioReq[ReqIdxFreq] = freq;
		mSimpleAudioReq[ReqIdxAmp] = amp;
		
		palmcardCommsUpdateIrqSta(PCC_IRQ_BIT_HAVE_NEW_SIMPLE_SOUND_REQ, 0);
	}
#endif

//Ask the remote to power off, and block until it has acknowledged the
//request by clearing the corresponding irq bit.
void palmcardCommsSleep(void)
{
	palmcardCommsUpdateIrqSta(PCC_IRQ_BIT_REQUEST_POWER_OFF, 0);
	
	for (;;) {
		
		//the bit gets cleared by the IRQ handler when the remote acks it
		if (!(mIrqBits & PCC_IRQ_BIT_REQUEST_POWER_OFF))
			break;
	}
}

//Serial port is not implemented for this device: configuring it always fails.
bool palmcardCommsSerialConfig(union UartCfg *cfg, RepalmUartRxF rxf, void *userData)
{
	(void)cfg;
	(void)rxf;
	(void)userData;
	
	return false;
}

//Serial port is not implemented for this device: nothing is ever sent, zero is returned.
//For a future implementation, remember the calling convention:
// NULL data and nonzero len means "send break", NULL data and zero len means "stop break"
uint32_t palmcardCommsSerialTx(const uint8_t *data, uint32_t len, bool block)
{
	(void)data;
	(void)len;
	(void)block;
	
	return 0;
}

//Serial port is not implemented for this device: both fifos always report as empty.
uint32_t palmcardCommsSerialGetSta(void)
{
	const uint32_t status = UART_STA_BIT_RX_FIFO_EMPTY | UART_STA_BIT_TX_FIFO_EMPTY;
	
	return status;
}
