#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "kernel_int.h"
#include "ral_export.h"
#include "printf.h"
#include "kernel.h"
#include "timers.h"
#include "entry.h"
#include "heap.h"
#include "irqs.h"
#include "emu.h"
#include "mpu.h"
#include "dal.h"
#include "ral.h"


//stack top (provided by linker)
//declared as a function, but only its address is used: kernelFirstTask() loads it into MSP
extern void __stack_top();

//priority assignments applied by hwrInit(). A numerically lower value is a more urgent priority
//(PendSV, at 3, is the lowest priority in use)
#define IRQPRIO_HARDFAULT				0
#define IRQPRIO_HWSVCS					0	//hardware services cannot be expected to work on c-m0. we do not have enough prio levels
#define IRQPRIO_SYSTICK					0
#define IRQPRIO_SVCALL					1
#define IRQPRIO_SCHED_TIMER				2	//yes same as hwirq. sadly this is a compromise we need to make this all work
#define IRQPRIO_HW_IRQS_LOW_LIMIT		2
#define IRQPRIO_HW_IRQS_HIGH_LIMIT		2
#define IRQPRIO_PENDSV					3

//values 0..3 are used above, which takes at least two priority bits to represent
#if __NVIC_PRIO_BITS < 2
	#error "not enough prio bits"
#endif


//optional hook: when CUSTOM_MMU_FAULT_HANDLER is defined to a function name, generalFaultHandler()
//calls it for EXC_m0_CAUSE_MEM_ACCESS_FAIL faults, passing the exception frame and the fault's extraData value
#ifdef CUSTOM_MMU_FAULT_HANDLER

	extern bool CUSTOM_MMU_FAULT_HANDLER(struct CortexExcFrame *exc, uint32_t addr);	//true to continue execution as if no fault happened

#endif


//Sets up exception and interrupt priorities for this port, exports the clock-rate and
//timer accessors through the RePalm function table, lets the machine driver adjust the
//priorities it cares about, and finally enables interrupts.
//Calls fatal() if either function-table export is refused.
static void hwrInit(void)
{
	const struct MachInitDataInterrupts irqInfo = {
		.schedulingTimerPrio = IRQPRIO_SCHED_TIMER,
		.lowestAllowablePrio = IRQPRIO_HW_IRQS_LOW_LIMIT,
		.highestAllowablePrio = IRQPRIO_HW_IRQS_HIGH_LIMIT,
		.hardwareServicesPrio = IRQPRIO_HWSVCS,
	};
	uint32_t i;
	
	
	//first give every external interrupt (numbers 0 .. CPU_NUM_IRQS - 1) the default hardware-irq
	//priority. This must come BEFORE the explicit assignments below: done afterwards, it would
	//overwrite the priority chosen for any explicitly-configured interrupt whose number lies in
	//that range (M0_FAULT_DISP_IRQ_USED, if it is an external IRQ number)
	for (i = 0; i < CPU_NUM_IRQS; i++)
		NVIC_SetPriority((IRQn_Type)i, IRQPRIO_HW_IRQS_HIGH_LIMIT);
	
	//we need systick to be the highest prio thing possible
	NVIC_SetPriority(SysTick_IRQn, IRQPRIO_SYSTICK);
	
	//NOTE(review): HardFault priority is fixed by the architecture on ARMv6-M - confirm that
	//passing HardFault_IRQn here is harmless with the CMSIS header in use
	NVIC_SetPriority(HardFault_IRQn, IRQPRIO_HARDFAULT);
	NVIC_SetPriority(M0_FAULT_DISP_IRQ_USED, IRQPRIO_HARDFAULT);		//for m0FaultDispatch
	
	//syscalls are high prio (so real irq cannot preempt them) - they are fast and need to be atomic
	NVIC_SetPriority(SVCall_IRQn, IRQPRIO_SVCALL);
	
	//hole: lowest for timer used for scheduling, the rest are for hard irqs
	
	//pendsv is lowest prio and does actual rescheduling (it assumes it saves PSP, so it MUST be lowest prio)
	NVIC_SetPriority(PendSV_IRQn, IRQPRIO_PENDSV);
	
	//export funcs
	if (!ralSetRePalmTabFunc(REPALM_FUNC_IDX_GET_CLOCK_RATE, cpuGetClockRate) ||
			!ralSetRePalmTabFunc(REPALM_FUNC_IDX_GET_TIMER_VAL, timerGetTime))
		fatal("cannot export DAL funcs\n");
	
	//let mach driver set prios it cares about higher
	machInit(STAGE_INIT_INTERRUPTS, &irqInfo);
	
	//ints on
	(void)irqsAllOn();
}

//Logs nwords consecutive 32-bit words starting at words, one loge() line per word,
//showing the byte offset from the start, the word's address, and its value.
static void generalFaultHandlerShowSpWords(uint32_t *words, uint32_t nwords)
{
	uint32_t idx = 0;
	
	while (idx < nwords) {
		uint32_t *cur = &words[idx];
		
		loge("    [SP + 0x%03x ( == 0x%08x) ] = 0x%08x\n", idx * 4, cur, *cur);
		idx++;
	}
}

//Appends words to the NUL-terminated text at line, three words per output row.
//Only complete groups of three are written; one or two leftover words are dropped.
static void generalFaultHandlerLogSpWords(char *line, uint32_t *words, uint32_t nwords)
{
	const uint32_t *cur;
	uint32_t remaining;
	
	for (cur = words, remaining = nwords; remaining >= 3; cur += 3, remaining -= 3)
		spr(line + strlen(line), "%08x %08x %08x\n", cur[0], cur[1], cur[2]);
}

//cause is EXC_m0_CAUSE_* and extraData is address, if relevant
//Common fault reporter. Logs the fault cause plus the faulting context's registers and
//stack words, builds a condensed text dump in the HAL error buffer and hands it to
//impl_HALErrDisplay(), logs the emulated CPU state if one is found, and then halts.
//  exc        - exception frame of the faulting context (r0-r3, r12, lr, pc, sr are read from it)
//  pushedRegs - r4-r7 and r8-r11 of the faulting context (saved by faultHandlerWithExcFrame)
//  cause      - EXC_m0_CAUSE_* value
//  extraData  - address associated with the fault, if relevant for this cause
//  ipsr       - IPSR value captured by faultHandlerWithExcFrame (not used here)
//Returns only if CUSTOM_MMU_FAULT_HANDLER is defined and reports a memory-access fault as
//handled. In every other case this function spins forever with interrupts masked.
static void __attribute__((used)) generalFaultHandler(struct CortexExcFrame *exc, struct CortexPushedRegs *pushedRegs, uint32_t cause, uint32_t extraData, uint32_t ipsr)
{
	//short names for the known causes; each entry is at most 4 characters plus the terminator
	static const char causes[][5] = {
		[EXC_m0_CAUSE_MEM_ACCESS_FAIL] = "BUS",
		[EXC_m0_CAUSE_NMI] = "NMI",
		[EXC_m0_CAUSE_UNALIGNED] = "ALGN",
		[EXC_m0_CAUSE_UNDEFINSTR] = "UNDF",
		[EXC_m0_CAUSE_BKPT_HIT] = "BKPT",
		[EXC_m0_CAUSE_UNCLASSIFIABLE] = "nCLS",
	};
	//causes beyond the end of the table are reported as "UNK"
	const char *causeStr = (cause < sizeof(causes) / sizeof(*causes)) ? causes[cause] : "UNK";
	struct EmuCpuState* ctx = NULL;
	char *msg, *line;
	
	(void)ipsr;
	
	#ifdef CUSTOM_MMU_FAULT_HANDLER
	
		//give the custom handler a chance to resolve memory access faults and resume execution
		if (cause == EXC_m0_CAUSE_MEM_ACCESS_FAIL && CUSTOM_MMU_FAULT_HANDLER(exc, extraData))
			return;
	
	#endif
	
	asm volatile("cpsid i");	//XXX: let us report....
	
	loge("%s FAULT (%08xh)\n", causeStr, extraData);
	
	//the error buffer receives two consecutive strings: a one-line summary at msg, and, right
	//after the summary's terminator, the condensed register and stack dump at line
	msg = halErrorGetBuffer();
	spr(msg, "%s@%08x\n", causeStr, extraData);
	line = msg + strlen(msg) + 1;
	line[0] = 0;
	
	//full register dump to the log. SP is the address just past the exception frame
	loge("  SR  = 0x%08x\n", exc->sr);
	loge("  R0  = 0x%08x    R8  = 0x%08x\n", exc->r0, pushedRegs->regs8_11[8  - 8]);
	loge("  R1  = 0x%08x    R9  = 0x%08x\n", exc->r1, pushedRegs->regs8_11[9  - 8]);
	loge("  R2  = 0x%08x    R10 = 0x%08x\n", exc->r2, pushedRegs->regs8_11[10 - 8]);
	loge("  R3  = 0x%08x    R11 = 0x%08x\n", exc->r3, pushedRegs->regs8_11[11 - 8]);
	loge("  R4  = 0x%08x    R12 = 0x%08x\n", pushedRegs->regs4_7[4 - 4], exc->r12);
	loge("  R5  = 0x%08x    SP  = 0x%08x\n", pushedRegs->regs4_7[5 - 4], exc + 1);
	loge("  R6  = 0x%08x    LR  = 0x%08x\n", pushedRegs->regs4_7[6 - 4], exc->lr);
	loge("  R7  = 0x%08x    PC  = 0x%08x\n", pushedRegs->regs4_7[7 - 4], exc->pc);
	
	//condensed dump for the error display: sr, then r0..r12 in order, then sp_lr_pc
	spr(line + strlen(line), "sr %08x regs: %08x\n", exc->sr, exc->r0);
	spr(line + strlen(line), "%08x %08x %08x\n", exc->r1, exc->r2, exc->r3);
	spr(line + strlen(line), "%08x %08x %08x\n", pushedRegs->regs4_7[4 - 4], pushedRegs->regs4_7[5 - 4], pushedRegs->regs4_7[6 - 4]);
	spr(line + strlen(line), "%08x %08x %08x\n", pushedRegs->regs4_7[7 - 4], pushedRegs->regs8_11[8  - 8], pushedRegs->regs8_11[9  - 8]);
	spr(line + strlen(line), "%08x %08x %08x\n", pushedRegs->regs8_11[10 - 8], pushedRegs->regs8_11[11 - 8], exc->r12);	//r10, r11, r12
	spr(line + strlen(line), "%08x_%08x_%08x\n", exc + 1, exc->lr, exc->pc);	//underscores show where regs end and SP dump begins
	
	loge("  some words at SP:\n");
	generalFaultHandlerShowSpWords((uint32_t*)(exc + 1), 128);
	generalFaultHandlerLogSpWords(line, (uint32_t*)(exc + 1), 96);
	
	impl_HALErrDisplay(msg, false, NULL, false);	//let it draw
	
	#ifdef EXPLICIT_EMU_CTX
	
		ctx = schedGetCurEmuContextFromFaultContext();
		
	#endif
	#ifdef IMPLICIT_EMU_CTX
	
		//if the fault hit inside the emulator's run loop, the faulting context's r4 is taken as the emulated cpu state pointer
		extern uint8_t emuCpuRunCodeStart[], emuCpuRunCodeEnd[];
		if (exc->pc >= (uintptr_t)emuCpuRunCodeStart && exc->pc < (uintptr_t)emuCpuRunCodeEnd) 
			ctx = (struct EmuCpuState*)pushedRegs->regs4_7[4 - 4];
		
	#endif
	
	if (ctx) {
		loge("IN ARM EMULATION MODE. CONTEXT:\n");
		loge("  SR  = 0x%08x\n", ctx->sr);
		loge("  R0  = 0x%08x    R8  = 0x%08x\n", ctx->regs[0], ctx->regs[ 8]);
		loge("  R1  = 0x%08x    R9  = 0x%08x\n", ctx->regs[1], ctx->regs[ 9]);
		loge("  R2  = 0x%08x    R10 = 0x%08x\n", ctx->regs[2], ctx->regs[10]);
		loge("  R3  = 0x%08x    R11 = 0x%08x\n", ctx->regs[3], ctx->regs[11]);
		loge("  R4  = 0x%08x    R12 = 0x%08x\n", ctx->regs[4], ctx->regs[12]);
		loge("  R5  = 0x%08x    SP  = 0x%08x\n", ctx->regs[5], ctx->regs[13]);
		loge("  R6  = 0x%08x    LR  = 0x%08x\n", ctx->regs[6], ctx->regs[14]);
		loge("  R7  = 0x%08x    PC  = 0x%08x (may be advanced past instr)\n", ctx->regs[7], ctx->regs[15]);
		loge("  some words at emulated SP:\n");
		generalFaultHandlerShowSpWords((uint32_t*)ctx->regs[13], 64);
	}
	
	kernelLogCurTaskForExc();
	
	//mask everything and stop here for good
	asm volatile ("cpsid if	\n\t");
	while(1); //asm volatile ("wfi");
}


//called from asm and given an exc frame
//Naked trampoline between the low-level fault entry code and generalFaultHandler().
//  on entry:  r0 = exc, r1 = cause, r2 = extraData
//  it saves r4-r7, lr and r8-r11 on the current stack, then calls
//  generalFaultHandler(exc, pushedRegs, cause, extraData, ipsr) with
//  pushedRegs = address of the saved r8-r11 (followed in memory by the saved r4-r7)
//  and ipsr passed on the stack as the fifth argument.
//If generalFaultHandler() returns, r4-r11 are reloaded from the saved copies (so any
//changes made through pushedRegs take effect) and control returns through the saved lr.
void __attribute__((used,naked)) faultHandlerWithExcFrame(struct CortexExcFrame *exc, uint32_t cause, uint32_t extraData)
{
	asm volatile(
		"	push    {r4-r7, lr}				\n\t"		//push r4-r7 + lr (for returning)
		"	mov     r4, r8					\n\t"		//high registers cannot be pushed directly here, so copy them down first
		"	mov     r5, r9					\n\t"
		"	mov     r6, r10					\n\t"
		"	mov     r7, r11					\n\t"
		"	push    {r4-r7}					\n\t"		//push r8..r11
		"	mov     r3, r2					\n\t"		//extra data for cause
		"	mov     r2, r1					\n\t"		//cause value
		"	mov   	r1, sp					\n\t"		//pushedRegs: sp now points at saved r8-r11, then r4-r7, then lr
		"	mrs     r4, IPSR				\n\t"
		"	push    {r4}					\n\t"		//fifth argument (ipsr) goes on the stack
		"	bl    	generalFaultHandler		\n\t"
		"	add     sp, #4					\n\t"		//drop the ipsr argument
		"	pop		{r0-r7}					\n\t"		//r0-r3 = saved r8-r11, r4-r7 = saved r4-r7
		"	mov     r8, r0					\n\t"
		"	mov     r9, r1					\n\t"
		"	mov     r10, r2					\n\t"
		"	mov     r11, r3					\n\t"
		"	pop     {pc}					\n\t"		//return through the lr saved at entry
	);
}

//NMI entry point. Picks the stack pointer that holds the exception frame and jumps
//(without linking) to faultHandlerWithExcFrame(exc, EXC_m0_CAUSE_NMI, extraData).
//Note: r2, the extraData argument, is not given a meaningful value - it holds the branch
//target address, since r2 is the register used for the jump.
void __attribute__((used,naked)) NMI_Handler(void)
{
	asm volatile(
		".syntax unified							\n\t"
		"	mov   r0, lr							\n\t"
		"	lsrs  r0, #3							\n\t"	//carry = bit 2 of lr (EXC_RETURN): set when the frame is on the process stack
		"	bcs   1f								\n\t"
		"	mrs   r0, msp							\n\t"	//grab the appropriate SP
		"	b     2f								\n\t"
		"1:											\n\t"
		"	mrs   r0, psp							\n\t"
		"2:											\n\t"
		"	movs  r1, %0							\n\t"	//cause argument
		"	ldr   r2, =faultHandlerWithExcFrame		\n\t"
		"	bx    r2								\n\t"	//jump, leaving lr untouched for the eventual return
		"	.ltorg									\n\t"	//literal pool for the ldr above
		:
		:"I"(EXC_m0_CAUSE_NMI)
	);
}

static void kernelFirstTask(void *param)
{
	void (*entryFunc)(void) = (void (*)())param;

	//give the supervisor mode the entire initial stack (it currently has some of it). Safe to do since we're currently on PSP
	asm("msr MSP, %0\n":: "r"(__stack_top));
	
	entryFunc();
	__builtin_unreachable();
}

//Brings the kernel up: machine-specific early stages, interrupt priorities, timers, MPU,
//scheduler, then creates the first task (kernelFirstTask) and starts scheduling with it.
//  entryFunc      - handed to the first task as its parameter; kernelFirstTask calls it
//  hyperFuncIfAny - not used by this function
//Any failing step is reported through fatal(). If schedStart() returns at all, even with
//KERN_STATUS_OK, that is also reported through fatal().
void kernelInit(void (*entryFunc)(void), void* hyperFuncIfAny)
{
	void* firstTaskStack;
	tid_t firstTaskTid;
	kstatus_t status;
	
	//machine-specific early setup
	machInit(STAGE_INIT_SET_VTOR, NULL);
	machInit(STAGE_SETUP_HEAPS, NULL);
	
	//interrupt priorities and timers
	hwrInit();
	timersInit();
	
	//memory protection
	if (!mpuInit())
		fatal("MPU init failed\n");
	
	machInit(STAGE_INIT_MPU, NULL);
	
	//scheduler
	machInit(STAGE_INIT_PRE_SCHED, NULL);
	if (schedInit() != KERN_STATUS_OK)
		fatal("SCHED init failed\n");
	
	//stack for the first task comes from the kernel heap
	firstTaskStack = kheapAlloc(ZEROTH_TASK_STACK_SZ);
	if (!firstTaskStack)
		fatal("cannot allocate initial stack\n");
	
	status = KTaskCreate(CREATE_4CC('t','s','k','0'), (void*)kernelFirstTask, firstTaskStack, ZEROTH_TASK_STACK_SZ, NULL, SCHED_DEFAULT_PRIO, false, &firstTaskTid);
	if (KERN_STATUS_OK != status)
		fatal("cannot create first task with code %u\n", status);

	status = schedStart(firstTaskTid, (void*)entryFunc);
	if (KERN_STATUS_OK != status)
		fatal("sched start failed with code %u\n", status);
	
	//reaching this point means schedStart() came back, which is treated as a failure
	fatal("kernel sched failed to init\n");
}

//Late kernel initialization hook. Currently does no work.
//returns: always KERN_STATUS_OK
kstatus_t kernelInitLate(void)
{
	//nothing yet
	
	return KERN_STATUS_OK;
}

//some runtime helpers for c-m0, provided here since gcc's own runtime versions of these funcs suck
//Switch-dispatch helper for a table of unsigned 16-bit entries.
//  in:  r0 = table index, lr = return address
//  Reads the halfword at (lr - 1) + 2 * r0, zero-extends it, and returns to lr + 2 * entry.
//  Presumably the table is what the compiler emits right after the call site - TODO confirm.
//  r1 is preserved (parked in r12 meanwhile); r12 and the flags are overwritten.
void __attribute((naked, used))__gnu_thumb1_case_uhi(void)
{
	asm volatile(
		".syntax unified			\n\t"
		"	mov   r12, r1			\n\t"	//park r1 so it can be used as scratch
		"	mov   r1, lr			\n\t"
		"	subs  r1, #1			\n\t"	//presumably strips the Thumb bit from the return address
		"	adds  r1, r0			\n\t"	//base + index ...
		"	ldrh  r1, [r1, r0]		\n\t"	//... + index again: entries are 2 bytes each
		"	lsls  r1, #1			\n\t"	//entry counts halfwords, convert to bytes
		"	add   lr, r1			\n\t"
		"	mov   r1, r12			\n\t"	//restore r1
		"	bx    lr				\n\t"
		:::"cc"
	);
}

//Switch-dispatch helper for a table of signed 16-bit entries.
//  in:  r0 = table index, lr = return address
//  Reads the halfword at (lr - 1) + 2 * r0, sign-extends it, and returns to lr + 2 * entry
//  (so the target may lie before or after the return address).
//  Presumably the table is what the compiler emits right after the call site - TODO confirm.
//  r1 is preserved (parked in r12 meanwhile); r12 and the flags are overwritten.
void __attribute((naked, used))__gnu_thumb1_case_shi(void)
{
	asm volatile(
		".syntax unified			\n\t"
		"	mov   r12, r1			\n\t"	//park r1 so it can be used as scratch
		"	mov   r1, lr			\n\t"
		"	subs  r1, #1			\n\t"	//presumably strips the Thumb bit from the return address
		"	adds  r1, r0			\n\t"	//base + index ...
		"	ldrsh r1, [r1, r0]		\n\t"	//... + index again: entries are 2 bytes each, loaded with sign extension
		"	lsls  r1, #1			\n\t"	//entry counts halfwords, convert to bytes
		"	add   lr, r1			\n\t"
		"	mov   r1, r12			\n\t"	//restore r1
		"	bx    lr				\n\t"
		:::"cc"
	);
}

//Switch-dispatch helper for a table of unsigned 8-bit entries.
//  in:  r0 = table index, lr = return address
//  Reads the byte at (lr - 1) + r0, zero-extends it, and returns to lr + 2 * entry.
//  Presumably the table is what the compiler emits right after the call site - TODO confirm.
//  r1 is preserved (parked in r12 meanwhile); r12 and the flags are overwritten.
void __attribute((naked, used))__gnu_thumb1_case_uqi(void)
{
	asm volatile(
		".syntax unified			\n\t"
		"	mov   r12, r1			\n\t"	//park r1 so it can be used as scratch
		"	mov   r1, lr			\n\t"
		"	subs  r1, #1			\n\t"	//presumably strips the Thumb bit from the return address
		"	ldrb  r1, [r1, r0]		\n\t"	//entries are 1 byte each
		"	lsls  r1, #1			\n\t"	//entry counts halfwords, convert to bytes
		"	add   lr, r1			\n\t"
		"	mov   r1, r12			\n\t"	//restore r1
		"	bx    lr				\n\t"
		:::"cc"
	);
}

//Switch-dispatch helper for a table of signed 8-bit entries.
//  in:  r0 = table index, lr = return address
//  Reads the byte at (lr - 1) + r0, sign-extends it, and returns to lr + 2 * entry
//  (so the target may lie before or after the return address).
//  Presumably the table is what the compiler emits right after the call site - TODO confirm.
//  r1 is preserved (parked in r12 meanwhile); r12 and the flags are overwritten.
void __attribute((naked, used))__gnu_thumb1_case_sqi(void)
{
	asm volatile(
		".syntax unified			\n\t"
		"	mov   r12, r1			\n\t"	//park r1 so it can be used as scratch
		"	mov   r1, lr			\n\t"
		"	subs  r1, #1			\n\t"	//presumably strips the Thumb bit from the return address
		"	ldrsb r1, [r1, r0]		\n\t"	//entries are 1 byte each, loaded with sign extension
		"	lsls  r1, #1			\n\t"	//entry counts halfwords, convert to bytes
		"	add   lr, r1			\n\t"
		"	mov   r1, r12			\n\t"	//restore r1
		"	bx    lr				\n\t"
		:::"cc"
	);
}