.type  emuCpuRun, %function
.globl emuCpuRun
.globl emuCpuRunCodeStart
.globl emuCpuRunCodeEnd



.syntax unified
.thumb



//emuCoreInit: exported initialisation entry point. It performs no work and
// returns to its caller immediately.
.globl emuCoreInit
emuCoreInit:
	bx lr


//REGS:
//	r0 - temp
//	r1 - temp
//	r2 - instr (till clobbered)
//	r3 - temp
//	r4 - emulState
//	r5 - temp
//	r6 - emulated PC (pointer to next instr - like state.regs[15] but better since it doesn't need a load and isn't offset)
//  r7 - emulated SR [cached here, written out on exit only]
//  r8 - cc dispatch table addr
//	r12 - temp (hireg, so only used in cases where it would be of no extra cost)

#define epc r6
#define esr r7

//dispatch_next_instr: jump to the handler for the instruction word held in r2.
// The top byte of the instruction (condition code + bits 27..24) is used as a
// word index into the table of handler addresses that r8 points to.
// Clobbers r1 and the flags. Does not touch epc.
.macro dispatch_next_instr
	lsrs   r1, r2, #24
	ldr    pc, [r8, r1, lsl #2]
.endm

//nextinstr: fetch the word at epc into r2, advance epc by 4, then dispatch it.
// Clobbers r1, r2 and the flags (via dispatch_next_instr).
.macro nextinstr
	ldmia  epc!, {r2}							//read instr
	dispatch_next_instr
.endm

//maybe_pc_written_interworked: finish an instruction that may have written the emulated PC.
//	regNo      - register holding the number of the destination register
//	regWithVal - register holding the value that was written to it
// If the destination was not r15, simply continue with the next instruction.
// If it was r15 and bit 0 of the value is set, branch to emu_out (the value is
// expected to be in regs.pc already). Otherwise the value becomes the new epc
// and execution continues there. Clobbers r1 and the flags.
.macro maybe_pc_written_interworked,regNo,regWithVal	//assumes regs.pc was written
	cmp    \regNo, #15							//did we load pc?
	beq    1f
	nextinstr
1:
	lsrs   r1, \regWithVal, #1					//test for thumb target
	bcs    emu_out								//if so, go directly out
	mov    epc, \regWithVal
	nextinstr
.endm

//maybe_pc_written_not_interworked: like maybe_pc_written_interworked, but bit 0
// of the written value is not examined.
//	regNo      - register holding the number of the destination register
//	regWithVal - register holding the value that was written to it
// If the destination was r15 the value is copied into epc; either way the next
// instruction is then fetched and dispatched. Clobbers the flags.
.macro maybe_pc_written_not_interworked,regNo,regWithVal
	cmp    \regNo, #15
	it     eq
	moveq  epc, \regWithVal
	nextinstr
.endm

//set_proper_pc_in_regs: store epc + 4 into the emulated r15 slot (state + 0x3c)
// so that handlers which read registers from the state block see a valid PC.
// Clobbers r1 and the flags.
.macro set_proper_pc_in_regs					//doing it only when cc check is done is faster. clobbers flags
	adds   r1, epc, #4							//advance "pc" too
	str    r1, [r4, #0x3c]
.endm

//dispatch_tab_entry: emit one dispatch table word.
//	cc   - suffix appended to the handler name (e.g. _eq), or empty for the unsuffixed handler
//	name - handler base name
// The + 1 sets bit 0 of the stored address; the table is consumed by "ldr pc"
// in dispatch_next_instr.
.macro dispatch_tab_entry,cc,name
	.word \name\()\cc + 1
.endm

//dispatch_tab_row: emit the 16 table words for one condition code.
//	cc - handler name suffix for this condition (e.g. _eq), or empty
// The entry position within the row corresponds to instruction bits 27..24.
.macro dispatch_tab_row,cc
	dispatch_tab_entry \cc,dp_reg
	dispatch_tab_entry \cc,dp_reg
	dispatch_tab_entry \cc,dp_imm
	dispatch_tab_entry \cc,dp_imm
	dispatch_tab_entry \cc,mem_imm
	dispatch_tab_entry \cc,mem_imm
	dispatch_tab_entry \cc,mem_reg
	dispatch_tab_entry \cc,mem_reg
	dispatch_tab_entry \cc,mem_mul
	dispatch_tab_entry \cc,mem_mul
	dispatch_tab_entry \cc,inst_b
	dispatch_tab_entry \cc,inst_bl
	dispatch_tab_entry \cc,inst_udf		//0x0C
	dispatch_tab_entry \cc,inst_udf		//0x0D
	dispatch_tab_entry \cc,inst_udf		//0x0E
	dispatch_tab_entry \cc,inst_swi		//0x0F
.endm

//dispatch_code_entry_direct: emit the conditional entry stub name_cc for a
// condition that depends on a single bit of the emulated status register.
//	name   - handler to run when the condition passes
//	cc     - condition suffix used to form the stub label
//	bit    - number of the esr bit to test
//	maskcc - cs to enter the handler when that bit is set, cc when it is clear
// The shift moves the tested bit into the carry flag. When the condition fails
// the instruction is skipped by dispatching the next one. Clobbers r1 and flags.
.macro dispatch_code_entry_direct,name,cc,bit,maskcc
\name\()_\cc:
	lsls    r1, esr, #32 - \bit			//one cy, one hword if esr is a loreg
	b\maskcc    \name
	nextinstr
.endm

//dispatch_code_entry_indirect: emit the conditional entry stub name_cc for a
// condition that depends on more than one flag.
//	name - handler to run when the condition passes
//	cc   - condition suffix; used both for the stub label and the host branch
// The emulated flags are copied into the host APSR and the host's own
// conditional branch evaluates the condition. When it fails the instruction is
// skipped by dispatching the next one. Clobbers the flags.
.macro dispatch_code_entry_indirect,name,cc
\name\()_\cc:
	msr    APSR_nzcvq, esr						//move SR to APSR
	b\cc    \name
	nextinstr
.endm

//dispatch_code_entry_row: emit all conditional entry stubs for one handler.
//	name - handler base name
// Single-flag conditions test one esr bit directly (Z = 30, C = 29, N = 31,
// V = 28); the compound conditions go through the host APSR. name_al is made
// an alias of the handler itself. No stub is emitted for the "nv" condition.
.macro dispatch_code_entry_row,name
	dispatch_code_entry_direct   \name,eq,30,cs
	dispatch_code_entry_direct   \name,ne,30,cc
	dispatch_code_entry_direct   \name,cs,29,cs
	dispatch_code_entry_direct   \name,cc,29,cc
	dispatch_code_entry_direct   \name,mi,31,cs
	dispatch_code_entry_direct   \name,pl,31,cc
	dispatch_code_entry_direct   \name,vs,28,cs
	dispatch_code_entry_direct   \name,vc,28,cc
	dispatch_code_entry_indirect \name,hi
	dispatch_code_entry_indirect \name,ls
	dispatch_code_entry_indirect \name,ge
	dispatch_code_entry_indirect \name,lt
	dispatch_code_entry_indirect \name,gt
	dispatch_code_entry_indirect \name,le
	.equ \name\()_al, \name
	//no nv
.endm

//dispatchtab: handler address table indexed by the top byte of the instruction
// (condition code in the high nibble, bits 27..24 in the low nibble).
// There is one 16-word row per condition code: eq..le use the conditional entry
// stubs, the row emitted with no suffix (condition "al") points straight at the
// handlers, and the final hand-written row covers the "nv" condition.
.align 2
dispatchtab:
	dispatch_tab_row _eq
	dispatch_tab_row _ne
	dispatch_tab_row _cs
	dispatch_tab_row _cc
	dispatch_tab_row _mi
	dispatch_tab_row _pl
	dispatch_tab_row _vs
	dispatch_tab_row _vc
	dispatch_tab_row _hi
	dispatch_tab_row _ls
	dispatch_tab_row _ge
	dispatch_tab_row _lt
	dispatch_tab_row _gt
	dispatch_tab_row _le
	dispatch_tab_row
	//dispatch for "nv" opcode
	.word inst_udf + 1			//0x0
	.word inst_udf + 1			//0x1
	.word inst_udf + 1			//0x2
	.word inst_udf + 1			//0x3
	.word inst_udf + 1			//0x4
	.word inst_pld + 1			//0x5
	.word inst_udf + 1			//0x6
	.word inst_pld + 1			//0x7
	.word inst_udf + 1			//0x8
	.word inst_udf + 1			//0x9
	.word inst_blx_imm + 1		//0xA
	.word inst_blx_imm + 1		//0xB
	.word inst_udf + 1			//0xC
	.word inst_udf + 1			//0xD
	.word inst_udf + 1			//0xE
	.word inst_udf + 1			//0xF

emuCpuRunCodeStart:				//if crash is from here to emuCpuRunCodeEnd, emul state will be printed. all executable code should be between these two labels

//emuCpuRun: start interpreting.
//	r0 - pointer to the emulation state block (kept in r4 from here on)
// Sets r8 to the dispatch table, loads the emulated PC (state + 0x3c) into epc
// and the word after it (state + 0x40) into esr, then dispatches the first
// instruction. Control leaves through the handlers, not through this code.
// NOTE(review): r4-r8 are overwritten here without being saved in this chunk -
// presumably the caller or the exit path takes care of that; confirm.
emuCpuRun:
	mov    r4, r0
	adr    r8, dispatchtab
	ldrd   epc, esr, [r4, #0x3c]
	nextinstr

//inst_blx_imm: BLX <imm> (only reachable through the "nv" dispatch row).
// Stores the return address (epc, the next instruction) into the emulated LR,
// computes the branch target with bit 1 taken from instruction bit 24 and bit 0
// set, writes it to the emulated PC slot and leaves through emu_out.
// Clobbers r1, r2, r3 and the flags.
inst_blx_imm:
	str    epc, [r4, #0x38]						//save LR
	lsls   r3, r2, #8
	add    r1, epc, r3, asr #6					//pc += sext(imm24) << 2 	//note we use epc which is really "PC - 4", we'll fix this later
	ubfx   r2, r2, #24, #1
	bfi    r1, r2, #1, #1						//pc[1] = instr[24]
	adds   r1, #5								//set low bit (mandatory) and fix the above use of epc instead of expected PC value
	str    r1, [r4, #0x3c]
	b      emu_out

//inst_pld: PLD handler. The hint is ignored; execution continues with the next instruction.
inst_pld:
	nextinstr									//PLD is a no-op

//dispatch, here for short branches
	dispatch_code_entry_row inst_udf

//inst_udf: handler for unsupported/undefined instructions. Executes a host
// UDF, so the host takes an undefined-instruction exception at this address.
inst_udf:
	udf    #0x00


//dispatch, here for short branches
	dispatch_code_entry_row inst_b

//inst_bl: BL <imm>. Stores the return address (epc, the next instruction) into
// the emulated LR and then shares the branch code with inst_b.
inst_bl:
	str    epc, [r4, #0x38]						//save LR
												//fallthrough on purpose
//inst_b: B <imm>. Adds the sign-extended 24-bit offset times 4 to epc, then
// fetches and dispatches the instruction at the target. epc is 4 less than the
// architectural PC at this point, which the +4/+8 adjustments below make up for.
// Clobbers r1, r2 and the flags.
inst_b:
	lsls   r2, #8
	add    epc, epc, r2, asr #6					//reads PC - 4
	//load next instr, set epc as needed
	ldr    r2, [epc, #4]						//"correct for that -4"
	add    epc, #8
	dispatch_next_instr

//dispatch, here for short branches (backwards)
//conditional entry stubs for inst_bl; they sit after the handler, so their branches to it go backwards
	dispatch_code_entry_row inst_bl

//dispatch, here for short branches
	dispatch_code_entry_row mem_imm

//mem_imm: single load/store with a 12-bit immediate offset.
// Extracts the offset into r1 and continues in adr_mode_2_common, which
// expects the instruction still in r2.
mem_imm:
	set_proper_pc_in_regs						//someone might use regs.pc so set it
	ubfx   r1, r2, #0, #12
	b      adr_mode_2_common

//dispatch, here for short branches
	dispatch_code_entry_row mem_reg

//mem_reg: single load/store with a (shifted) register offset.
// Reads Rm from the state block, applies the immediate shift encoded in the
// instruction and leaves the resulting offset in r1 for adr_mode_2_common.
// r0 indexes the byte table below: its low bit is instruction bit 4 (must be
// zero, otherwise the entry is a2_und) and the upper two bits are the shift type.
// Uses r0, r1, r3 and the flags; r2 keeps the instruction.
mem_reg:
	set_proper_pc_in_regs						//someone might use regs.pc so set it
	ubfx   r3, r2, #7, #5						//get shift amt
	ubfx   r0, r2, #4, #3						//get shift type and zero bit
	ubfx   r1, r2, #0, #4						//get Rm
	ldr    r1, [r4, r1, lsl #2]					//get Rm's value
	tbb    [pc, r0]
adr_mod_2_shft_tab:
	.byte  (a2_lsl - adr_mod_2_shft_tab) / 2
	.byte  (a2_und - adr_mod_2_shft_tab) / 2
	.byte  (a2_lsr - adr_mod_2_shft_tab) / 2
	.byte  (a2_und - adr_mod_2_shft_tab) / 2
	.byte  (a2_asr - adr_mod_2_shft_tab) / 2
	.byte  (a2_und - adr_mod_2_shft_tab) / 2
	.byte  (a2_ror - adr_mod_2_shft_tab) / 2
	.byte  (a2_und - adr_mod_2_shft_tab) / 2

//instruction bit 4 was set: not a valid encoding for this handler
a2_und:
	udf    #0x00

//LSR: an encoded shift amount of 0 means a shift by 32
a2_lsr:
	cmp    r3, #0
	ite    eq
	lsreq  r1, #32
	lsrne  r1, r3
	b      adr_mode_2_common

//ASR: an encoded shift amount of 0 means a shift by 32
a2_asr:
	cmp    r3, #0
	ite    eq
	asreq  r1, #32
	asrne  r1, r3
	b      adr_mode_2_common

//ROR: an encoded shift amount of 0 selects RRX instead
a2_ror:
	cmp    r3, #0
	beq    a2_rrx
	rors   r1, r3
	b      adr_mode_2_common

//RRX: rotate right by one through the emulated carry flag
a2_rrx:
	msr    APSR_nzcvq, esr						//move SR to APSR
	rrx    r1, r1
	b      adr_mode_2_common

a2_lsl:											//most common case so it gets the fallthrough (& thus speed improvement)
	lsls   r1, r3
	//fallthrough

//adr_mode_2_common: shared tail of mem_imm and mem_reg.
// On entry r1 holds the unsigned offset and r2 the instruction. Decodes
// Rd -> r5 (register number), Rn -> r3 (register number), loads Rn's value
// into r2 (the instruction is gone after this) and jumps to one of the 32
// handlers selected by the P, U, B, W and L bits (instruction bits 24..20).
adr_mode_2_common:								//adr mode value is in r1
	ubfx   r5, r2, #12, #4						//get Rd -> r5
	ubfx   r0, r2,  #20, #5						//get 'PUBWL' bits
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r3, lsl #2]					//get Rn's value -> r2
	tbh    [pc, r0]
adr_mod_2_op_typ:
	.hword  (a2_00000 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn], #-addr_mode
	.hword  (a2_00001 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn], #-addr_mode
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//STRT  Rd, [Rn], #-addr_mode - not supported
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//LDRT  Rd, [Rn], #-addr_mode - not supported
	.hword  (a2_00100 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn], #-addr_mode
	.hword  (a2_00101 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn], #-addr_mode
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//STRBT Rd, [Rn], #-addr_mode - not supported
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//LDRBT Rd, [Rn], #-addr_mode - not supported
	.hword  (a2_01000 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn], #+addr_mode
	.hword  (a2_01001 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn], #+addr_mode
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//STRT  Rd, [Rn], #+addr_mode - not supported
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//LDRT  Rd, [Rn], #+addr_mode - not supported
	.hword  (a2_01100 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn], #+addr_mode
	.hword  (a2_01101 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn], #+addr_mode
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//STRBT Rd, [Rn], #+addr_mode - not supported
	.hword  (a2_und_2 - adr_mod_2_op_typ) / 2	//LDRBT Rd, [Rn], #+addr_mode - not supported
	.hword  (a2_10000 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn, #-addr_mode]
	.hword  (a2_10001 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn, #-addr_mode]
	.hword  (a2_10010 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn, #-addr_mode]!
	.hword  (a2_10011 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn, #-addr_mode]!
	.hword  (a2_10100 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn, #-addr_mode]
	.hword  (a2_10101 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn, #-addr_mode]
	.hword  (a2_10110 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn, #-addr_mode]!
	.hword  (a2_10111 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn, #-addr_mode]!
	.hword  (a2_11000 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn, #+addr_mode]
	.hword  (a2_11001 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn, #+addr_mode]
	.hword  (a2_11010 - adr_mod_2_op_typ) / 2	//STR   Rd, [Rn, #+addr_mode]!
	.hword  (a2_11011 - adr_mod_2_op_typ) / 2	//LDR   Rd, [Rn, #+addr_mode]!
	.hword  (a2_11100 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn, #+addr_mode]
	.hword  (a2_11101 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn, #+addr_mode]
	.hword  (a2_11110 - adr_mod_2_op_typ) / 2	//STRB  Rd, [Rn, #+addr_mode]!
	.hword  (a2_11111 - adr_mod_2_op_typ) / 2	//LDRB  Rd, [Rn, #+addr_mode]!

//Post-indexed single load/store handlers (P = 0). The label suffix is the PUBWL bit pattern.
//On entry: r1 = offset, r2 = Rn's value (the access address), r3 = Rn number, r5 = Rd number.
//The access uses the unmodified Rn value; Rn -/+ offset is always written back to Rn.
//Word loads check whether Rd was the PC; byte loads do not.
a2_00000:										// STR   Rd, [Rn], #-addr_mode
	subs   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r2]								//execute the store
	nextinstr

a2_00001:										// LDR   Rd, [Rn], #-addr_mode
	subs   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r2]								//execute the load
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_00100:										// STRB  Rd, [Rn], #-addr_mode
	subs   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	strb   r0, [r2]								//execute the store
	nextinstr

a2_00101:										// LDRB  Rd, [Rn], #-addr_mode
	subs   r1, r2, r1							//calc value to writeback (postindexing value)
	ldrb   r0, [r2]								//execute the load
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

a2_01000:										// STR   Rd, [Rn], #+addr_mode
	adds   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r2]								//execute the store
	nextinstr

a2_01001:										// LDR   Rd, [Rn], #+addr_mode
	adds   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r2]								//execute the load
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_01100:										// STRB  Rd, [Rn], #+addr_mode
	adds   r1, r2, r1							//calc value to writeback (postindexing value)
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	strb   r0, [r2]								//execute the store
	nextinstr

a2_01101:										// LDRB  Rd, [Rn], #+addr_mode
	adds   r1, r2, r1							//calc value to writeback (postindexing value)
	ldrb   r0, [r2]								//execute the load
	str    r1, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

//Pre-indexed / offset single load/store handlers with a subtracted offset (P = 1, U = 0).
//On entry: r1 = offset, r2 = Rn's value, r3 = Rn number, r5 = Rd number.
//The access address is Rn - offset; it is written back to Rn only in the "!" (W = 1) variants.
//Word loads check whether Rd was the PC; byte loads do not.
a2_10000:										// STR   Rd, [Rn, #-addr_mode]
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r0, [r2]								//execute the store
	nextinstr

a2_10001:										// LDR   Rd, [Rn, #-addr_mode]
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r2]								//execute the load
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_10010:										// STR   Rd, [Rn, #-addr_mode]!
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r2]								//execute the store
	nextinstr

a2_10011:										// LDR   Rd, [Rn, #-addr_mode]!
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r2]								//execute the load
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_10100:										// STRB  Rd, [Rn, #-addr_mode]
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	strb   r0, [r2]								//execute the store
	nextinstr

a2_10101:										// LDRB  Rd, [Rn, #-addr_mode]
	subs   r2, r1								//apply preindexing value
	ldrb   r0, [r2]								//execute the load
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

a2_10110:										// STRB  Rd, [Rn, #-addr_mode]!
	subs   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	strb   r0, [r2]								//execute the store
	nextinstr

a2_10111:										// LDRB  Rd, [Rn, #-addr_mode]!
	subs   r2, r1								//apply preindexing value
	ldrb   r0, [r2]								//execute the load
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

//Pre-indexed / offset single load/store handlers with an added offset (P = 1, U = 1).
//On entry: r1 = offset, r2 = Rn's value, r3 = Rn number, r5 = Rd number.
//Without writeback the host's register-offset addressing does the add; the "!"
//(W = 1) variants compute Rn + offset explicitly so it can be written back to Rn.
//Word loads check whether Rd was the PC; byte loads do not.
a2_11000:										// STR   Rd, [Rn, #+addr_mode]
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r0, [r2, r1]							//execute the store with preindexing
	nextinstr

a2_11001:										// LDR   Rd, [Rn, #+addr_mode]
	ldr    r0, [r2, r1]							//execute the load with preindexing
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_11010:										// STR   Rd, [Rn, #+addr_mode]!
	adds   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r2]								//execute the store
	nextinstr

a2_11011:										// LDR   Rd, [Rn, #+addr_mode]!
	adds   r2, r1								//apply preindexing value
	ldr    r0, [r2]								//execute the load
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_interworked r5,r0

a2_11100:										// STRB  Rd, [Rn, #+addr_mode]
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	strb   r0, [r2, r1]							//execute the store with preindexing
	nextinstr

a2_11101:										// LDRB  Rd, [Rn, #+addr_mode]
	ldrb   r0, [r2, r1]							//execute the load with preindexing
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

a2_11110:										// STRB  Rd, [Rn, #+addr_mode]!
	adds   r2, r1								//apply preindexing value
	ldr    r0, [r4, r5, lsl #2]					//get Rd's value -> r0
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	strb   r0, [r2]								//execute the store
	nextinstr

a2_11111:										// LDRB  Rd, [Rn, #+addr_mode]!
	adds   r2, r1								//apply preindexing value
	ldrb   r0, [r2]								//execute the load
	str    r2, [r4, r3, lsl #2]					//store Rn's value back after writeback
	str    r0, [r4, r5, lsl #2]					//set Rd's value <- r0 (loading PC using this instr is undefined so we do not check for it)
	nextinstr

//target for the unsupported T (user-mode access) encodings
a2_und_2:
	udf    #0x00


//dispatch, here for short branches
	dispatch_code_entry_row mem_mul

//mem_mul: load/store multiple.
// Decodes Rn -> r3 (register number), loads Rn's value into r1 and jumps to
// the handler selected by the P, U, S, W and L bits (instruction bits 24..20).
// r2 still holds the instruction (the handlers read the register list and the
// W bit from it). The W bit does not select a different handler; each handler
// tests it itself when it finishes.
mem_mul:
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ubfx   r0, r2,  #20, #5						//get 'PUSWL' bits (we check "W" later actually for code dedup)
	set_proper_pc_in_regs						//someone might use regs.pc so set it
	ldr    r1, [r4, r3, lsl #2]					//get Rn's value -> r1
	tbh    [pc, r0]
adr_mod_4_op_typ:
	.hword  (a4_stmda - adr_mod_4_op_typ) / 2	//STMDA Rn, {}
	.hword  (a4_ldmda - adr_mod_4_op_typ) / 2	//LDMDA Rn, {}
	.hword  (a4_stmda - adr_mod_4_op_typ) / 2	//STMDA Rn!, {}
	.hword  (a4_ldmda - adr_mod_4_op_typ) / 2	//LDMDA Rn!, {}
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmda_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmda_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_stmia - adr_mod_4_op_typ) / 2	//STMIA Rn, {}
	.hword  (a4_ldmia - adr_mod_4_op_typ) / 2	//LDMIA Rn, {}
	.hword  (a4_stmia - adr_mod_4_op_typ) / 2	//STMIA Rn!, {}
	.hword  (a4_ldmia - adr_mod_4_op_typ) / 2	//LDMIA Rn!, {}
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmia_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmia_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_stmdb - adr_mod_4_op_typ) / 2	//STMDB Rn, {}
	.hword  (a4_ldmdb - adr_mod_4_op_typ) / 2	//LDMDB Rn, {}
	.hword  (a4_stmdb - adr_mod_4_op_typ) / 2	//STMDB Rn!, {}
	.hword  (a4_ldmdb - adr_mod_4_op_typ) / 2	//LDMDB Rn!, {}
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmdb_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmdb_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_stmib - adr_mod_4_op_typ) / 2	//STMIB Rn, {}
	.hword  (a4_ldmib - adr_mod_4_op_typ) / 2	//LDMIB Rn, {}
	.hword  (a4_stmib - adr_mod_4_op_typ) / 2	//STMIB Rn!, {}
	.hword  (a4_ldmib - adr_mod_4_op_typ) / 2	//LDMIB Rn!, {}
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmib_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)
	.hword  (a4_undef - adr_mod_4_op_typ) / 2	//STM with "S" bit unsupported
	.hword  (a4_ldmib_s - adr_mod_4_op_typ) / 2	//LDM with "S" bit unsupported (but sometimes used with PC in the list & apparently works)

//a4_undef: target for unsupported load/store-multiple encodings (those with
// the S bit set). Executes a host UDF, so the host takes an
// undefined-instruction exception at this address.
a4_undef:
	udf    #0x00								//immediate written with '#', like every other udf in this file

//a4_stmda: STMDA Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are stored from r15 downwards, each at [r1] followed by r1 -= 4,
// so the lowest-numbered register ends up at the lowest address.
// Uses r0 and r5 as temporaries and clobbers the flags.
a4_stmda:

//stmda_step: handle two register-list bits per invocation.
//	toreg/fromreg  - shifted copy of the register list (destination / source of the shift)
//	shiftamt       - shift that moves the next two list bits into C and N
//	ofst_top_reg   - state offset of the higher-numbered register of the pair
//	want_last_jump - nonzero: leave early once no list bits remain (Z set)
//The conditional loads and stores do not alter the flags, so C, N and Z from
//the shift stay valid for the whole step.
.macro stmda_step,toreg,fromreg,shiftamt,ofst_top_reg,want_last_jump
		lsls   \toreg, \fromreg, #\shiftamt			//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		itt    cs
		ldrcs  r0, [r4, #\ofst_top_reg]
		strcs  r0, [r1], #-4						//store and adjust Rn's value
		itt    mi
		ldrmi  r0, [r4, #\ofst_top_reg - 4]
		strmi  r0, [r1], #-4						//store and adjust Rn's value
		.if \want_last_jump
			beq   stmda_done
		.endif
.endm

	stmda_step r5, r2, 17, 0x3c,1					//PC & LR round, initialize r5
	stmda_step r5, r5, 2, 0x34,1					//SP & R12 round
	stmda_step r5, r5, 2, 0x2C,1					//R11 & R10 round
	stmda_step r5, r5, 2, 0x24,1					//R9 & R8 round
	stmda_step r5, r5, 2, 0x1C,1					//R7 & R6 round
	stmda_step r5, r5, 2, 0x14,1					//R5 & R4 round
	stmda_step r5, r5, 2, 0x0C,1					//R3 & R2 round
	stmda_step r5, r5, 2, 0x04,0					//R1 & R0 round

stmda_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	nextinstr


//a4_stmdb: STMDB Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are stored from r15 downwards, each with r1 -= 4 followed by a
// store to [r1]. Uses r0 and r5 as temporaries and clobbers the flags.
a4_stmdb:

//stmdb_step: handle two register-list bits per invocation.
//	toreg/fromreg  - shifted copy of the register list (destination / source of the shift)
//	shiftamt       - shift that moves the next two list bits into C and N
//	ofst_top_reg   - state offset of the higher-numbered register of the pair
//	want_last_jump - nonzero: leave early once no list bits remain (Z set)
.macro stmdb_step,toreg,fromreg,shiftamt,ofst_top_reg,want_last_jump
		lsls   \toreg, \fromreg, #\shiftamt			//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		itt    cs
		ldrcs  r0, [r4, #\ofst_top_reg]
		strcs  r0, [r1, #-4]!						//store and adjust Rn's value
		itt    mi
		ldrmi  r0, [r4, #\ofst_top_reg - 4]
		strmi  r0, [r1, #-4]!						//store and adjust Rn's value
		.if \want_last_jump
			beq   stmdb_done
		.endif
.endm

	stmdb_step r5, r2, 17, 0x3c,1					//PC & LR round, initialize r5
	stmdb_step r5, r5, 2, 0x34,1					//SP & R12 round
	stmdb_step r5, r5, 2, 0x2C,1					//R11 & R10 round
	stmdb_step r5, r5, 2, 0x24,1					//R9 & R8 round
	stmdb_step r5, r5, 2, 0x1C,1					//R7 & R6 round
	stmdb_step r5, r5, 2, 0x14,1					//R5 & R4 round
	stmdb_step r5, r5, 2, 0x0C,1					//R3 & R2 round
	stmdb_step r5, r5, 2, 0x04,0					//R1 & R0 round

stmdb_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	nextinstr

//a4_ldmda_s: LDMDA with the S bit set.
// Without ARM_EMU_DEFINE_THE_UNDEFINED this is always treated as undefined.
// With it, the instruction is accepted only when the PC is in the register
// list, and is then executed as a plain LDMDA (falls through). Clobbers r5.
a4_ldmda_s:
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	lsrs   r5, r2, #16							//C = instruction bit 15 (PC in list)
	bcc    a4_undef
	//fallthrough to normal ldmda code below
#endif

//a4_ldmda: LDMDA Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are loaded from r15 downwards, each from [r1] followed by r1 -= 4.
// A loaded PC goes straight into epc rather than into the state block.
// Uses r0 and r5 as temporaries and clobbers the flags.
a4_ldmda:

//ldmda_step: handle two register-list bits per invocation.
//	toreg/fromreg  - shifted copy of the register list (destination / source of the shift)
//	shiftamt       - shift that moves the next two list bits into C and N
//	ofst_top_reg   - state offset of the higher-numbered register of the pair
//	want_last_jump - nonzero: leave early once no list bits remain (Z set)
//	haspc          - nonzero: the higher-numbered register is the PC, load it into epc
.macro ldmda_step,toreg,fromreg,shiftamt,ofst_top_reg,want_last_jump,haspc
		lsls   \toreg, \fromreg, #\shiftamt			//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		.if \haspc
			it     cs
			ldrcs  epc, [r1], #-4					//load and adjust PC's value
		.else
			itt    cs
			ldrcs  r0, [r1], #-4					//load and adjust Rn's value
			strcs  r0, [r4, #\ofst_top_reg]
		.endif
		itt    mi
		ldrmi  r0, [r1], #-4						//load and adjust Rn's value
		strmi  r0, [r4, #\ofst_top_reg - 4]
		.if \want_last_jump
			beq   ldmda_done
		.endif
.endm

	ldmda_step r5, r2, 17, 0x3c,1,1					//PC & LR round, initialize r5
	ldmda_step r5, r5, 2, 0x34,1,0					//SP & R12 round
	ldmda_step r5, r5, 2, 0x2C,1,0					//R11 & R10 round
	ldmda_step r5, r5, 2, 0x24,1,0					//R9 & R8 round
	ldmda_step r5, r5, 2, 0x1C,1,0					//R7 & R6 round
	ldmda_step r5, r5, 2, 0x14,1,0					//R5 & R4 round
	ldmda_step r5, r5, 2, 0x0C,1,0					//R3 & R2 round
	ldmda_step r5, r5, 2, 0x04,0,0					//R1 & R0 round

ldmda_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	lsrs   r5, epc, #1								//we may have loaded pc. if we did, bail now
	bcs    emu_out_with_epc							//if we loaded an arm address to pc, we loaded it into epc so all is good
	nextinstr

//a4_ldmdb_s: LDMDB with the S bit set.
// Without ARM_EMU_DEFINE_THE_UNDEFINED this is always treated as undefined.
// With it, the instruction is accepted only when the PC is in the register
// list, and is then executed as a plain LDMDB (falls through). Clobbers r5.
a4_ldmdb_s:
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	lsrs   r5, r2, #16							//C = instruction bit 15 (PC in list); N is always clear after a right shift
	bcc    a4_undef								//must test C: a "pl" test here would always be taken and reject every instruction
	//fallthrough to normal ldmdb code below
#endif

//a4_ldmdb: LDMDB Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are loaded from r15 downwards, each with r1 -= 4 followed by a
// load from [r1]. A loaded PC goes straight into epc rather than into the
// state block. Uses r0 and r5 as temporaries and clobbers the flags.
a4_ldmdb:

//ldmdb_step: handle two register-list bits per invocation.
//	toreg/fromreg  - shifted copy of the register list (destination / source of the shift)
//	shiftamt       - shift that moves the next two list bits into C and N
//	ofst_top_reg   - state offset of the higher-numbered register of the pair
//	want_last_jump - nonzero: leave early once no list bits remain (Z set)
//	haspc          - nonzero: the higher-numbered register is the PC, load it into epc
.macro ldmdb_step,toreg,fromreg,shiftamt,ofst_top_reg,want_last_jump,haspc
		lsls   \toreg, \fromreg, #\shiftamt			//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		.if \haspc
			it     cs
			ldrcs  epc, [r1, #-4]!					//load and adjust PC's value
		.else
			itt    cs
			ldrcs  r0, [r1, #-4]!					//load and adjust Rn's value
			strcs  r0, [r4, #\ofst_top_reg]
		.endif
		itt    mi
		ldrmi  r0, [r1, #-4]!						//load and adjust Rn's value
		strmi  r0, [r4, #\ofst_top_reg - 4]
		.if \want_last_jump
			beq   ldmdb_done
		.endif
.endm

	ldmdb_step r5, r2, 17, 0x3c,1,1					//PC & LR round, initialize r5
	ldmdb_step r5, r5, 2, 0x34,1,0					//SP & R12 round
	ldmdb_step r5, r5, 2, 0x2C,1,0					//R11 & R10 round
	ldmdb_step r5, r5, 2, 0x24,1,0					//R9 & R8 round
	ldmdb_step r5, r5, 2, 0x1C,1,0					//R7 & R6 round
	ldmdb_step r5, r5, 2, 0x14,1,0					//R5 & R4 round
	ldmdb_step r5, r5, 2, 0x0C,1,0					//R3 & R2 round
	ldmdb_step r5, r5, 2, 0x04,0,0					//R1 & R0 round

ldmdb_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	lsrs   r5, epc, #1								//we may have loaded pc. if we did, bail now
	bcs    emu_out_with_epc							//if we loaded an arm address to pc, we loaded it into epc so all is good
	nextinstr

//a4_stmia: STMIA Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are stored from r0 upwards, each at [r1] followed by r1 += 4.
// r5 holds the bit-reversed register list (r0's bit on top); r0 is a temporary.
// Clobbers the flags.
a4_stmia:
	//there is no clever way to check two bits at once in the increasing direction (ror sets C and N to the same value)
	//so we reverse the bitmask first so we can do what we do for decrementing direction
	//we also need to mask off the top bits since else our shortcut exit (beq) won't work
	//the cycle to do that may or may not be worth it. TBD
	uxth   r5, r2
	rbit   r5, r5

//stmia_step: handle two register-list bits per invocation.
//	shiftamt        - shift that moves the next two (reversed) list bits into C and N
//	ofst_bottom_reg - state offset of the lower-numbered register of the pair
//	want_last_jump  - nonzero: leave early once no list bits remain (Z set)
.macro stmia_step,shiftamt,ofst_bottom_reg,want_last_jump
		lsls     r5, r5, #\shiftamt					//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		itt      cs
		ldrcs    r0, [r4, #\ofst_bottom_reg]
		stmiacs  r1!, {r0}							//store and adjust Rn's value
		itt      mi
		ldrmi    r0, [r4, #\ofst_bottom_reg + 4]
		stmiami  r1!, {r0}							//store and adjust Rn's value
		.if \want_last_jump
			beq   stmia_done
		.endif
.endm

	stmia_step 1,0x00,1								//R0 & R1 round
	stmia_step 2,0x08,1								//R2 & R3 round
	stmia_step 2,0x10,1								//R4 & R5 round
	stmia_step 2,0x18,1								//R6 & R7 round
	stmia_step 2,0x20,1								//R8 & R9 round
	stmia_step 2,0x28,1								//R10 & R11 round
	stmia_step 2,0x30,1								//R12 & SP round
	stmia_step 2,0x38,0								//LR & PC round

stmia_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	nextinstr

//a4_stmib: STMIB Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Implemented as an increment-after store sequence starting at Rn + 4; the
// extra 4 is removed again before writeback.
// r5 holds the bit-reversed register list (r0's bit on top); r0 is a temporary.
// Clobbers the flags.
a4_stmib:
	//there is no clever way to check two bits at once in the increasing direction (ror sets C and N to the same value)
	//so we reverse the bitmask first so we can do what we do for decrementing direction
	//we also need to mask off the top bits since else our shortcut exit (beq) won't work
	//the cycle to do that may or may not be worth it. TBD
	uxth   r5, r2
	rbit   r5, r5
	
	//"IB" is just an "IA" with an extra increment before (and a decrement after if writeback)
	//we get a speed gain from using shorter "stmia" over a longer "str with writeback", likely worth the extra cycle to preincrement
	adds   r1, #4

//stmib_step: handle two register-list bits per invocation.
//	shiftamt        - shift that moves the next two (reversed) list bits into C and N
//	ofst_bottom_reg - state offset of the lower-numbered register of the pair
//	want_last_jump  - nonzero: leave early once no list bits remain (Z set)
.macro stmib_step,shiftamt,ofst_bottom_reg,want_last_jump
		lsls     r5, r5, #\shiftamt					//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		itt      cs
		ldrcs    r0, [r4, #\ofst_bottom_reg]
		stmiacs  r1!, {r0}							//store and adjust Rn's value
		itt      mi
		ldrmi    r0, [r4, #\ofst_bottom_reg + 4]
		stmiami  r1!, {r0}							//store and adjust Rn's value
		.if \want_last_jump
			beq   stmib_done
		.endif
.endm

	stmib_step 1,0x00,1								//R0 & R1 round
	stmib_step 2,0x08,1								//R2 & R3 round
	stmib_step 2,0x10,1								//R4 & R5 round
	stmib_step 2,0x18,1								//R6 & R7 round
	stmib_step 2,0x20,1								//R8 & R9 round
	stmib_step 2,0x28,1								//R10 & R11 round
	stmib_step 2,0x30,1								//R12 & SP round
	stmib_step 2,0x38,0								//LR & PC round

stmib_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	itt    cs
	subcs  r1, #4									//undo our preincrement (we did it to use stm in the body)
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	nextinstr

//a4_ldmia_s: LDMIA with the S bit set.
// Without ARM_EMU_DEFINE_THE_UNDEFINED this is always treated as undefined.
// With it, the instruction is accepted only when the PC is in the register
// list, and is then executed as a plain LDMIA (falls through). Clobbers r5.
a4_ldmia_s:
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	lsrs   r5, r2, #16							//C = instruction bit 15 (PC in list)
	bcc    a4_undef
	//fallthrough to normal ldmia code below
#endif


//a4_ldmia: LDMIA Rn{!}, {list}.
// On entry: r1 = Rn's value, r2 = instruction, r3 = Rn number.
// Registers are loaded from r0 upwards, each from [r1] followed by r1 += 4.
// A loaded PC goes straight into epc rather than into the state block.
// r5 holds the bit-reversed register list (r0's bit on top); r0 is a temporary.
// Clobbers the flags.
a4_ldmia:
	//there is no clever way to check two bits at once in the increasing direction (ror sets C and N to the same value)
	//so we reverse the bitmask first so we can do what we do for decrementing direction
	//we also need to mask off the top bits since else our shortcut exit (beq) won't work
	//the cycle to do that may or may not be worth it. TBD
	uxth   r5, r2
	rbit   r5, r5

//ldmia_step: handle two register-list bits per invocation.
//	shiftamt        - shift that moves the next two (reversed) list bits into C and N
//	ofst_bottom_reg - state offset of the lower-numbered register of the pair
//	want_last_jump  - nonzero: leave early once no list bits remain (Z set)
//	haspc           - nonzero: the higher-numbered register is the PC, load it into epc
.macro ldmia_step,shiftamt,ofst_bottom_reg,want_last_jump,haspc
		lsls    r5, r5, #\shiftamt					//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag
		itt     cs
		ldmiacs r1!, {r0}							//load and adjust Rn's value
		strcs   r0, [r4, #\ofst_bottom_reg]
		.if \haspc
			it      mi
			ldmiami r1!, {epc}						//load PC and adjust Rn's value
		.else
			itt     mi
			ldmiami r1!, {r0}						//load and adjust Rn's value
			strmi   r0, [r4, #\ofst_bottom_reg + 4]
		.endif
		.if \want_last_jump
			beq   ldmia_done
		.endif
.endm

	ldmia_step 1,0x00,1,0							//R0 & R1 round
	ldmia_step 2,0x08,1,0							//R2 & R3 round
	ldmia_step 2,0x10,1,0							//R4 & R5 round
	ldmia_step 2,0x18,1,0							//R6 & R7 round
	ldmia_step 2,0x20,1,0							//R8 & R9 round
	ldmia_step 2,0x28,1,0							//R10 & R11 round
	ldmia_step 2,0x30,1,0							//R12 & SP round
	ldmia_step 2,0x38,0,1							//LR & PC round
	
ldmia_done:
	lsrs   r0, r2, #22								//C = instruction bit 21 (W)
	it     cs
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	lsrs   r5, epc, #1								//we may have loaded pc. if we did, bail now
	bcs    emu_out_with_epc							//if we loaded an arm address to pc, we loaded it into epc so all is good
	nextinstr

//a4_ldmib_s: LDMIB with the S bit set.
// Without ARM_EMU_DEFINE_THE_UNDEFINED this is always treated as undefined.
// With it, the instruction is accepted only when the PC is in the register
// list, and is then executed as a plain LDMIB (falls through). Clobbers r5.
a4_ldmib_s:
#ifndef ARM_EMU_DEFINE_THE_UNDEFINED
	b      a4_undef
#else
	//Bejeweled likes to use things like "LDMDB   R11, {R4-R8,R11,SP,PC}^" to
	// return from armlets. This is undefined in user and system mode since there
	// is no SPSR to copy to CPSR in that mode. Apparently works in user mode
	// by just pretending that the S bit is clear. Verified on real HW.
	//this patch verifies that if S is set, PC is in the list, since else it is an
	//entirely different instr. If so, falls through to the normal ldm code
	lsrs   r5, r2, #16							//C = instruction bit 15 (PC in list); N is always clear after a right shift
	bcc    a4_undef								//must test C: a "pl" test here would always be taken and reject every instruction
	//fallthrough to normal ldmib code below
#endif

//LDMIB: load multiple, increment before. Uses r1 as the memory pointer (Rn's value) and r3 as Rn's index,
//both as left by the code that got us here (see the writeback in ldmib_done)
a4_ldmib:
	//there is no clever way to check two bits at once in the increasing direction (ror sets C and N to the same value)
	//so we reverse the bitmask first so we can do what we do for decrementing direction
	//we also need to mask off the top bits since else our shortcut exit (beq) won't work
	//the cycle to do that may or may not be worth it. TBD
	uxth   r5, r2								//keep only the 16-bit register list
	rbit   r5, r5								//reverse it: R0's bit is now bit 31, PC's is bit 16
	
	//"IB" is just an "IA" with an extra increment before (and a decrement after if writeback)
	//we get a speed gain from using shorter "ldmia" over a longer "ldr with writeback", likely worth the extra cycle to preincrement
	adds   r1, #4

//ldmib_step: handle one pair of registers from an LDMIB register list (r1 has already been pre-incremented)
//  shiftamt        - shift applied to the bit-reversed list in r5 so that this pair's bits end up in C (lower reg) and N (upper reg)
//  ofst_bottom_reg - byte offset from r4 of the lower register's slot (the upper one lives 4 bytes above)
//  want_last_jump  - nonzero: emit the early exit to ldmib_done, taken once no list bits are left in r5
//  haspc           - nonzero: the upper register of the pair is PC, which is loaded into epc instead of a slot
//r1 is the running memory pointer (advanced by every load), r0 is the transfer temp
.macro ldmib_step,shiftamt,ofst_bottom_reg,want_last_jump,haspc
		lsls    r5, r5, #\shiftamt					//we can test two bits at once by shifting 2 at a time. top goes into C flag, second from top into N flag. Z gets set when nothing remains
		itt     cs
		ldmiacs r1!, {r0}							//load and adjust Rn's value
		strcs   r0, [r4, #\ofst_bottom_reg]			//lower register of the pair <- loaded word
		.if \haspc
			it      mi
			ldmiami r1!, {epc}						//load PC and adjust Rn's value
		.else
			itt     mi
			ldmiami r1!, {r0}						//load and adjust Rn's value
			strmi   r0, [r4, #\ofst_bottom_reg + 4]	//upper register of the pair <- loaded word
		.endif
		.if \want_last_jump
			beq   ldmib_done						//flags are still those of the lsls above (loads and stores do not touch them)
		.endif
.endm

	//walk the register list two registers per step; every step except the last one may leave early to ldmib_done
	ldmib_step 1,0x00,1,0							//R0 & R1 round
	ldmib_step 2,0x08,1,0							//R2 & R3 round
	ldmib_step 2,0x10,1,0							//R4 & R5 round
	ldmib_step 2,0x18,1,0							//R6 & R7 round
	ldmib_step 2,0x20,1,0							//R8 & R9 round
	ldmib_step 2,0x28,1,0							//R10 & R11 round
	ldmib_step 2,0x30,1,0							//R12 & SP round
	ldmib_step 2,0x38,0,1							//LR & PC round (PC goes into epc)
	
ldmib_done:
	lsrs   r0, r2, #22								//instr bit 21 (writeback) -> C
	itt    cs
	subcs  r1, #4									//undo the extra pre-increment done in a4_ldmib
	strcs  r1, [r4, r3, lsl #2]						//write back Rn's value <- r1
	lsrs   r5, epc, #1								//we may have loaded pc. epc bit 0 -> C
	bcs    emu_out_with_epc							//bit 0 set: bail out of the emulator now. else whatever was loaded is an arm address already sitting in epc, so all is good
	nextinstr


//dispatch entry points for dp_imm, emitted right here so that they are within short-branch range of the handler
	dispatch_code_entry_row dp_imm

//data processing with an immediate operand: builds the shifter operand
//  out: r0 = imm8 rotated right by 2 * rot, r1 = flags word whose C bit is the shifter carry out
//then continues in dp_dispatch
dp_imm:
	set_proper_pc_in_regs						//someone might use regs.pc so set it
	uxtb   r0, r2								//get imm8
	ubfx   r1, r2, #8, #4						//get the 4-bit rotate field
	lsls   r1, #1								//rotation amount is twice the field. Z set if there is no rotation
	beq    dp_imm_no_rot
dp_imm_has_rot:
	rors   r0, r1
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch
dp_imm_no_rot:
	mov    r1, esr								//grab existing C bit as shifter carry out
	b      dp_dispatch

//dispatch entry points for dp_reg, emitted right here so that they are within short-branch range of the handler
	dispatch_code_entry_row dp_reg
	
//data processing with a register operand, plus everything else that shares that encoding space.
//extracts the common fields and dispatches on instr bits 4..7
//  out for the handlers: r5 = instr bits 0..3 (Rm), r3 = instr bits 7..11 (shift imm, or Rs << 1 when bit 7 is clear)
dp_reg:
	set_proper_pc_in_regs						//someone might use regs.pc so set it
	ubfx   r5, r2, #0, #4						//get Rm -> r5
	ubfx   r3, r2, #7, #5						//get imm or "Rs << 1" -> r3
	ubfx   r0, r2, #4, #4						//grab bits 4..7 which include shift and other vals that tell us what this is
	tbh    [pc, r0]								//dispatch on shift type and other bits
dp_reg_sh_disp:									//halfword table of (target - table) / 2, indexed by instr bits 4..7
	.hword (dp_r_lsli  - dp_reg_sh_disp) / 2
	.hword (dp_r_lslr  - dp_reg_sh_disp) / 2
	.hword (dp_r_lsri  - dp_reg_sh_disp) / 2
	.hword (dp_r_lsrr  - dp_reg_sh_disp) / 2
	.hword (dp_r_asri  - dp_reg_sh_disp) / 2
	.hword (dp_r_asrr  - dp_reg_sh_disp) / 2
	.hword (dp_r_rori  - dp_reg_sh_disp) / 2	//could be RRX
	.hword (dp_r_rorr  - dp_reg_sh_disp) / 2
	.hword (dp_r_lsli  - dp_reg_sh_disp) / 2
	.hword (dp_mul_swp - dp_reg_sh_disp) / 2	// table 3.2 (SWP, SWPB, MUL, MLA, UMULL, UMLAL, SMULL, SMLAL)
	.hword (dp_r_lsri  - dp_reg_sh_disp) / 2
	.hword (dp_halfw   - dp_reg_sh_disp) / 2	// table 3.2 (LDRH,STRH)
	.hword (dp_r_asri  - dp_reg_sh_disp) / 2
	.hword (dp_ldrd_sb - dp_reg_sh_disp) / 2	// table 3.2 (LDRD, LDRSB)
	.hword (dp_r_rori  - dp_reg_sh_disp) / 2	//could be RRX
	.hword (dp_strd_sh - dp_reg_sh_disp) / 2	// table 3.2 (STRD, LDRSH)

//shifter operand "Rm, LSL #imm". in: r5 = Rm index, r3 = imm. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_lsli:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	cmp    r3, #0								//LSL #0 is a plain register operand
	beq    dp_r_lsli_noshift
	lsls   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch
dp_r_lsli_noshift:
	mov    r1, esr								//grab existing C bit as shifter carry out
	b      dp_dispatch

//shifter operand "Rm, LSL Rs". in: r5 = Rm index, r3 = Rs << 1. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_lslr:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldrb   r3, [r4, r3, lsl #1]					//get Rs's value's low 8 bits -> r3
	msr    APSR_nzcvq, esr						//a shift by zero leaves C untouched, so make sure C is the emulated one beforehand (host flags were clobbered by set_proper_pc_in_regs)
	lsls   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, LSR #imm". in: r5 = Rm index, r3 = imm. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_lsri:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	cmp    r3, #0
	it     eq
	moveq  r3, #32								//an encoded amount of 0 means LSR #32
	lsrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, LSR Rs". in: r5 = Rm index, r3 = Rs << 1. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_lsrr:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldrb   r3, [r4, r3, lsl #1]					//get Rs's value's low 8 bits -> r3
	msr    APSR_nzcvq, esr						//a shift by zero leaves C untouched, so make sure C is the emulated one beforehand (host flags were clobbered by set_proper_pc_in_regs)
	lsrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, ASR #imm". in: r5 = Rm index, r3 = imm. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_asri:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	cmp    r3, #0
	it     eq
	moveq  r3, #32								//an encoded amount of 0 means ASR #32
	asrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, ASR Rs". in: r5 = Rm index, r3 = Rs << 1. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_asrr:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldrb   r3, [r4, r3, lsl #1]					//get Rs's value's low 8 bits -> r3
	msr    APSR_nzcvq, esr						//a shift by zero leaves C untouched, so make sure C is the emulated one beforehand (host flags were clobbered by set_proper_pc_in_regs)
	asrs   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, ROR #imm", or "Rm, RRX" when imm is 0
//in: r5 = Rm index, r3 = imm. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_rori:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	cmp    r3, #0
	beq	   dp_r_rrx								//an encoded amount of 0 means RRX
	rors   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch
dp_r_rrx:
	mov    r3, esr								//copy of SR in r3. NOTE(review): not consumed in this routine - confirm whether dp_dispatch relies on it
	msr    APSR_nzcvq, esr						//stash SR into APSR so RRX can use it
	rrxs   r0, r0								//rotate the emulated C in at the top, bit 0 goes out into C
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//shifter operand "Rm, ROR Rs". in: r5 = Rm index, r3 = Rs << 1. out: r0 = operand, r1 = flags word with the shifter carry out in C
dp_r_rorr:
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldrb   r3, [r4, r3, lsl #1]					//get Rs's value's low 8 bits -> r3
	msr    APSR_nzcvq, esr						//a rotate by zero leaves C untouched, so make sure C is the emulated one beforehand (host flags were clobbered by set_proper_pc_in_regs)
	rors   r0, r3
	mrs    r1, APSR								//grab shifter carry out
	b      dp_dispatch

//multiplies and swaps (instr bits 4..7 == 0b1001): second level dispatch on instr bits 20..24
//in: r5 = Rm index. out for the handlers: r0 = Rm's value
dp_mul_swp:
	ubfx   r3, r2, #20, #5						//isolate dispatchable bits
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	tbh    [pc, r3]
dp_mul_dis_tb:									//halfword table of (target - table) / 2, indexed by instr bits 20..24
	.hword (dp_mul     - dp_mul_dis_tb) / 2
	.hword (dp_muls    - dp_mul_dis_tb) / 2
	.hword (dp_mla     - dp_mul_dis_tb) / 2
	.hword (dp_mlas    - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_umull   - dp_mul_dis_tb) / 2
	.hword (dp_umulls  - dp_mul_dis_tb) / 2
	.hword (dp_umlal   - dp_mul_dis_tb) / 2
	.hword (dp_umlals  - dp_mul_dis_tb) / 2
	.hword (dp_smull   - dp_mul_dis_tb) / 2
	.hword (dp_smulls  - dp_mul_dis_tb) / 2
	.hword (dp_smlal   - dp_mul_dis_tb) / 2
	.hword (dp_smlals  - dp_mul_dis_tb) / 2
	.hword (dp_swp     - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_swpb    - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2
	.hword (dp_mul_und - dp_mul_dis_tb) / 2

//MUL Rd, Rm, Rs. in: r0 = Rm's value. emulated flags (esr) are left alone
dp_mul:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	muls   r1, r0								//flag-setting form is just the short encoding; the host flags it produces are not used
	ubfx   r0, r2, #16, #4						//get Rd -> r0
	str    r1, [r4, r0, lsl #2]					//set Rd's value <- r1
	nextinstr

//MULS Rd, Rm, Rs. in: r0 = Rm's value. the host's muls updates the flags, which are then captured into esr
dp_muls:
	msr    APSR_nzcvq, esr						//stash SR into APSR
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	muls   r1, r0
	mrs    esr, APSR							//grab resulting CPSR into SR
	ubfx   r0, r2, #16, #4						//get Rd -> r0
	str    r1, [r4, r0, lsl #2]					//set Rd's value <- r1
	nextinstr

//MLA Rd, Rm, Rs, Rn: Rd = Rm * Rs + Rn. in: r0 = Rm's value. emulated flags are left alone
dp_mla:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r5, r2, #12, #4						//get Rn -> r5
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r5, lsl #2]					//get Rn's value -> r5
	mla    r1, r0, r1, r5
	ubfx   r0, r2, #16, #4						//get Rd -> r0
	str    r1, [r4, r0, lsl #2]					//set Rd's value <- r1
	nextinstr

//MLAS Rd, Rm, Rs, Rn: Rd = Rm * Rs + Rn, flags from the result. in: r0 = Rm's value
dp_mlas:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this
	msr    APSR_nzcvq, esr						//stash it into APSR
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r5, r2, #12, #4						//get Rn -> r5
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r5, lsl #2]					//get Rn's value -> r5
	mla    r1, r0, r1, r5
	tst    r1, r1								//get condition codes
	mrs    esr, APSR							//grab resulting CPSR into SR
	ubfx   r0, r2, #16, #4						//get Rd -> r0
	str    r1, [r4, r0, lsl #2]					//set Rd's value <- r1
	nextinstr

//SMULL RdLo, RdHi, Rm, Rs: signed 32x32 -> 64 multiply. in: r0 = Rm's value. emulated flags are left alone
dp_smull:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	smull  r1, r3, r0, r1						//r1 = low word, r3 = high word
	ubfx   r12, r2, #12, #4						//get RdLo -> r12
	ubfx   r0, r2, #16, #4						//get RdHi -> r0
	str    r1, [r4, r12, lsl #2]				//set RdLo's value <- r1
	str    r3, [r4, r0, lsl #2]					//set RdHi's value <- r3
	nextinstr

//SMULLS RdLo, RdHi, Rm, Rs: signed 32x32 -> 64 multiply, N and Z from the 64-bit result. in: r0 = Rm's value
dp_smulls:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	smull  r1, r3, r0, r1						//r1 = low word, r3 = high word
	ubfx   r12, r2, #12, #4						//get RdLo -> r12
	ubfx   r0, r2, #16, #4						//get RdHi -> r0
	str    r1, [r4, r12, lsl #2]				//set RdLo's value <- r1
	str    r3, [r4, r0, lsl #2]					//set RdHi's value <- r3
	lsrs   r0, r3, #31							//top bit of RdHi into r0
	lsls   r0, #1								//prepare it to go into N and a zero into Z
	bfi    esr, r0, #30, #2						//set SR's N to top bit of RdHi and Z to 0
	orrs   r1, r3								//test for entire result being zero
	it     eq
	orreq  esr, esr, #0x40000000				//set SR's Z
	nextinstr

//SMLAL RdLo, RdHi, Rm, Rs: RdHi:RdLo += signed Rm * Rs. in: r0 = Rm's value. emulated flags are left alone
dp_smlal:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r3, r2, #12, #4						//get RdLo -> r3
	ubfx   r12, r2, #16, #4						//get RdHi -> r12
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r3, lsl #2]					//get RdLo's value -> r5
	ldr    r2, [r4, r12, lsl #2]				//get RdHi's value -> r2 (instr is no longer needed)
	smlal  r5, r2, r0, r1
	str    r2, [r4, r12, lsl #2]				//set RdHi's value <- r2
	str    r5, [r4, r3, lsl #2]					//set RdLo's value <- r5
	nextinstr

//SMLALS RdLo, RdHi, Rm, Rs: RdHi:RdLo += signed Rm * Rs, N and Z from the 64-bit result. in: r0 = Rm's value
dp_smlals:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r3, r2, #12, #4						//get RdLo -> r3
	ubfx   r12, r2, #16, #4						//get RdHi -> r12
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r3, lsl #2]					//get RdLo's value -> r5
	ldr    r2, [r4, r12, lsl #2]				//get RdHi's value -> r2 (instr is no longer needed)
	smlal  r5, r2, r0, r1
	str    r2, [r4, r12, lsl #2]				//set RdHi's value <- r2
	str    r5, [r4, r3, lsl #2]					//set RdLo's value <- r5
	lsrs   r0, r2, #31							//top bit of RdHi into r0
	lsls   r0, #1								//prepare it to go into N and a zero into Z
	bfi    esr, r0, #30, #2						//set SR's N to top bit of RdHi and Z to 0
	orrs   r5, r2								//test for entire result being zero
	it     eq
	orreq  esr, esr, #0x40000000				//set SR's Z
	nextinstr

//UMULL RdLo, RdHi, Rm, Rs: unsigned 32x32 -> 64 multiply. in: r0 = Rm's value. emulated flags are left alone
dp_umull:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	umull  r1, r3, r0, r1						//r1 = low word, r3 = high word
	ubfx   r12, r2, #12, #4						//get RdLo -> r12
	ubfx   r0, r2, #16, #4						//get RdHi -> r0
	str    r1, [r4, r12, lsl #2]				//set RdLo's value <- r1
	str    r3, [r4, r0, lsl #2]					//set RdHi's value <- r3
	nextinstr

//UMULLS RdLo, RdHi, Rm, Rs: unsigned 32x32 -> 64 multiply, N and Z from the 64-bit result. in: r0 = Rm's value
dp_umulls:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	umull  r1, r3, r0, r1						//r1 = low word, r3 = high word
	ubfx   r12, r2, #12, #4						//get RdLo -> r12
	ubfx   r0, r2, #16, #4						//get RdHi -> r0
	str    r1, [r4, r12, lsl #2]				//set RdLo's value <- r1
	str    r3, [r4, r0, lsl #2]					//set RdHi's value <- r3
	lsrs   r0, r3, #31							//top bit of RdHi into r0
	lsls   r0, #1								//prepare it to go into N and a zero into Z
	bfi    esr, r0, #30, #2						//set SR's N to top bit of RdHi and Z to 0
	orrs   r1, r3								//test for entire result being zero
	it     eq
	orreq  esr, esr, #0x40000000				//set SR's Z
	nextinstr

//UMLAL RdLo, RdHi, Rm, Rs: RdHi:RdLo += unsigned Rm * Rs. in: r0 = Rm's value. emulated flags are left alone
dp_umlal:
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r3, r2, #12, #4						//get RdLo -> r3
	ubfx   r12, r2, #16, #4						//get RdHi -> r12
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r3, lsl #2]					//get RdLo's value -> r5
	ldr    r2, [r4, r12, lsl #2]				//get RdHi's value -> r2 (instr is no longer needed)
	umlal  r5, r2, r0, r1
	str    r2, [r4, r12, lsl #2]				//set RdHi's value <- r2
	str    r5, [r4, r3, lsl #2]					//set RdLo's value <- r5
	nextinstr

//UMLALS RdLo, RdHi, Rm, Rs: RdHi:RdLo += unsigned Rm * Rs, N and Z from the 64-bit result. in: r0 = Rm's value
dp_umlals:										//T2 has no flag-setting version of this, so we synthesize it. Luckily nobody uses this
	ubfx   r1, r2, #8, #4						//get Rs -> r1
	ubfx   r3, r2, #12, #4						//get RdLo -> r3
	ubfx   r12, r2, #16, #4						//get RdHi -> r12
	ldr    r1, [r4, r1, lsl #2]					//get Rs's value -> r1
	ldr    r5, [r4, r3, lsl #2]					//get RdLo's value -> r5
	ldr    r2, [r4, r12, lsl #2]				//get RdHi's value -> r2 (instr is no longer needed)
	umlal  r5, r2, r0, r1
	str    r2, [r4, r12, lsl #2]				//set RdHi's value <- r2
	str    r5, [r4, r3, lsl #2]					//set RdLo's value <- r5
	lsrs   r0, r2, #31							//top bit of RdHi into r0
	lsls   r0, #1								//prepare it to go into N and a zero into Z
	bfi    esr, r0, #30, #2						//set SR's N to top bit of RdHi and Z to 0
	orrs   r5, r2								//test for entire result being zero
	it     eq
	orreq  esr, esr, #0x40000000				//set SR's Z
	nextinstr

//SWP Rd, Rm, [Rn]: word at [Rn] -> Rd while Rm -> [Rn], done as an exclusive load/store retry loop. in: r0 = Rm's value
dp_swp:
	ubfx   r1, r2, #16, #4						//get Rn -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rn's value -> r1
1:
	ldrex  r3, [r1]								//old memory value -> r3
	strex  r5, r0, [r1]							//try to store Rm's value, status -> r5
	cmp    r5, #0
	bne    1b									//store did not go through, retry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	str    r3, [r4, r1, lsl #2]					//set Rd's value <- r3
	nextinstr

//SWPB Rd, Rm, [Rn]: byte at [Rn] -> Rd while Rm's low byte -> [Rn], done as an exclusive load/store retry loop. in: r0 = Rm's value
dp_swpb:
	ubfx   r1, r2, #16, #4						//get Rn -> r1
	ldr    r1, [r4, r1, lsl #2]					//get Rn's value -> r1
1:
	ldrexb r3, [r1]								//old memory byte -> r3
	strexb r5, r0, [r1]							//try to store Rm's low byte, status -> r5
	cmp    r5, #0
	bne    1b									//store did not go through, retry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	str    r3, [r4, r1, lsl #2]					//set Rd's value <- r3
	nextinstr

//target of all dp_mul_dis_tb slots that have no handler: raise an undefined instruction on the host
dp_mul_und:
	udf    #0x00

//LDRH / STRH (instr bits 4..7 == 0b1011): second level dispatch on instr bits 20..24 ('PUiWL')
//in: r5 = instr bits 0..3 (Rm index, or low nibble of the immediate). handlers are named after the bit pattern
dp_halfw:
	ubfx   r1, r2, #20, #5						//dispatch on 'PUiWL'
	tbh    [pc, r1]
dp_halfw_disp:									//halfword table of (target - table) / 2, indexed by 'PUiWL'
	.hword (dp_hw_00000 - dp_halfw_disp) / 2	//STRH Rd, [Rn], -Rm
	.hword (dp_hw_00001 - dp_halfw_disp) / 2	//LDRH Rd, [Rn], -Rm
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_00100 - dp_halfw_disp) / 2	//STRH Rd, [Rn], -#imm
	.hword (dp_hw_00101 - dp_halfw_disp) / 2	//LDRH Rd, [Rn], -#imm
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_01000 - dp_halfw_disp) / 2	//STRH Rd, [Rn], +Rm
	.hword (dp_hw_01001 - dp_halfw_disp) / 2	//LDRH Rd, [Rn], +Rm
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_01100 - dp_halfw_disp) / 2	//STRH Rd, [Rn], +#imm
	.hword (dp_hw_01101 - dp_halfw_disp) / 2	//LDRH Rd, [Rn], +#imm
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_undef - dp_halfw_disp) / 2
	.hword (dp_hw_10000 - dp_halfw_disp) / 2	//STRH Rd, [Rn, -Rm]
	.hword (dp_hw_10001 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, -Rm]
	.hword (dp_hw_10010 - dp_halfw_disp) / 2	//STRH Rd, [Rn, -Rm]!
	.hword (dp_hw_10011 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, -Rm]!
	.hword (dp_hw_10100 - dp_halfw_disp) / 2	//STRH Rd, [Rn, -#imm]
	.hword (dp_hw_10101 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, -#imm]
	.hword (dp_hw_10110 - dp_halfw_disp) / 2	//STRH Rd, [Rn, -#imm]!
	.hword (dp_hw_10111 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, -#imm]!
	.hword (dp_hw_11000 - dp_halfw_disp) / 2	//STRH Rd, [Rn, +Rm]
	.hword (dp_hw_11001 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, +Rm]
	.hword (dp_hw_11010 - dp_halfw_disp) / 2	//STRH Rd, [Rn, +Rm]!
	.hword (dp_hw_11011 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, +Rm]!
	.hword (dp_hw_11100 - dp_halfw_disp) / 2	//STRH Rd, [Rn, +#imm]
	.hword (dp_hw_11101 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, +#imm]
	.hword (dp_hw_11110 - dp_halfw_disp) / 2	//STRH Rd, [Rn, +#imm]! (must be the store handler, not the 11111 load one)
	.hword (dp_hw_11111 - dp_halfw_disp) / 2	//LDRH Rd, [Rn, +#imm]!

//target of all dp_halfw_disp slots that have no handler: raise an undefined instruction on the host
dp_hw_undef:
	udf    #0x00

dp_hw_00000:									//STRH Rd, [Rn], -Rm
	//post-indexed: store to [Rn], then Rn -= Rm. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5]								//perform the store
	subs   r5, r0								//post-index: Rn - Rm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_00001:									//LDRH Rd, [Rn], -Rm
	//post-indexed: load from [Rn], then Rn -= Rm. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	subs   r5, r0								//post-index: Rn - Rm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_00100:									//STRH Rd, [Rn], -#imm
	//post-indexed: store to [Rn], then Rn -= imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5]								//perform the store
	subs   r5, r0								//post-index: Rn - imm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_00101:									//LDRH Rd, [Rn], -#imm
	//post-indexed: load from [Rn], then Rn -= imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	subs   r5, r0								//post-index: Rn - imm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_01000:									//STRH Rd, [Rn], +Rm
	//post-indexed: store to [Rn], then Rn += Rm. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5]								//perform the store
	adds   r5, r0								//post-index: Rn + Rm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_01001:									//LDRH Rd, [Rn], +Rm
	//post-indexed: load from [Rn], then Rn += Rm. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	adds   r5, r0								//post-index: Rn + Rm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_01100:									//STRH Rd, [Rn], +#imm
	//post-indexed: store to [Rn], then Rn += imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5]								//perform the store
	adds   r5, r0								//post-index: Rn + imm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_01101:									//LDRH Rd, [Rn], +#imm
	//post-indexed: load from [Rn], then Rn += imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	adds   r5, r0								//post-index: Rn + imm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_hw_10000:									//STRH Rd, [Rn, -Rm]
	//offset addressing: store to [Rn - Rm], Rn is not modified. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - Rm
	strh   r2, [r5]								//perform the store
	nextinstr

dp_hw_10001:									//LDRH Rd, [Rn, -Rm]
	//offset addressing: load from [Rn - Rm], Rn is not modified. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - Rm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	nextinstr

dp_hw_10010:									//STRH Rd, [Rn, -Rm]!
	//pre-indexed with writeback: Rn -= Rm, then store to [Rn]. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - Rm
	strh   r2, [r5]								//perform the store
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_10011:									//LDRH Rd, [Rn, -Rm]!
	//pre-indexed with writeback: Rn -= Rm, then load from [Rn]. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - Rm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_10100:									//STRH Rd, [Rn, -#imm]
	//offset addressing: store to [Rn - imm], Rn is not modified. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - imm
	strh   r2, [r5]								//perform the store
	nextinstr

dp_hw_10101:									//LDRH Rd, [Rn, -#imm]
	//offset addressing: load from [Rn - imm], Rn is not modified. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - imm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	nextinstr

dp_hw_10110:									//STRH Rd, [Rn, -#imm]!
	//pre-indexed with writeback: Rn -= imm, then store to [Rn]. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - imm
	strh   r2, [r5]								//perform the store
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_10111:									//LDRH Rd, [Rn, -#imm]!
	//pre-indexed with writeback: Rn -= imm, then load from [Rn]. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//address = Rn - imm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_11000:									//STRH Rd, [Rn, +Rm]
	//offset addressing: store to [Rn + Rm], Rn is not modified. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5, r0]							//perform the store
	nextinstr

dp_hw_11001:									//LDRH Rd, [Rn, +Rm]
	//offset addressing: load from [Rn + Rm], Rn is not modified. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5, r0]							//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	nextinstr

dp_hw_11010:									//STRH Rd, [Rn, +Rm]!
	//pre-indexed with writeback: Rn += Rm, then store to [Rn]. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//address = Rn + Rm
	strh   r2, [r5]								//perform the store
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_11011:									//LDRH Rd, [Rn, +Rm]!
	//pre-indexed with writeback: Rn += Rm, then load from [Rn]. r5 = Rm index on entry
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//address = Rn + Rm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_11100:									//STRH Rd, [Rn, +#imm]
	//offset addressing: store to [Rn + imm], Rn is not modified. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	strh   r2, [r5, r0]							//perform the store
	nextinstr

dp_hw_11101:									//LDRH Rd, [Rn, +#imm]
	//offset addressing: load from [Rn + imm], Rn is not modified. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrh   r2, [r5, r0]							//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	nextinstr

dp_hw_11110:									//STRH Rd, [Rn, +#imm]!
	//pre-indexed with writeback: Rn += imm, then store to [Rn]. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r2, [r4, r1, lsl #2]					//get Rd's value -> r2 (instr is no longer needed)
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//address = Rn + imm
	strh   r2, [r5]								//perform the store
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_hw_11111:									//LDRH Rd, [Rn, +#imm]!
	//pre-indexed with writeback: Rn += imm, then load from [Rn]. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//address = Rn + imm
	ldrh   r2, [r5]								//perform the load (instr is no longer needed)
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2 (done last, so the loaded value is what sticks if Rd is Rn)
	nextinstr

//LDRD / LDRSB (instr bits 4..7 == 0b1101): second level dispatch on instr bits 20..24 ('PUiWL')
//in: r5 = instr bits 0..3 (Rm index, or low nibble of the immediate). handlers are named after the bit pattern
dp_ldrd_sb:
	ubfx   r1, r2, #20, #5						//dispatch on 'PUiWL'
	tbh    [pc, r1]
dp_ldrdsb_dsp:									//halfword table of (target - table) / 2, indexed by 'PUiWL'
	.hword (dp_sd_00000 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn], -Rm
	.hword (dp_sd_00001 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn], -Rm
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_00100 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn], -#imm
	.hword (dp_sd_00101 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn], -#imm
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_01000 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn], +Rm
	.hword (dp_sd_01001 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn], +Rm
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_01100 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn], +#imm
	.hword (dp_sd_01101 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn], +#imm
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_undef - dp_ldrdsb_dsp) / 2
	.hword (dp_sd_10000 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, -Rm]
	.hword (dp_sd_10001 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, -Rm]
	.hword (dp_sd_10010 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, -Rm]!
	.hword (dp_sd_10011 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, -Rm]!
	.hword (dp_sd_10100 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, -#imm]
	.hword (dp_sd_10101 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, -#imm]
	.hword (dp_sd_10110 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, -#imm]!
	.hword (dp_sd_10111 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, -#imm]!
	.hword (dp_sd_11000 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, +Rm]
	.hword (dp_sd_11001 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, +Rm]
	.hword (dp_sd_11010 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, +Rm]!
	.hword (dp_sd_11011 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, +Rm]!
	.hword (dp_sd_11100 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, +#imm]
	.hword (dp_sd_11101 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, +#imm]
	.hword (dp_sd_11110 - dp_ldrdsb_dsp) / 2	//LDRD  Rd, [Rn, +#imm]! (must be the LDRD handler, not the 11111 LDRSB one)
	.hword (dp_sd_11111 - dp_ldrdsb_dsp) / 2	//LDRSB Rd, [Rn, +#imm]!

//target of all dp_ldrdsb_dsp slots that have no handler: raise an undefined instruction on the host
dp_sd_undef:
	udf    #0x00

dp_sd_00000:									//LDRD Rd, [Rn], -Rm
	//post-indexed: load two words from [Rn] into Rd and the register after it, then Rn -= Rm. r5 = Rm index on entry
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrd   r12, r2, [r5]						//perform the doubleload (instr is no longer needed)
	add    r1, r4, r1, lsl #2					//calc where we'll store the two regs
	strd   r12, r2, [r1]						//store the two words into (Rd, Rd+1)
	subs   r5, r0								//post-index: Rn - Rm
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_00001:									//LDRSB Rd, [Rn], -Rm
	//post-indexed: load a sign-extended byte from [Rn], then Rn -= Rm. r5 = Rm index on entry
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r0, r5, r0							//post-index: new Rn = Rn - Rm -> r0 (r5 keeps the access address)
	ldrsb  r2, [r5]								//perform the load (instr is no longer needed)
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r0, [r4, r3, lsl #2]					//set Rn's value <- r0
	nextinstr

dp_sd_00100:									//LDRD Rd, [Rn], -#imm
	//post-indexed: load two words from [Rn] into Rd and the register after it, then Rn -= imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrd   r12, r2, [r5]						//perform the doubleload (instr is no longer needed)
	add    r1, r4, r1, lsl #2					//calc where we'll store the two regs
	subs   r5, r0								//post-index: Rn - imm
	strd   r12, r2, [r1]						//store the two words into (Rd, Rd+1)
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_00101:									//LDRSB Rd, [Rn], -#imm
	//post-indexed: load a sign-extended byte from [Rn], then Rn -= imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrsb  r2, [r5]								//perform the load (instr is no longer needed)
	subs   r0, r5, r0							//post-index: new Rn = Rn - imm -> r0
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r0, [r4, r3, lsl #2]					//set Rn's value <- r0
	nextinstr

dp_sd_01000:									//LDRD Rd, [Rn], +Rm
	//post-indexed: load two words from [Rn] into Rd and the register after it, then Rn += Rm. r5 = Rm index on entry
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrd   r12, r2, [r5]						//perform the doubleload (instr is no longer needed)
	add    r1, r4, r1, lsl #2					//calc where we'll store the two regs
	adds   r5, r0								//post-index: Rn + Rm
	strd   r12, r2, [r1]						//store the two words into (Rd, Rd+1)
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_01001:									//LDRSB Rd, [Rn], +Rm
	//post-indexed: load a sign-extended byte from [Rn], then Rn += Rm. r5 = Rm index on entry
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrsb  r2, [r5]								//perform the load from the unmodified base: the offset only applies afterwards (instr is no longer needed)
	adds   r5, r0								//post-index: Rn + Rm
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_01100:									//LDRD Rd, [Rn], +#imm
	//post-indexed: load two words from [Rn] into Rd and the register after it, then Rn += imm. r5 = low nibble of imm on entry
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrd   r12, r2, [r5]						//perform the doubleload (instr is no longer needed)
	add    r1, r4, r1, lsl #2					//calc where we'll store the two regs
	adds   r5, r0								//post-index: Rn + imm
	strd   r12, r2, [r1]						//store the two words into (Rd, Rd+1)
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_01101:									//LDRSB Rd, [Rn], +#imm  (post-indexed, immediate offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	//post-indexed addressing accesses memory at the ORIGINAL base; the offset is only applied to the
	//value written back to Rn. (The previous code loaded from [base + offset], which is the
	//pre-indexed address.)
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	ldrsb  r2, [r5]								//perform the load from the unmodified base
	adds   r5, r0								//postincrement
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5
	nextinstr

dp_sd_10000:									//LDRD Rd, [Rn, -Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	nextinstr

dp_sd_10001:									//LDRSB Rd, [Rn, -Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsb  r2, [r5]								//sign-extending byte load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_sd_10010:									//LDRD Rd, [Rn, -Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_sd_10011:									//LDRSB Rd, [Rn, -Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	//pre-indexed addressing must apply the offset BEFORE the memory access; the previous code
	//loaded from the unmodified base and only used the offset for the writeback value.
	ldr    r0, [r4, r5, lsl #2]					//get Rm's value -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	subs   r5, r0								//preincrement: r5 = base - offset = effective address
	ldrsb  r2, [r5]								//perform the load from the effective address
	str    r2, [r4, r1, lsl #2]					//set Rd's value <- r2
	str    r5, [r4, r3, lsl #2]					//set Rn's value <- r5 (writeback)
	nextinstr

dp_sd_10100:									//LDRD Rd, [Rn, -#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	nextinstr

dp_sd_10101:									//LDRSB Rd, [Rn, -#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsb  r2, [r5]								//sign-extending byte load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_sd_10110:									//LDRD Rd, [Rn, -#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_sd_10111:									//LDRSB Rd, [Rn, -#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsb  r2, [r5]								//sign-extending byte load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_sd_11000:									//LDRD Rd, [Rn, +Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	adds   r5, r5, r0							//r5 = effective address = base + offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	nextinstr

dp_sd_11001:									//LDRSB Rd, [Rn, +Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsb  r2, [r5, r0]							//sign-extending byte load from base + offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_sd_11010:									//LDRD Rd, [Rn, +Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	adds   r5, r5, r0							//r5 = effective address = base + offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_sd_11011:									//LDRSB Rd, [Rn, +Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsb  r2, [r5, r0]							//load from base + offset using the register-offset form
	adds   r5, r5, r0							//r5 = effective address, for writeback
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value (stored after Rn, as before)
	nextinstr

dp_sd_11100:									//LDRD Rd, [Rn, +#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	//the previous code computed the immediate into r0 but never applied it, so every access
	//went to [Rn] regardless of the encoded offset. The offset is now added before the load.
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//r5 = effective address = base + imm (Rn itself is not written back)
	ldrd   r12, r2, [r5]						//perform the doubleload
	add    r1, r4, r1, lsl #2					//calc where we'll store the two regs
	strd   r12, r2, [r1]						//store the two words into (Rd, Rd+1)
	nextinstr

dp_sd_11101:									//LDRSB Rd, [Rn, +#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsb  r2, [r5, r0]							//sign-extending byte load from base + offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_sd_11110:									//LDRD Rd, [Rn, +#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	adds   r5, r5, r0							//r5 = effective address = base + offset
	ldrd   r12, r2, [r5]						//fetch both words (overwrites the instr copy in r2)
	strd   r12, r2, [r1]						//regs[Rd], regs[Rd+1] = loaded pair
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_sd_11111:									//LDRSB Rd, [Rn, +#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsb  r2, [r5, r0]							//load from base + offset using the register-offset form
	adds   r5, r5, r0							//r5 = effective address, for writeback
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_strd_sh:										// table 3.2 (STRD, LDRSH)
	//Second-level dispatch for the STRD / LDRSH encodings.
	//in: r2 = instr. Bits 24..20 of the instruction ('PUiWL') select one of 32 handlers via tbh.
	//Each table entry is the halfword distance from dp_strdsh_dsp to the handler.
	//Even slots (L=0) are STRD, odd slots (L=1) are LDRSH; the P=0,W=1 combinations are undefined.
	//Slot 30 (STRD Rd, [Rn, +#imm]!) previously pointed at dp_hd_11111, the LDRSH handler;
	//it now points at dp_hd_11110.
	ubfx   r1, r2, #20, #5						//dispatch on 'PUiWL'
	tbh    [pc, r1]
dp_strdsh_dsp:
	.hword (dp_hd_00000 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn], -Rm
	.hword (dp_hd_00001 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn], -Rm
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_00100 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn], -#imm
	.hword (dp_hd_00101 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn], -#imm
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_01000 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn], +Rm
	.hword (dp_hd_01001 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn], +Rm
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_01100 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn], +#imm
	.hword (dp_hd_01101 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn], +#imm
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_undef - dp_strdsh_dsp) / 2
	.hword (dp_hd_10000 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, -Rm]
	.hword (dp_hd_10001 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, -Rm]
	.hword (dp_hd_10010 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, -Rm]!
	.hword (dp_hd_10011 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, -Rm]!
	.hword (dp_hd_10100 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, -#imm]
	.hword (dp_hd_10101 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, -#imm]
	.hword (dp_hd_10110 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, -#imm]!
	.hword (dp_hd_10111 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, -#imm]!
	.hword (dp_hd_11000 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, +Rm]
	.hword (dp_hd_11001 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, +Rm]
	.hword (dp_hd_11010 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, +Rm]!
	.hword (dp_hd_11011 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, +Rm]!
	.hword (dp_hd_11100 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, +#imm]
	.hword (dp_hd_11101 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, +#imm]
	.hword (dp_hd_11110 - dp_strdsh_dsp) / 2	//STRD  Rd, [Rn, +#imm]!
	.hword (dp_hd_11111 - dp_strdsh_dsp) / 2	//LDRSH Rd, [Rn, +#imm]!

dp_hd_undef:									//target of the dp_strdsh_dsp slots that have no STRD/LDRSH handler
	udf    #0x00								//executes a host undefined-instruction; there is no fall-through path

dp_hd_00000:									//STRD Rd, [Rn], -Rm  (post-indexed, register offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	strd   r12, r2, [r5]						//store the pair at the unmodified base
	subs   r5, r5, r0							//post-index: base -= offset
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_00001:									//LDRSH Rd, [Rn], -Rm  (post-indexed, register offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrsh  r2, [r5]								//sign-extending halfword load from the unmodified base
	subs   r5, r5, r0							//post-index: base -= offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_00100:									//STRD Rd, [Rn], -#imm  (post-indexed, immediate offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	strd   r12, r2, [r5]						//store the pair at the unmodified base
	subs   r5, r5, r0							//post-index: base -= offset
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_00101:									//LDRSH Rd, [Rn], -#imm  (post-indexed, immediate offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrsh  r2, [r5]								//sign-extending halfword load from the unmodified base
	subs   r5, r5, r0							//post-index: base -= offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_01000:									//STRD Rd, [Rn], +Rm  (post-indexed, register offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	strd   r12, r2, [r5]						//store the pair at the unmodified base
	adds   r5, r5, r0							//post-index: base += offset
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_01001:									//LDRSH Rd, [Rn], +Rm  (post-indexed, register offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrsh  r2, [r5]								//sign-extending halfword load from the unmodified base
	adds   r5, r5, r0							//post-index: base += offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_01100:									//STRD Rd, [Rn], +#imm  (post-indexed, immediate offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	strd   r12, r2, [r5]						//store the pair at the unmodified base
	adds   r5, r5, r0							//post-index: base += offset
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_01101:									//LDRSH Rd, [Rn], +#imm  (post-indexed, immediate offset)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = base address = regs[Rn]
	ldrsh  r2, [r5]								//sign-extending halfword load from the unmodified base
	adds   r5, r5, r0							//post-index: base += offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//regs[Rn] = updated base
	nextinstr

dp_hd_10000:									//STRD Rd, [Rn, -Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	subs   r5, r5, r0							//r5 = effective address = base - offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	nextinstr

dp_hd_10001:									//LDRSH Rd, [Rn, -Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsh  r2, [r5]								//sign-extending halfword load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_hd_10010:									//STRD Rd, [Rn, -Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	subs   r5, r5, r0							//r5 = effective address = base - offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_10011:									//LDRSH Rd, [Rn, -Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsh  r2, [r5]								//sign-extending halfword load
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value (stored after Rn, as before)
	nextinstr

dp_hd_10100:									//STRD Rd, [Rn, -#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	subs   r5, r5, r0							//r5 = effective address = base - offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	nextinstr

dp_hd_10101:									//LDRSH Rd, [Rn, -#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsh  r2, [r5]								//sign-extending halfword load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_hd_10110:									//STRD Rd, [Rn, -#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	subs   r5, r5, r0							//r5 = effective address = base - offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_10111:									//LDRSH Rd, [Rn, -#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	subs   r5, r5, r0							//r5 = effective address = base - offset
	ldrsh  r2, [r5]								//sign-extending halfword load
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_11000:									//STRD Rd, [Rn, +Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	adds   r5, r5, r0							//r5 = effective address = base + offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	nextinstr

dp_hd_11001:									//LDRSH Rd, [Rn, +Rm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsh  r2, [r5, r0]							//sign-extending halfword load from base + offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_hd_11010:									//STRD Rd, [Rn, +Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	adds   r5, r5, r0							//r5 = effective address = base + offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_11011:									//LDRSH Rd, [Rn, +Rm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = used as the Rm register number
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ldr    r0, [r4, r5, lsl #2]					//r0 = offset = regs[Rm]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsh  r2, [r5, r0]							//load from base + offset using the register-offset form
	adds   r5, r5, r0							//r5 = effective address, for writeback
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_11100:									//STRD Rd, [Rn, +#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	//the previous code computed the immediate into r0 but never applied it, so every store
	//went to [Rn] regardless of the encoded offset. The offset is now added before the store.
	ubfx   r0, r2, #8, #4						//get immedH
	add    r0, r5, r0, lsl #4					//calculate imm -> r0
	ubfx   r1, r2, #12, #4						//get Rd -> r1
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r5, [r4, r3, lsl #2]					//get Rn's value -> r5
	adds   r5, r0								//r5 = effective address = base + imm (Rn itself is not written back)
	add    r1, r4, r1, lsl #2					//calc where we'll get the two regs
	ldrd   r12, r2, [r1]						//get the two words into (Rd, Rd+1)
	strd   r12, r2, [r5]						//perform the doublestore
	nextinstr

dp_hd_11101:									//LDRSH Rd, [Rn, +#imm]  (offset addressing, no writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsh  r2, [r5, r0]							//sign-extending halfword load from base + offset
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	nextinstr

dp_hd_11110:									//STRD Rd, [Rn, +#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	add    r1, r4, r1, lsl #2					//r1 = &regs[Rd]
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrd   r12, r2, [r1]						//r12, r2 = regs[Rd], regs[Rd+1] (instr in r2 no longer needed)
	adds   r5, r5, r0							//r5 = effective address = base + offset
	strd   r12, r2, [r5]						//store the pair at the effective address
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_hd_11111:									//LDRSH Rd, [Rn, +#imm]!  (pre-indexed with writeback)
	//in: r2 = instr, r4 = emulState (register file base), r5 = low part of the offset (presumably immedL, prepared by the caller)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ubfx   r1, r2, #12, #4						//r1 = Rd register number
	ubfx   r0, r2, #8, #4						//r0 = immedH (instr bits 11..8)
	add    r0, r5, r0, lsl #4					//r0 = (immedH << 4) + r5 = full offset
	ldr    r5, [r4, r3, lsl #2]					//r5 = regs[Rn]
	ldrsh  r2, [r5, r0]							//load from base + offset using the register-offset form
	adds   r5, r5, r0							//r5 = effective address, for writeback
	str    r2, [r4, r1, lsl #2]					//regs[Rd] = loaded value
	str    r5, [r4, r3, lsl #2]					//writeback: regs[Rn] = effective address
	nextinstr

dp_dispatch:									//r0 has op2, r1 has APSR with C bit set to shifter carry out, r2 still has instr
	//Data-processing dispatch. The 6-bit index is instr bits 25..20: bit 25 (immediate-operand form),
	//the 4-bit opcode, and the S bit. Each table entry is the halfword distance from dp_disp_tab to
	//the handler. Entries 0..31 cover bit25 = 0, entries 32..63 cover bit25 = 1; the two halves differ
	//only in the opcode 8..11, S = 0 slots (indexes 16/18/20/22 vs. 48/50/52/54).
	ubfx   r3, r2, #20, #6						//dispatch on [[25]opcode..S]
	tbh    [pc, r3]
dp_disp_tab:
	.hword (dp_and  - dp_disp_tab) / 2
	.hword (dp_ands - dp_disp_tab) / 2
	.hword (dp_eor  - dp_disp_tab) / 2
	.hword (dp_eors - dp_disp_tab) / 2
	.hword (dp_sub  - dp_disp_tab) / 2
	.hword (dp_subs - dp_disp_tab) / 2
	.hword (dp_rsb  - dp_disp_tab) / 2
	.hword (dp_rsbs - dp_disp_tab) / 2
	.hword (dp_add  - dp_disp_tab) / 2
	.hword (dp_adds - dp_disp_tab) / 2
	.hword (dp_adc  - dp_disp_tab) / 2
	.hword (dp_adcs - dp_disp_tab) / 2
	.hword (dp_sbc  - dp_disp_tab) / 2
	.hword (dp_sbcs - dp_disp_tab) / 2
	.hword (dp_rsc  - dp_disp_tab) / 2
	.hword (dp_rscs - dp_disp_tab) / 2
	.hword (dp_0x10 - dp_disp_tab) / 2			//MRS Rx, CPSR or edsp instrs
	.hword (dp_tst  - dp_disp_tab) / 2
	.hword (dp_0x12 - dp_disp_tab) / 2			//MSR CPSR, Rx or BX or BLX or BKPT or edsp instrs
	.hword (dp_teq  - dp_disp_tab) / 2
	.hword (dp_0x14 - dp_disp_tab) / 2			//MRS Rx, SPSR or edsp instrs
	.hword (dp_cmp  - dp_disp_tab) / 2
	.hword (dp_0x16 - dp_disp_tab) / 2			//MSR SPSR, Rx or CLZ or edsp instrs
	.hword (dp_cmn  - dp_disp_tab) / 2
	.hword (dp_orr  - dp_disp_tab) / 2
	.hword (dp_orrs - dp_disp_tab) / 2
	.hword (dp_mov  - dp_disp_tab) / 2
	.hword (dp_movs - dp_disp_tab) / 2
	.hword (dp_bic  - dp_disp_tab) / 2
	.hword (dp_bics - dp_disp_tab) / 2
	.hword (dp_mvn  - dp_disp_tab) / 2
	.hword (dp_mvns - dp_disp_tab) / 2
	.hword (dp_and  - dp_disp_tab) / 2
	.hword (dp_ands - dp_disp_tab) / 2
	.hword (dp_eor  - dp_disp_tab) / 2
	.hword (dp_eors - dp_disp_tab) / 2
	.hword (dp_sub  - dp_disp_tab) / 2
	.hword (dp_subs - dp_disp_tab) / 2
	.hword (dp_rsb  - dp_disp_tab) / 2
	.hword (dp_rsbs - dp_disp_tab) / 2
	.hword (dp_add  - dp_disp_tab) / 2
	.hword (dp_adds - dp_disp_tab) / 2
	.hword (dp_adc  - dp_disp_tab) / 2
	.hword (dp_adcs - dp_disp_tab) / 2
	.hword (dp_sbc  - dp_disp_tab) / 2
	.hword (dp_sbcs - dp_disp_tab) / 2
	.hword (dp_rsc  - dp_disp_tab) / 2
	.hword (dp_rscs - dp_disp_tab) / 2
	.hword (dp_udf  - dp_disp_tab) / 2
	.hword (dp_tst  - dp_disp_tab) / 2
	.hword (dp_msri - dp_disp_tab) / 2			//MSR CPSR, imm
	.hword (dp_teq  - dp_disp_tab) / 2
	.hword (dp_udf  - dp_disp_tab) / 2
	.hword (dp_cmp  - dp_disp_tab) / 2
	.hword (dp_udf  - dp_disp_tab) / 2			//MSR SPSR, imm - unsupported
	.hword (dp_cmn  - dp_disp_tab) / 2
	.hword (dp_orr  - dp_disp_tab) / 2
	.hword (dp_orrs - dp_disp_tab) / 2
	.hword (dp_mov  - dp_disp_tab) / 2
	.hword (dp_movs - dp_disp_tab) / 2
	.hword (dp_bic  - dp_disp_tab) / 2
	.hword (dp_bics - dp_disp_tab) / 2
	.hword (dp_mvn  - dp_disp_tab) / 2
	.hword (dp_mvns - dp_disp_tab) / 2

dp_udf:											//target of the dp_disp_tab slots with no supported data-processing handler
	udf    #0x00								//executes a host undefined-instruction; there is no fall-through path

dp_and:											//AND Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	ands   r3, r3, r0							//r3 = regs[Rn] & op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_ands:										//ANDS Rd, Rn, op2  (updates emulated N, Z, C)
	//in: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instr, r4 = emulState
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	msr    APSR_nzcvq, esr						//load the emulated flags so that V is in place
	lsrs   r1, r1, #30							//move the shifter carry-out into C; V is kept, N/Z are overwritten next
	ands   r3, r3, r0							//r3 = regs[Rn] & op2, sets N and Z
	mrs    esr, APSR							//emulated SR = resulting flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_eor:											//EOR Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	eors   r3, r3, r0							//r3 = regs[Rn] ^ op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_eors:										//EORS Rd, Rn, op2  (updates emulated N, Z, C)
	//in: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instr, r4 = emulState
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	msr    APSR_nzcvq, esr						//load the emulated flags so that V is in place
	lsrs   r1, r1, #30							//move the shifter carry-out into C; V is kept, N/Z are overwritten next
	eors   r3, r3, r0							//r3 = regs[Rn] ^ op2, sets N and Z
	mrs    esr, APSR							//emulated SR = resulting flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_orr:											//ORR Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	orrs   r3, r3, r0							//r3 = regs[Rn] | op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_orrs:										//ORRS Rd, Rn, op2  (updates emulated N, Z, C)
	//in: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instr, r4 = emulState
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	msr    APSR_nzcvq, esr						//load the emulated flags so that V is in place
	lsrs   r1, r1, #30							//move the shifter carry-out into C; V is kept, N/Z are overwritten next
	orrs   r3, r3, r0							//r3 = regs[Rn] | op2, sets N and Z
	mrs    esr, APSR							//emulated SR = resulting flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_bic:											//BIC Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	bics   r3, r3, r0							//r3 = regs[Rn] & ~op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_bics:										//BICS Rd, Rn, op2  (updates emulated N, Z, C)
	//in: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instr, r4 = emulState
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	msr    APSR_nzcvq, esr						//load the emulated flags so that V is in place
	lsrs   r1, r1, #30							//move the shifter carry-out into C; V is kept, N/Z are overwritten next
	bics   r3, r3, r0							//r3 = regs[Rn] & ~op2, sets N and Z
	mrs    esr, APSR							//emulated SR = resulting flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_mvn:											//MVN Rd, op2  (emulated flags untouched; r0 = op2 on entry)
	mvns   r3, r0								//r3 = ~op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//get Rd -> r0
	str    r3, [r4, r0, lsl #2]					//set Rd's value <- r3
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_mvns:										//MVNS Rd, op2  (updates emulated N, Z, C)
	//in: r0 = op2, r1 = APSR image whose C bit is the shifter carry-out, r2 = instr, r4 = emulState
	ubfx   r3, r2, #12, #4						//r3 = Rd register number
	msr    APSR_nzcvq, esr						//load the emulated flags so that V is in place
	lsrs   r1, r1, #30							//move the shifter carry-out into C; V is kept, N/Z are overwritten next
	mvns   r0, r0								//r0 = ~op2, sets N and Z
	mrs    esr, APSR							//emulated SR = resulting flags
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r3,r0		//if Rd is r15, continue execution at the result

dp_sub:											//SUB Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	subs   r3, r3, r0							//r3 = regs[Rn] - op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_subs:										//SUBS Rd, Rn, op2  (updates emulated flags from the subtraction)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	subs   r3, r3, r0							//r3 = regs[Rn] - op2, sets the host flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (does not disturb flags)
	mrs    esr, APSR							//emulated SR = flags produced by the subtraction
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_add:											//ADD Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	adds   r3, r3, r0							//r3 = regs[Rn] + op2 (host flags are scratch here; esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_adds:										//ADDS Rd, Rn, op2  (updates emulated flags from the addition)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	adds   r3, r3, r0							//r3 = regs[Rn] + op2, sets the host flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (does not disturb flags)
	mrs    esr, APSR							//emulated SR = flags produced by the addition
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_rsb:											//RSB Rd, Rn, op2  (emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	subs   r0, r0, r3							//r0 = op2 - regs[Rn] (reverse subtract; esr is not updated)
	ubfx   r3, r2, #12, #4						//r3 = Rd register number (Rn value no longer needed)
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r3, r0		//if Rd is r15, continue execution at the result

dp_rsbs:										//RSBS Rd, Rn, op2  (updates emulated flags from the reverse subtraction)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	ubfx   r3, r2, #16, #4						//r3 = Rn register number
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	subs   r0, r0, r3							//r0 = op2 - regs[Rn], sets the host flags
	ubfx   r3, r2, #12, #4						//r3 = Rd register number (does not disturb flags)
	mrs    esr, APSR							//emulated SR = flags produced by the subtraction
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r3, r0		//if Rd is r15, continue execution at the result

dp_adc:											//ADC Rd, Rn, op2  (consumes emulated C; emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	msr    APSR_nzcvq, esr						//load the emulated flags so the host C equals the emulated C
	ubfx   r3, r2, #16, #4						//r3 = Rn register number (does not disturb flags)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	adcs   r3, r3, r0							//r3 = regs[Rn] + op2 + C (esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_adcs:										//ADCS Rd, Rn, op2  (consumes emulated C; updates emulated flags)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	msr    APSR_nzcvq, esr						//load the emulated flags so the host C equals the emulated C
	ubfx   r3, r2, #16, #4						//r3 = Rn register number (does not disturb flags)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	adcs   r3, r3, r0							//r3 = regs[Rn] + op2 + C, sets the host flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (does not disturb flags)
	mrs    esr, APSR							//emulated SR = flags produced by the addition
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_sbc:											//SBC Rd, Rn, op2  (consumes emulated C; emulated flags untouched)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	msr    APSR_nzcvq, esr						//load the emulated flags so the host C equals the emulated C
	ubfx   r3, r2, #16, #4						//r3 = Rn register number (does not disturb flags)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	sbcs   r3, r3, r0							//r3 = regs[Rn] - op2 with borrow from C (esr is not updated)
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (op2 no longer needed)
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

dp_sbcs:										//SBCS Rd, Rn, op2  (consumes emulated C; updates emulated flags)
	//in: r0 = op2, r2 = instr, r4 = emulState (register file base)
	msr    APSR_nzcvq, esr						//load the emulated flags so the host C equals the emulated C
	ubfx   r3, r2, #16, #4						//r3 = Rn register number (does not disturb flags)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]
	sbcs   r3, r3, r0							//r3 = regs[Rn] - op2 with borrow from C, sets the host flags
	ubfx   r0, r2, #12, #4						//r0 = Rd register number (does not disturb flags)
	mrs    esr, APSR							//emulated SR = flags produced by the subtraction
	str    r3, [r4, r0, lsl #2]					//regs[Rd] = result
	maybe_pc_written_not_interworked r0,r3		//if Rd is r15, continue execution at the result

//RSC (no S): regs[Rd] = op2 - regs[Rn] - !C, with C taken from the emulated SR.
//	in:  r0 = op2 (as for the other dp_* handlers), r2 = instr, r4 = emulState
//	r1 is used as scratch here; nextinstr overwrites it anyway
dp_rsc:
	msr    APSR_nzcvq, esr						//import emulated flags so the host C is the emulated carry
	ubfx   r1, r2, #16, #4						//r1 = Rn index (instr bits 19..16)
	ldr    r1, [r4, r1, lsl #2]					//r1 = regs[Rn] (loads do not touch flags)
	ubfx   r3, r2, #12, #4						//r3 = Rd index (instr bits 15..12)
	sbcs   r0, r0, r1							//r0 = op2 - Rn - !C; resulting host flags are discarded
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = r0
	maybe_pc_written_not_interworked r3, r0		//if Rd was pc, continue at the new address

//RSCS: regs[Rd] = op2 - regs[Rn] - !C, and the resulting flags become the emulated SR.
//	in:  r0 = op2 (as for the other dp_* handlers), r2 = instr, r4 = emulState
//	r1 is used as scratch here; nextinstr overwrites it anyway
dp_rscs:
	msr    APSR_nzcvq, esr						//import emulated flags so the host C is the emulated carry
	ubfx   r1, r2, #16, #4						//r1 = Rn index (instr bits 19..16)
	ldr    r1, [r4, r1, lsl #2]					//r1 = regs[Rn] (loads do not touch flags)
	ubfx   r3, r2, #12, #4						//r3 = Rd index (instr bits 15..12)
	sbcs   r0, r0, r1							//r0 = op2 - Rn - !C, host NZCV = result flags
	mrs    esr, APSR							//capture them as the new emulated SR
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = r0
	maybe_pc_written_not_interworked r3, r0		//if Rd was pc, continue at the new address

//MOV (no S): regs[Rd] = op2. The emulated SR (esr) is not touched.
dp_mov:
	//r0 already has op2
	ubfx   r3, r2, #12, #4						//get Rd -> r3
	str    r0, [r4, r3, lsl #2]					//set Rd's value <- r0
	maybe_pc_written_not_interworked r3, r0		//if Rd was pc, continue at the new address

//MOVS: regs[Rd] = op2; N,Z come from the value, C from the shifter carry-out, V is preserved.
//	in:  r0 = op2, r1 = shifter carry-out in bit 29 (presumably prepared by the
//	     operand-2 code that ran before this handler - not visible here), r2 = instr
dp_movs:
	ubfx   r3, r2, #12, #4						//r3 = Rd index (instr bits 15..12)
	str    r0, [r4, r3, lsl #2]					//regs[Rd] = op2; the store is flag-neutral so it can go first
	msr    APSR_nzcvq, esr						//start from the emulated flags so V survives
	lsrs   r1, r1, #30							//bit 29 of r1 -> C; V untouched; N,Z are garbage until the tst
	tst    r0, r0								//N,Z from the moved value; C and V are left as they are
	mrs    esr, APSR							//capture the combined flags as the new emulated SR
	maybe_pc_written_not_interworked r3, r0		//if Rd was pc, continue at the new address

//CMP: compute regs[Rn] - op2 for its flags only; no emulated register is written.
//	in:  r0 = op2 (as for the other dp_* handlers), r2 = instr, r4 = emulState
dp_cmp:
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r3, [r4, r3, lsl #2]					//get Rn's value -> r3
	cmp    r3, r0								//host NZCV = flags of Rn - op2 (all four are rewritten, so esr need not be imported first)
	mrs    esr, APSR							//grab resulting CPSR into SR
	nextinstr

//CMN: compute regs[Rn] + op2 for its flags only; no emulated register is written.
//	in:  r0 = op2 (as for the other dp_* handlers), r2 = instr, r4 = emulState
dp_cmn:
	ubfx   r3, r2, #16, #4						//get Rn -> r3
	ldr    r3, [r4, r3, lsl #2]					//get Rn's value -> r3
	cmn    r3, r0								//host NZCV = flags of Rn + op2 (all four are rewritten, so esr need not be imported first)
	mrs    esr, APSR							//grab resulting CPSR into SR
	nextinstr

//TST: flags of (regs[Rn] AND op2); N,Z from the result, C from the shifter carry-out, V preserved.
//	in:  r0 = op2, r1 = shifter carry-out in bit 29 (presumably prepared by the
//	     operand-2 code that ran before this handler - not visible here), r2 = instr
dp_tst:
	ubfx   r3, r2, #16, #4						//r3 = Rn index (instr bits 19..16)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]; neither ubfx nor ldr touches flags
	msr    APSR_nzcvq, esr						//start from the emulated flags so V survives
	lsrs   r1, r1, #30							//bit 29 of r1 -> C; V untouched; N,Z are garbage until the tst
	tst    r3, r0								//N,Z from Rn & op2; C and V are left as they are
	mrs    esr, APSR							//capture the combined flags as the new emulated SR
	nextinstr

//TEQ: flags of (regs[Rn] EOR op2); N,Z from the result, C from the shifter carry-out, V preserved.
//	in:  r0 = op2, r1 = shifter carry-out in bit 29 (presumably prepared by the
//	     operand-2 code that ran before this handler - not visible here), r2 = instr
dp_teq:
	ubfx   r3, r2, #16, #4						//r3 = Rn index (instr bits 19..16)
	ldr    r3, [r4, r3, lsl #2]					//r3 = regs[Rn]; neither ubfx nor ldr touches flags
	msr    APSR_nzcvq, esr						//start from the emulated flags so V survives
	lsrs   r1, r1, #30							//bit 29 of r1 -> C; V untouched; N,Z are garbage until the eors
	eors   r3, r3, r0							//same flags as teq but one cycle faster; the result in r3 is thrown away
	mrs    esr, APSR							//capture the combined flags as the new emulated SR
	nextinstr

//Secondary dispatch for opcode byte 0x10, keyed on instr bits 7..4.
//tbb reads pc as the address right after itself, which is dp_0x10_disp, and branches
//to dp_0x10_disp + 2 * table[r3]. Every target must therefore lie after the table and
//within 510 bytes of it.
dp_0x10:										//table 3.3: MRS Rx, CPSR or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
	ubfx   r3, r2, #4, #4						//get bits 4..7 for dispatch
	tbb    [pc, r3]
dp_0x10_disp:
	.byte  (dp_mrs     - dp_0x10_disp) / 2		//0b0000: MRS
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0001
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0010
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0011
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0100
	.byte  (dp_qadd    - dp_0x10_disp) / 2		//0b0101: QADD
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0110
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b0111
	.byte  (dp_smlabb  - dp_0x10_disp) / 2		//0b1000: SMLABB
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b1001
	.byte  (dp_smlatb  - dp_0x10_disp) / 2		//0b1010: SMLATB
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b1011
	.byte  (dp_smlabt  - dp_0x10_disp) / 2		//0b1100: SMLABT
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b1101
	.byte  (dp_smlatt  - dp_0x10_disp) / 2		//0b1110: SMLATT
	.byte  (dp_x_undef - dp_0x10_disp) / 2		//0b1111

//Secondary dispatch for opcode byte 0x14, keyed on instr bits 7..4.
//tbb reads pc as the address right after itself, which is dp_0x14_disp, and branches
//to dp_0x14_disp + 2 * table[r3]. Every target must therefore lie after the table and
//within 510 bytes of it.
dp_0x14:										//table 3.3: edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
	ubfx   r3, r2, #4, #4						//get bits 4..7 for dispatch
	tbb    [pc, r3]
dp_0x14_disp:
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0000
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0001
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0010
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0011
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0100
	.byte  (dp_qdadd   - dp_0x14_disp) / 2		//0b0101: QDADD
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0110
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b0111
	.byte  (dp_smlalbb - dp_0x14_disp) / 2		//0b1000: SMLALBB
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b1001
	.byte  (dp_smlaltb - dp_0x14_disp) / 2		//0b1010: SMLALTB
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b1011
	.byte  (dp_smlalbt - dp_0x14_disp) / 2		//0b1100: SMLALBT
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b1101
	.byte  (dp_smlaltt - dp_0x14_disp) / 2		//0b1110: SMLALTT
	.byte  (dp_x_undef - dp_0x14_disp) / 2		//0b1111

//Secondary dispatch for opcode byte 0x16, keyed on instr bits 7..4.
//tbb reads pc as the address right after itself, which is dp_0x16_disp, and branches
//to dp_0x16_disp + 2 * table[r3]. Every target must therefore lie after the table and
//within 510 bytes of it.
//Slot 0b0101 under opcode 0x16 is QDSUB (QSUB lives under 0x12), so it routes to
//dp_qdsub. That label currently shares the undefined-instruction stub with dp_qsub,
//so the table bytes are the same, but the routing stays correct once the two
//saturating subtracts get separate handlers.
dp_0x16:										//table 3.3: CLZ or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
	ubfx   r3, r2, #4, #4						//get bits 4..7 for dispatch
	tbb    [pc, r3]
dp_0x16_disp:
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0000
	.byte  (dp_clz     - dp_0x16_disp) / 2		//0b0001: CLZ
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0010
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0011
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0100
	.byte  (dp_qdsub   - dp_0x16_disp) / 2		//0b0101: QDSUB
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0110
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b0111
	.byte  (dp_smulbb  - dp_0x16_disp) / 2		//0b1000: SMULBB
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b1001
	.byte  (dp_smultb  - dp_0x16_disp) / 2		//0b1010: SMULTB
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b1011
	.byte  (dp_smulbt  - dp_0x16_disp) / 2		//0b1100: SMULBT
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b1101
	.byte  (dp_smultt  - dp_0x16_disp) / 2		//0b1110: SMULTT
	.byte  (dp_x_undef - dp_0x16_disp) / 2		//0b1111

//CLZ: regs[Rd] = number of leading zero bits in regs[Rm]. Flags are not affected.
//	in:  r2 = instr, r4 = emulState (the precomputed op2 in r0 is not used)
dp_clz:
	ubfx   r0, r2, #12, #4						//r0 = Rd index (instr bits 15..12)
	ubfx   r1, r2, #0, #4						//r1 = Rm index (instr bits 3..0)
	ldr    r1, [r4, r1, lsl #2]					//r1 = regs[Rm]
	clz    r1, r1								//r1 = leading-zero count
	str    r1, [r4, r0, lsl #2]					//regs[Rd] = r1
	nextinstr

//Secondary dispatch for opcode byte 0x12, keyed on instr bits 7..4.
//tbb reads pc as the address right after itself, which is dp_0x12_disp, and branches
//to dp_0x12_disp + 2 * table[r3]. Every target must therefore lie after the table and
//within 510 bytes of it.
dp_0x12:										//table 3.3: MSR CPSR, Rx or BX or BLX or BKPT or edsp instrs	[these are suboptimal since we calculated addr mode 1's op2 uselessly before we got here]
	ubfx   r3, r2, #4, #4						//get bits 4..7 for dispatch
	tbb    [pc, r3]
dp_0x12_disp:
	.byte  (dp_msrr    - dp_0x12_disp) / 2		//0b0000: MSR CPSR, Rm
	.byte  (dp_bx      - dp_0x12_disp) / 2		//0b0001: BX
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b0010
	.byte  (dp_blx     - dp_0x12_disp) / 2		//0b0011: BLX (register)
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b0100
	.byte  (dp_qsub    - dp_0x12_disp) / 2		//0b0101: QSUB
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b0110
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b0111: BKPT is not handled
	.byte  (dp_smlawb  - dp_0x12_disp) / 2		//0b1000: SMLAWB
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b1001
	.byte  (dp_smulwb  - dp_0x12_disp) / 2		//0b1010: SMULWB
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b1011
	.byte  (dp_smlawt  - dp_0x12_disp) / 2		//0b1100: SMLAWT
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b1101
	.byte  (dp_smulwt  - dp_0x12_disp) / 2		//0b1110: SMULWT
	.byte  (dp_x_undef - dp_0x12_disp) / 2		//0b1111

//Shared target for every bits 7..4 slot of the dp_0x10/0x12/0x14/0x16 tables that has
//no handler: executes a host UDF so the unsupported emulated instruction traps.
dp_x_undef:
	udf    #0x00
	
//MSR CPSR, Rm: fetch regs[Rm] and fall into the shared MSR tail.
dp_msrr:
	ubfx   r0, r2, #0, #4						//get Rm -> r0
	ldr    r0, [r4, r0, lsl #2]					//get Rm's value -> r0
	//fallthrough to dp_msri
//Shared MSR tail. Only the flags field (field-mask bit 3 = instr bit 19) is honoured:
//when it is set, the top byte of r0 replaces the emulated SR. The other field-mask
//bits are ignored.
dp_msri:										//expects value in r0
	lsrs   r1, r2, #20							//see if field mask [3] is set (instr bit 19 lands in C)
	it     cs
	andcs  esr, r0, #0xFF000000					//if needed, set SR
	nextinstr

//MRS Rd, CPSR: regs[Rd] = top byte of the emulated SR with the mode field forced to 0x10 (user mode).
//	in:  r2 = instr, r4 = emulState
dp_mrs:
	ubfx   r1, r2, #12, #4						//r1 = Rd index (instr bits 15..12)
	and    r0, esr, #0xFF000000					//r0 = SR's top bits
	adds   r0, r0, #0x10						//low byte is zero after the mask, so this just sets the user-mode bits
	str    r0, [r4, r1, lsl #2]					//regs[Rd] = synthesized CPSR
	nextinstr

//BLX Rm: emulated LR = address of the following instruction, then branch to regs[Rm].
//If the target has bit 0 set (Thumb), emulation stops and the target is written to regs.pc.
dp_blx:
	ubfx   r1, r2, #0, #4						//get Rm
	ldr    r1, [r4, r1, lsl #2]					//get Rm's value (read before LR is overwritten, so Rm == lr works)
	str    epc, [r4, #0x38]						//store next instr address into LR
	mov    epc, r1								//epc keeps the untouched target, thumb bit included
	lsrs   r1, #1								//if thumb address, bail
	bcs    emu_out_with_epc
	nextinstr
	
//BX Rm: branch to regs[Rm]. If the target has bit 0 set (Thumb), emulation stops
//and the target (thumb bit included) is written to regs.pc on the way out.
dp_bx:
	ubfx   r1, r2, #0, #4						//get Rm
	ldr    epc, [r4, r1, lsl #2]					//get Rm's value directly into epc
	tst    epc, #1								//check for low bit here - it is faster since we already have it in a reg
	bne    emu_out_with_epc
	nextinstr

//MRS:		cccc 0001 0000 1111 dddd 0000 0000 0000
//QADD:		cccc 0001 0000 nnnn dddd 0000 0101 mmmm
//SMLAxy	cccc 0001 0000 dddd nnnn ssss 1yx0 mmmm

//MSR r:	cccc 0001 0010 ffff 1111 0000 0000 mmmm
//BX:		cccc 0001 0010 1111 1111 1111 0001 mmmm
//BLX:		cccc 0001 0010 1111 1111 1111 0011 mmmm
//QSUB:		cccc 0001 0010 nnnn dddd 0000 0101 mmmm
//SMLAWy	cccc 0001 0010 dddd nnnn ssss 1y00 mmmm
//SMULWy	cccc 0001 0010 dddd nnnn ssss 1y10 mmmm


//QDADD:	cccc 0001 0100 nnnn dddd 0000 0101 mmmm
//SMLALxy	cccc 0001 0100 dddd nnnn ssss 1yx0 mmmm

//CLZ:		cccc 0001 0110 1111 dddd 1111 0001 mmmm
//QDSUB:	cccc 0001 0110 nnnn dddd 0000 0101 mmmm
//SMULxy	cccc 0001 0110 dddd nnnn ssss 1yx0 mmmm

//Placeholders for the enhanced-DSP instructions reachable from the dp_0x10/0x12/0x14/0x16
//tables. None is implemented: all labels alias the single UDF below, so executing any of
//these emulated instructions traps on the host.
dp_qadd:
dp_qsub:
dp_qdadd:
dp_qdsub:
dp_smlabb:
dp_smlabt:
dp_smlatb:
dp_smlatt:
dp_smlalbb:
dp_smlalbt:
dp_smlaltb:
dp_smlaltt:
dp_smulbb:
dp_smulbt:
dp_smultb:
dp_smultt:
dp_smlawb:
dp_smulwb:
dp_smlawt:
dp_smulwt:
						//these all fall through to undef
	udf   #01

//Exit paths of the interpreter loop.
//emu_out_with_epc: for callers that only updated epc; writes it to regs.pc (offset 0x3c) first.
//emu_out:          for callers that already wrote regs.pc; flushes the cached SR to offset 0x40
//                  and returns the emulState pointer in r0.
//Returns with bx lr, so lr must still hold the return address of whoever entered the loop.
emu_out_with_epc:		//out with pc only in epc
	str    epc, [r4, #0x3c]
	//fallthrough
	
emu_out:
	str    esr, [r4, #0x40]	//store SR (likely nobody cares but let's be accurate)
	mov    r0, r4
	bx     lr


//dispatch, here for short branches
	dispatch_code_entry_row inst_swi

//SWI handler. Only the semihosting call is supported: with the condition nibble shifted
//out, the instruction must equal 0xF123456 (SWI 0x123456). Anything else traps via UDF.
//NOTE(review): "ldr r1, =..." needs a literal pool and no .ltorg/.pool is visible in this
//part of the file - confirm the pool lands somewhere reachable (and inside the
//emuCpuRunCodeStart..emuCpuRunCodeEnd range if that range is ever copied elsewhere).
inst_swi:
	lsls   r0, r2, #4						//drop the 4 condition bits; r0 = instr[27:0] << 4
	ldr    r1, =0xf1234560					//SWI 0x123456, shifted the same way
	cmp    r0, r1
	beq    is_semihosting

inst_swi_udf:								//also the failure target of is_semihosting
	udf    #0x00

//Semihosting call-out. Passes pointers to the emulated r0..r3 in host r0..r3 and the
//values of emulated r12, sp, lr, pc on the stack, then calls kernelSemihostingHandle.
//A zero return is treated as failure and traps through inst_swi_udf.
//Clobbers r2 (the instruction word); nextinstr reloads it.
//NOTE(review): the callee signature is not visible here - presumably
//(r0*, r1*, r2*, r3*, r12, sp, lr, pc); confirm against its C prototype.
//NOTE(review): five words are pushed, so sp changes by 20 bytes before the bl. If sp was
//8-byte aligned on entry to this handler it is not at the call - confirm against the
//(unseen) prologue and the platform's stack-alignment requirement.
is_semihosting:
	add    r0, r4, #4 * 12			//point to r12
	ldmia  r0, {r0, r1, r3, r5}		//get r12, sp, lr, pc
	push   {r0, r1, r3, r5, lr}		//put them on stack for params, save lr to not clobber it with "bl"
	mov    r0, r4					//point to r0...r3 for params
	adds   r1, r0, #4
	adds   r2, r1, #4
	adds   r3, r2, #4
	bl     kernelSemihostingHandle
	add    sp, #0x10				//pop params off the stack that we had pushed
	pop    {lr}						//re-get LR
	cmp    r0, #0
	beq    inst_swi_udf				//verify success
	nextinstr

//REGS:
//	r0 - temp
//	r1 - temp
//	r2 - instr (till clobbered)
//	r3 - temp
//	r4 - emulState
//	r5 - temp
//	r6 - emulated PC (pointer to next instr - like state.regs[15] but better since it doesn't need a load and its offset)
//  r7 - emulated SR [cached here, written out on exit only]
//  r8 - cc dispatch table addr
//	r12 - temp (hireg, so only used in cases where it would be of no extra cost)


//End-of-code marker, exported with .globl at the top of the file alongside emuCpuRunCodeStart.
emuCpuRunCodeEnd:
