; Our zero-page vars sx = $80 ; i16: screen pixel x sy = $82 ; i16: screen pixel y ox = $84 ; fixed4.12: center point x oy = $86 ; fixed4.12: center point y cx = $88 ; fixed4.12: c_x cy = $8a ; fixed4.12: c_y zx = $8c ; fixed4.12: z_x zy = $8e ; fixed4.12: z_y zx_2 = $90 ; fixed4.12: z_x^2 zy_2 = $92 ; fixed4.12: z_y^2 zx_zy = $94 ; fixed4.12: z_x * z_y dist = $96 ; fixed4.12: z_x^2 + z_y^2 iter = $a0 ; u8: iteration count zoom = $a1 ; u8: zoom shift level count_frames = $a2 ; u8 count_pixels = $a3 ; u8 total_ms = $a4 ; float48 total_pixels = $aa ; float48 z_buffer_active = $b0 ; boolean: 1 if we triggered the lake, 0 if not z_buffer_start = $b1 ; u8: index into z_buffer z_buffer_end = $b2 ; u8: index into z_buffer temp = $b4 ; u16 pixel_ptr = $b6 ; u16 pixel_color = $b8 ; u8 pixel_mask = $b9 ; u8 pixel_shift = $ba ; u8 pixel_offset = $bb ; u8 fill_level = $bc ; u8 ; FP registers in zero page FR0 = $d4 ; float48 FRE = $da FR1 = $e0 ; float48 FR2 = $e6 ; float48 CIX = $f2 ; u8 - index into INBUFF INBUFF = $f3 ; u16 - pointer to ascii FLPTR = $fc ; u16 - pointer to user buffer float48 CH1 = $02f2 ; previous character read from keyboard CH = $02fc ; current character read from keyboard LBUFF = $0580 ; result buffer for FASC routine ; FP ROM routine vectors FASC = $D8E6 ; FLOATING POINT TO ASCII (output in INBUFF, last char has high bit set) IFP = $D9AA ; INTEGER TO FLOATING POINT CONVERSION (FR0:u16 -> FR0:float48) FADD = $DA66 ; ADDITION (FR0 += FR1) FSUB = $DA60 ; SUBTRACTION (FR0 -= FR1) FMUL = $DADB ; MULTIPLICATION (FR0 *= FR1) FDIV = $DB28 ; DIVISION (FR0 /= FR1) ZF1 = $DA46 ; CLEAR ZERO PAGE FLOATING POINT NUMBER (XX) FLD0R = $DD89 ; LOAD FR0 WITH FLOATING POINT NUMBER (YYXX) FLD1R = $DD98 ; LOAD FR1 WITH FLOATING POINT NUMBER (YYXX) FST0R = $DDA7 ; STORE FR0 IN USER BUFFER (YYXX) FMOVE = $DDB6 ; MOVE FR0 TO FR1 ; High data framebuffer_top = $8000 textbuffer = $8f00 framebuffer_bottom = $9000 display_list = $9f00 framebuffer_end = $a000 height = 184 half_height = height >> 1 width = 160 half_width = width >> 1 stride = width >> 2 DMACTL = $D400 DLISTL = $D402 DLISTH = $D403 WSYNC = $D40A ; OS shadow registers SDLSTL = $230 SDLSTH = $231 ; interrupt stuff SYSVBV = $E45F XITVBV = $E462 SETVBV = $E45C ; Keycodes! KEY_PLUS = $06 KEY_MINUS = $0e KEY_UP = $8e KEY_DOWN = $8f KEY_LEFT = $86 KEY_RIGHT = $87 .struct float48 exponent .byte mantissa .byte 6 .endstruct .data strings: str_self: .byte "MANDEL-6502" str_self_end: str_speed: .byte " ms/px" str_speed_end: str_run: .byte " RUN" str_run_end: str_done: .byte "DONE" str_done_end: str_self_len = str_self_end - str_self str_speed_len = str_speed_end - str_speed str_run_len = str_run_end - str_run str_done_len = str_done_end - str_done speed_precision = 6 speed_start = 40 - str_done_len - str_speed_len - speed_precision - 1 speed_len = 14 + str_speed_len char_map: ; Map ATASCII string values to framebuffer font entries ; Sighhhhh .repeat 32, i .byte i + 64 .endrepeat .repeat 64, i .byte i .endrepeat .repeat 32, i .byte 96 + i .endrepeat hex_chars: .byte "0123456789abcdef" aspect: ; aspect ratio! ; pixels at 320w are 5:6 (narrow) ; pixels at 160w are 5:3 (wide) ; ; cy = (sy << (8 - zoom)) * (96 / 128 = 3 / 4) ; cx = (sx << (8 - zoom)) * ((3 / 4) * (5 / 3) = 5 / 4) ; ; so vertical range -92 .. 91.9 is -2.15625 .. 2.15624 ; &horizontal range -80 .. 79.9 is -3.125 .. 3.124 ; ; 184h is the equiv of 220.8h at square pixels ; 320 / 220.8 = 1.45 display aspect ratio aspect_x: ; fixed4.16 5/4 .word 5 << (12 - 2) aspect_y: ; fixed4.16 3/4 .word 3 << (12 - 2) ms_per_frame: ; float48 16.66666667 .byte 64 ; exponent/sign .byte $16 ; BCD digits .byte $66 .byte $66 .byte $66 .byte $67 display_list_start: ; 24 lines overscan .repeat 3 .byte $70 ; 8 blank lines .endrep ; 8 scan lines, 1 row of 40-column text .byte $42 .addr textbuffer ; 184 lines graphics ; ANTIC mode e (160px 2bpp, 1 scan line per line) .byte $4e .addr framebuffer_top .repeat half_height - 1 .byte $0e .endrep .byte $4e .addr framebuffer_bottom .repeat half_height - 1 .byte $0e .endrep .byte $41 ; jump and blank .addr display_list display_list_end: display_list_len = display_list_end - display_list_start color_map: .byte 0 .repeat 85 .byte 1 .byte 2 .byte 3 .endrepeat .code z_buffer_len = 16 z_buffer_mask = z_buffer_len - 1 z_buffer: ; the last N zx/zy values .repeat z_buffer_len .word 0 .word 0 .endrepeat .export start max_fill_level = 6 fill_masks: .byte %00011111 .byte %00001111 .byte %00000111 .byte %00000011 .byte %00000001 .byte %00000000 ; 2 + 9 * byte cycles .macro add bytes, dest, arg1, arg2 clc ; 2 cyc .repeat bytes, byte ; 9 * byte cycles lda arg1 + byte adc arg2 + byte sta dest + byte .endrepeat .endmacro .macro add16 dest, arg1, arg2 add 2, dest, arg1, arg2 .endmacro .macro add32 dest, arg1, arg2 add 4, dest, arg2, dest .endmacro ; 2 + 9 * byte cycles .macro sub bytes, dest, arg1, arg2 sec ; 2 cyc .repeat bytes, byte ; 9 * byte cycles lda arg1 + byte sbc arg2 + byte sta dest + byte .endrepeat .endmacro .macro sub16 dest, arg1, arg2 sub 2, dest, arg1, arg2 .endmacro .macro sub32 dest, arg1, arg2 sub 4, dest, arg1, arg2 .endmacro .macro shl bytes, arg asl arg .repeat bytes-1, i rol arg + 1 + i .endrepeat .endmacro .macro shl16 arg shl 2, arg .endmacro .macro shl24 arg shl 3, arg .endmacro .macro shl32 arg shl 4, arg .endmacro ; 6 * bytes cycles .macro copy bytes, dest, arg .repeat bytes, byte ; 6 * bytes cycles lda arg + byte ; 3 cyc sta dest + byte ; 3 cyc .endrepeat .endmacro .macro copy16 dest, arg copy 2, dest, arg .endmacro .macro copy32 dest, arg copy 4, dest, arg .endmacro .macro copyfloat dest, arg copy 6, dest, arg .endmacro ; 2 + 8 * byte cycles .macro neg bytes, arg sec ; 2 cyc .repeat bytes, byte ; 8 * byte cycles lda #00 ; 2 cyc sbc arg + byte ; 3 cyc sta arg + byte ; 3 cyc .endrepeat .endmacro ; 18 cycles .macro neg16 arg neg 2, arg .endmacro ; 34 cycles .macro neg32 arg neg 4, arg .endmacro ; inner loop for imul16 ; bitnum < 8: 25 or 41 cycles ; bitnum >= 8: 30 or 46 cycles .macro bitmul16 arg1, arg2, result, bitnum .local zero .local one .local next ; does 16-bit adds ; arg1 and arg2 are treated as unsigned ; negative signed inputs must be flipped first ; 7 cycles up to the branch ; check if arg1 has 0 or 1 bit in this place ; 5 cycles either way .if bitnum < 8 lda arg1 ; 3 cyc and #(1 << (bitnum)) ; 2 cyc .else lda arg1 + 1 ; 3 cyc and #(1 << ((bitnum) - 8)) ; 2 cyc .endif bne one ; 2 cyc zero: ; 18 cyc, 23 cyc lsr result + 3 ; 5 cyc jmp next ; 3 cyc one: ; 32 cyc, 37 cyc ; 16-bit add on the top bits clc ; 2 cyc lda result + 2 ; 3 cyc adc arg2 ; 3 cyc sta result + 2 ; 3 cyc lda result + 3 ; 3 cyc adc arg2 + 1 ; 3 cyc ror a ; 2 cyc - get a jump on the shift sta result + 3 ; 3 cyc next: ror result + 2 ; 5 cyc ror result + 1 ; 5 cyc .if bitnum >= 8 ; we can save 5 cycles * 8 bits = 40 cycles total by skipping this byte ; when it's all uninitialized data ror result ; 5 cyc .endif .endmacro ; 5 to 25 cycles .macro check_sign arg ; Check sign bit and flip argument to postive, ; keeping a count of sign bits in the X register. .local positive lda arg + 1 ; 3 cyc bpl positive ; 2 cyc neg16 arg ; 18 cyc inx ; 2 cyc positive: .endmacro ; 518 - 828 cyc .macro imul16 dest, arg1, arg2 copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; 470-780 cyc copy32 dest, FR2 ; 24 cyc .endmacro .macro shift_round_16 arg, shift .repeat shift shl32 arg .endrepeat round16 arg .endmacro .macro imul16_round dest, arg1, arg2, shift copy16 FR0, arg1 ; 12 cyc copy16 FR1, arg2 ; 12 cyc jsr imul16_func ; 470-780 cyc shift_round_16 FR2, shift copy16 dest, FR2 + 2 ; 12 cyc .endmacro ; min 470 cycles ; max 780 cycles .proc imul16_func arg1 = FR0 ; 16-bit arg (clobbered) arg2 = FR1 ; 16-bit arg (clobbered) result = FR2 ; 32-bit result ldx #0 ; 2 cyc ; counts the number of sign bits in X check_sign arg1 ; 5 to 25 cyc check_sign arg2 ; 5 to 25 cyc ; zero out the 32-bit temp's top 16 bits lda #0 ; 2 cyc sta result + 2 ; 3 cyc sta result + 3 ; 3 cyc ; the bottom two bytes will get cleared by the shifts ; unrolled loop for maximum speed, at the cost ; of a larger routine ; 440 to 696 cycles .repeat 16, bitnum ; bitnum < 8: 25 or 41 cycles ; bitnum >= 8: 30 or 46 cycles bitmul16 arg1, arg2, result, bitnum .endrepeat ; In case of mixed input signs, return a negative result. cpx #1 ; 2 cyc bne positive_result ; 2 cyc neg32 result ; 34 cyc positive_result: rts ; 6 cyc .endproc .macro round16 arg ; Round top 16 bits of 32-bit fixed-point number in-place .local increment .local high_half .local check_sign .local next ; low word > $8000: round up ; = $8000: round up if positive ; round down if negative ; < $8000: round down lda arg + 1 cmp #$80 beq high_half bpl increment bmi next high_half: lda arg beq check_sign bpl increment bmi next check_sign: lda arg + 3 bmi next increment: ; 5-10 cyc inc arg + 2 ; 5 cyc bne next ; 2 cyc inc arg + 3 ; 5 cyc next: .endmacro .proc mandelbrot ; input: ; cx: position scaled to 4.12 fixed point - -8..+7.9 ; cy: position scaled to 4.12 ; ; output: ; iter: iteration count at escape or 0 ; zx = 0 ; zy = 0 ; zx_2 = 0 ; zy_2 = 0 ; zx_zy = 0 ; dist = 0 ; iter = 0 lda #00 ldx #(iter - zx + 1) initloop: sta zx - 1,x dex bne initloop sta z_buffer_start sta z_buffer_end loop: ; iter++ & max-iters break inc iter bne keep_going jmp exit_path keep_going: .macro quick_exit arg, max .local positive .local negative .local nope_out .local first_equal .local all_done ; check sign bit lda arg + 1 bmi negative positive: cmp #((max) << 4) bmi all_done ; 'less than' jmp exit_path negative: cmp #(256 - ((max) << 4)) beq first_equal ; 'equal' on first byte bpl all_done ; 'greater than' nope_out: jmp exit_path first_equal: lda arg beq nope_out ; 2nd byte 0 shows it's really 'equal' all_done: .endmacro ; 4.12: (-8 .. +7.9) ; zx = zx_2 - zy_2 + cx sub16 zx, zx_2, zy_2 add16 zx, zx, cx quick_exit zx, 2 ; zy = zx_zy + zx_zy + cy add16 zy, zx_zy, zx_zy add16 zy, zy, cy quick_exit zy, 2 ; zx_2 = zx * zx imul16_round zx_2, zx, zx, 4 ; zy_2 = zy * zy imul16_round zy_2, zy, zy, 4 ; zx_zy = zx * zy imul16_round zx_zy, zx, zy, 4 ; dist = zx_2 + zy_2 add16 dist, zx_2, zy_2 quick_exit dist, 4 ; if may be in the lake, look for looping output with a small buffer ; as an optimization vs running to max iters lda z_buffer_active beq skip_z_buffer ldx z_buffer_start cpx z_buffer_end beq z_nothing_to_read z_buffer_loop: .macro z_compare arg .local compare_no_match lda z_buffer,x inx cmp arg bne compare_no_match iny compare_no_match: .endmacro .macro z_advance .local skip_reset_x cpx #(z_buffer_len * 4) bmi skip_reset_x ldx #0 skip_reset_x: .endmacro .macro z_store arg lda arg sta z_buffer,x inx .endmacro ; Compare the previously stored z values ldy #0 z_compare zx z_compare zx + 1 z_compare zy z_compare zy + 1 cpy #4 bne z_no_matches jmp z_exit z_no_matches: z_advance cpx z_buffer_end bne z_buffer_loop z_nothing_to_read: ; Store and expand z_store zx z_store zx + 1 z_store zy z_store zy + 1 z_advance stx z_buffer_end ; Increment the start roller if necessary (limit size) lda iter cmp #(z_buffer_len * 4) bmi skip_inc_start lda z_buffer_start clc adc #4 tax z_advance stx z_buffer_start skip_inc_start: skip_z_buffer: jmp loop z_exit: lda #0 sta iter exit_path: ldx #0 lda iter bne next inx next: stx z_buffer_active rts .endproc .macro scale_zoom dest ; clobbers X, flags .local cont .local enough ; cx = (sx << (8 - zoom)) ldx zoom cont: cpx #8 beq enough shl16 dest inx jmp cont enough: .endmacro .macro zoom_factor dest, src, zoom, aspect ; clobbers A, X, flags, etc copy16 dest, src scale_zoom dest ; cy = cy * (3 / 4) ; cx = cx * (5 / 4) imul16_round dest, dest, aspect, 4 .endmacro .proc pset ; screen coords in signed sx,sy ; iter holds the target to use ; @todo implement ; iter -> color ldx iter lda color_map,x sta pixel_color lda #(255 - 3) sta pixel_mask ; sy -> line base address in temp lda sy bpl positive negative: ; temp1 = top half lda #.lobyte(framebuffer_top + stride * half_height) sta pixel_ptr lda #.hibyte(framebuffer_top + stride * half_height) sta pixel_ptr + 1 jmp point positive: lda #.lobyte(framebuffer_bottom) sta pixel_ptr lda #.hibyte(framebuffer_bottom) sta pixel_ptr + 1 point: ; pixel_ptr += sy * stride ; temp * 40 ; = temp * 32 + temp * 8 ; = (temp << 5) + (temp << 3) copy16 temp, sy shl16 temp shl16 temp shl16 temp add16 pixel_ptr, pixel_ptr, temp shl16 temp shl16 temp add16 pixel_ptr, pixel_ptr, temp ; Ok so temp1 points to the start of the line, which is 40 bytes. ; Get the byte and bit offsets lda sx clc adc #half_width sta temp ; pixel_shift = temp & 3 ; pixel_color <<= pixel_shift (shifting in zeros) ; pixel_mask <<= pixel_shift (shifting in ones) and #3 sta pixel_shift lda #3 sec sbc pixel_shift tax shift_loop: beq shift_done asl pixel_color asl pixel_color sec rol pixel_mask sec rol pixel_mask dex jmp shift_loop shift_done: ; pixel_offset = temp >> 2 lda temp lsr a lsr a sta pixel_offset tay ; read, mask, or, write lda (pixel_ptr),y and pixel_mask ora pixel_color sta (pixel_ptr),y rts .endproc .macro draw_text_indirect col, len, strptr ; clobbers A, X .local loop .local done ldx #0 loop: cpx #len beq done txa tay lda (strptr),y tay lda char_map,y sta textbuffer + col,x inx jmp loop done: .endmacro .macro draw_text col, len, cstr ; clobbers A, X .local loop .local done ldx #0 loop: cpx #len beq done ldy cstr,x lda char_map,y sta textbuffer + col,x inx jmp loop done: .endmacro .proc vblank_handler inc count_frames jmp XITVBV .endproc .proc update_speed ; convert frames (u16) to fp ; add to frames_total ; convert pixels (u16) to fp ; add to pixels_total ; (frames_total * 16.66666667) / pixels_total ; convert to ATASCII ; draw text .endproc .proc keycheck ; clobbers all ; returns 255 in A if state change or 0 if no change ; check keyboard buffer lda CH cmp #$ff beq skip_char ; Clear the keyboard buffer and re-enable interrupts ldx #$ff stx CH tay lda zoom cpy #KEY_PLUS beq plus cpy #KEY_MINUS beq minus ; temp = $0010 << (8 - zoom) lda #$10 sta temp lda #$00 sta temp + 1 scale_zoom temp cpy #KEY_UP beq up cpy #KEY_DOWN beq down cpy #KEY_LEFT beq left cpy #KEY_RIGHT beq right skip_char: lda #0 rts plus: cmp #8 bpl skip_char inc zoom jmp done minus: cmp #1 bmi skip_char dec zoom jmp done up: sub16 oy, oy, temp jmp done down: add16 oy, oy, temp jmp done left: sub16 ox, ox, temp jmp done right: add16 ox, ox, temp done: lda #255 rts .endproc .proc clear_screen ; zero the range from framebuffer_top to display_list lda #.lobyte(framebuffer_top) sta temp lda #.hibyte(framebuffer_top) sta temp + 1 zero_page_loop: lda #0 ldy #0 zero_byte_loop: sta (temp),y iny bne zero_byte_loop inc temp + 1 lda temp + 1 cmp #.hibyte(display_list) bne zero_page_loop rts .endproc .proc status_bar ; Status bar draw_text 0, str_self_len, str_self draw_text 40 - str_run_len, str_run_len, str_run rts .endproc .proc start ; ox = 0; oy = 0; zoom = 0 ; count_frames = 0; count_pixels = 0 lda #0 sta ox sta ox + 1 sta oy sta oy + 1 sta count_frames sta count_pixels ; total_ms = 0.0; total_pixels = 0.0 ldx #total_ms jsr ZF1 ldx #total_pixels jsr ZF1 ; zoom = 2x lda #1 sta zoom ; Disable display DMA lda #0 sta DMACTL jsr clear_screen ; Copy the display list into properly aligned memory ; Can't cross 1024-byte boundaries :D ldx #0 copy_byte_loop: lda display_list_start,x sta display_list,x inx cpx #display_list_len bne copy_byte_loop ; Set up the display list lda #.lobyte(display_list) sta DLISTL ; actual register sta SDLSTL ; shadow register the OS will copy in lda #.hibyte(display_list) sta DLISTH ; actual register sta SDLSTH ; shadow register the OS will copy in ; Re-enable display DMA lda #$22 sta DMACTL ; install the vblank handler lda #7 ; deferred ldx #.hibyte(vblank_handler) ldy #.lobyte(vblank_handler) jsr SETVBV main_loop: jsr clear_screen jsr status_bar lda #0 sta fill_level fill_loop: ; sy = -92 .. 91 lda #(256-half_height) sta sy lda #(256-1) sta sy + 1 loop_sy: ; sx = -80 .. 79 lda #(256-half_width) sta sx lda #(256-1) sta sx + 1 loop_sx: ; check the fill mask ldy #0 loop_skip_level: cpy fill_level beq current_level lda fill_masks,y and sx bne not_skipped_mask1 lda fill_masks,y and sy beq skipped_mask not_skipped_mask1: iny jmp loop_skip_level current_level: lda fill_masks,y and sx bne skipped_mask lda fill_masks,y and sy beq not_skipped_mask skipped_mask: jmp skipped not_skipped_mask: ; run the fractal! zoom_factor cx, sx, zoom, aspect_x add16 cx, cx, ox zoom_factor cy, sy, zoom, aspect_y add16 cy, cy, oy jsr mandelbrot jsr pset jsr keycheck beq no_key ; @fixme clear the pixel stats jmp main_loop no_key: ; check if we should update the counters ; ; count_pixels >= width? update! inc count_pixels lda count_pixels cmp #width bmi update_status ; count_frames >= 120? update! lda count_frames cmp #120 ; >= 2 seconds bmi skipped update_status: ; FR0 = (float)count_pixels & clear count_pixels lda count_pixels sta FR0 lda #0 sta FR0 + 1 sta count_pixels jsr IFP ; FR1 = total_pixels ldx #.lobyte(total_pixels) ldy #.hibyte(total_pixels) jsr FLD1R ; FR0 += FR1 jsr FADD ; total_pixels = FR0 ldx #.lobyte(total_pixels) ldy #.hibyte(total_pixels) jsr FST0R ; FR0 = (float)count_frames & clear count_frames ; warning: this should really disable interrupts @TODO lda count_frames sta FR0 lda #0 sta FR0 + 1 sta count_frames jsr IFP ; FR0 *= ms_per_frame ldx #.lobyte(ms_per_frame) ldy #.hibyte(ms_per_frame) jsr FLD1R jsr FMUL ; FR0 += total_ms ldx #total_ms ldy #0 jsr FLD1R jsr FADD ; total_ms = FR0 ldx #total_ms ldy #0 jsr FST0R ; FR0 /= total_pixels ldx #total_pixels ldy #0 jsr FLD1R jsr FDIV ; convert to ASCII in INBUFF jsr FASC ; print the first 6 digits draw_text_indirect speed_start, speed_precision, INBUFF draw_text speed_start + speed_precision, str_speed_len, str_speed skipped: clc lda sx adc #1 sta sx lda sx + 1 adc #0 sta sx + 1 lda sx cmp #half_width beq loop_sx_done jmp loop_sx loop_sx_done: clc lda sy adc #1 sta sy lda sy + 1 adc #0 sta sy + 1 lda sy cmp #half_height beq loop_sy_done jmp loop_sy loop_sy_done: draw_text 40 - str_done_len, str_done_len, str_done fill_loop_done: inc fill_level lda fill_level cmp #max_fill_level beq loop jmp fill_loop loop: ; finished jsr keycheck beq loop jmp main_loop .endproc