;This program goes directly to PMODE from RMODE.
;This program will not work if a VCPI, DPMI or XMS driver is loaded.
;So you can not have EMM386,QEMM,386MAX,Windoze, OS/2 or anything loaded
;while using this little example (don't worry, eventually this will
;run under those programs but not yet).

; source : PMODE v3.07 (for the A20 enabling and processor detection)
;                      (and the jmps between each mode)

.386p   ;allow USE32 and 386 instructions (the 'p' form is needed for the
        ;protected mode instructions used below: lgdt, lidt, lldt, mov cr0)

;some EQUates
stack16siz equ 1024   ;bytes reserved in stack16seg
stack32siz equ 1024   ;bytes reserved in stack32seg

;short forms for the operand size overrides
bptr equ byte ptr
wptr equ word ptr
dptr equ dword ptr

ERROR equ -1          ;value returned in ax by enablea20 when it fails

;first define all segments that will be needed
;(they are declared empty here only to fix their order; the code and data
; segments are re-opened and filled in further down)
code16seg segment use16 'code'
code16seg ends
data16seg segment use16
data16seg ends
stack16seg segment use16 stack
  db stack16siz dup (?)
  TOS16 equ $   ;top of stack (loaded into sp by start16)
stack16seg ends

code32seg segment use32 'code'
code32seg ends
data32seg segment use32
data32seg ends
;NOTE(review): here 'stack' is quoted (a class name) while stack16seg uses
;the unquoted STACK combine type - presumably intentional so that DOS only
;sets up the 16bit stack; confirm.
stack32seg segment use32 'stack'
  db stack32siz dup (?)
  TOS32 equ $   ;top of stack (loaded into esp by start32)
stack32seg ends

;put all segments in one
;(one group for everything 16bit, one for everything 32bit)
segs16 group code16seg,data16seg,stack16seg
segs32 group code32seg,data32seg,stack32seg

;the 16bit code below runs with CS=DS=SS=segs16
assume cs:segs16,ds:segs16,ss:segs16,es:NOTHING

;One 8 byte descriptor table entry.
;Used for the GDT entries and, with the fields reinterpreted, for the
;IDT gates defined in data16seg.
desc struct   ;define our descriptor structure
  limit_lo dw ?    ;limit bits 15-0
  base_lo dw ?     ;base bits 15-0
  base_mid db ?    ;base bits 23-16
  type1 db ?       ;type of selector (access rights byte)
  limit_hi db ?    ;limit bits 19-16 and other info (G and D bits)
  base_hi db ?     ;base bits 31-24
desc ends

data16seg segment use16  ;start defining our 16bit data

;define our GDTR
;(6 byte operand for LGDT: 16bit limit followed by 32bit linear base)
  gdtr label fword
  gdt_limit dw gdt_size-1
  gdt_addr dd ?          ;filled in by start16 (prg_base16 + gdt_start)
;define our IDTR
;(6 byte operand for LIDT, same layout as the GDTR)
  idtr label fword
  idt_limit dw idt_size-1
  idt_addr dd ?          ;filled in by start16 (prg_base16 + idt_start)

;define our GDT
  gdt_start equ $
  null desc <0,0,0,0,0,0>
    ;this is the 1st GDT entry called NULL which we can not use
  code16 desc <0ffffh,?,?,10011110b,0h,?>
    ;Limit=64 KBs (this is a 16bit segment so only IP is used while running)
    ;Base=? (this will be setup later in the program)
    ;10011110b = P=1 (present), DPL=0, S=1(code/data segment) T=1 (code)
    ;            C=1(conforming), R=1 (readable), A=0 (not accessed)
    ;0h = limit bits 19-16=0, G=0 (1byte granularity), D=0 (16bit segment)
  data16 desc <0ffffh,?,?,10010010b,0h,?>
    ;Limit=64 KBs
    ;Base=? (this will be setup later in the program)
    ;10010010b = P=1 (present), DPL=0, S=1(code/data segment) T=0 (data)
    ;            E=0 (do not expand down), W=1 (writable), A=0 (not accessed)
    ;0h = limit bits 19-16=0, G=0 (1byte granularity), D=0 (16bit segment)
  vid16 desc <0ffffh,8000h,0bh,10010010b,0h,0>
    ;Limit=64 KBs
    ;Base=0b8000h (makes it easy to access video RAM while in PMODE-16)
    ;10010010b = P=1 (present), DPL=0, S=1(code/data segment) T=0 (data)
    ;            E=0 (do not expand down), W=1 (writable), A=0 (not accessed)
    ;0h = limit bits 19-16=0, G=0 (1byte granularity), D=0 (16bit segment)
  code32 desc <0ffffh,?,?,10011110b,11001111b,?>
    ;Limit=4 GBs (0fffffh * 4k = 4 GBs)
    ;Base=? (setup later in the program from prg_base32, so offsets are
    ;     relative to the start of the segs32 group)
    ;10011110b = P=1 (present), DPL=0, S=1(code/data segment) T=1 (code)
    ;            C=1(conforming), R=1 (readable), A=0 (not accessed)
    ;11001111b = limit bits 19-16=0fh, G=1 (4k granularity), D=1 (32bit segment)
  data32 desc <0ffffh,?,?,10010010b,11001111b,?>
    ;Limit=4 GBs
    ;Base=? (setup later in the program from prg_base32)
    ;10010010b = P=1 (present), DPL=0, S=1(code/data segment) T=0 (data)
    ;            E=0 (do not expand down), W=1 (writable), A=0 (not accessed)
    ;11001111b = limit bits 19-16=0fh, G=1 (4k granularity), D=1 (32bit segment)
  vid32 desc <0ffffh,08000h,0bh,10010010b,01000000b,0>
    ;Limit=64 KBs
    ;Base=0b8000h (easy access to the video RAM area)
    ;10010010b = P=1 (present), DPL=0, S=1(code/data segment) T=0 (data)
    ;            E=0 (do not expand down), W=1 (writable), A=0 (not accessed)
    ;01000000b = limit bits 19-16=0, G=0 (1byte granularity), D=1 (32bit segment)
  ldt desc <0,0,0,10000010b,0h,0>
    ;this is what will be loaded into our LDTR.  Because we don't want
    ;to use the LDT this is setup so that our LDT is empty
    ;Limit=1 byte (limits of 0 are not possible)
    ;Base=0
    ;10000010b = P=1 (present), DPL=0, S=0(system segment), TYPE=2 (LDT)
    ;       0h = limit bits 19-16=0, G=0 (1byte granularity), D=0 (16bit segment)
  gdt_size equ ($-gdt_start)

;define our IDT
;32 identical gates (vectors 0-31), all pointing at exc_handler.
;The desc structure is reused for the gate layout:
;  limit_lo = handler offset bits 15-0, base_lo = code selector,
;  type1 = 10001110b (P=1, DPL=0, 386 interrupt gate),
;  limit_hi/base_hi = handler offset bits 31-16 (0 here)
  idt_start equ $
  idt desc 32 dup (<offset exc_handler,selcode16,0,10001110b,0,0>)
  idt_size equ ($-idt_start)

;define our selectors
;(each one is the byte offset of its descriptor from the start of the GDT)
  selcode16 equ (offset code16-gdt_start)
  seldata16 equ (offset data16-gdt_start)
  selvid16 equ  (offset vid16 -gdt_start)
  seldata32 equ (offset data32-gdt_start)
  selcode32 equ (offset code32-gdt_start)
  selvid32 equ  (offset vid32 -gdt_start)
  selldt equ    (offset ldt   -gdt_start)

;define other variables needed
xms_max dw 1024  ;Max RAM to Alloc is 1024 KBs
xms_min dw 64    ;Min RAM to Alloc is 64 KBs
xms_left dw ?    ;int 15h will report this much is free
                 ;(KBs left over after our block; returned by the int15h hook)
xms_base dd ?    ;the start of our XMS RAM we have alloc via int 15h
                 ;(linear address = 1MB + xms_left*1k)
xms_size dd ?    ;the size of our XMS RAM (in bytes)

cpu db ?         ;detected CPU  (3=386 4=486 5=586)

prg_base16 dd ?    ;CS*10h      (linear address of the 16bit group)
prg_base32 dd ?    ;segs32*10h  (linear address of the 32bit group)

old_int15h dd ?    ;previous INT 15h vector (seg:off), saved by start16

;messages
;(all '$' terminated for DOS function 09h)
msg_welcome db 'DOS extender Tutorial : Stage #1',13,10,'$'
msg_XMS db 'XMS driver detected!  Can not continue!',13,10,'$'
msg_XMS_low db 'Insufficent XMS memory found!',13,10,'$'
msg_80386 db '80386 or better required!',13,10,'$'
msg_v86 db 'Another PMODE software is already loaded!',13,10,'$'
msg_clean_boot db 'Please use a clean boot to use this tutorial!',13,10,'$'
msg_a20 db 'Unable to enable A20?',13,10,'$'
data16seg ends

code16seg segment use16  ;start the 16bit code segment

;this section is used to report errors and quit the program
;In  : ds:dx -> '$' terminated error message (DOS function 09h format)
;Out : never returns; terminates with ERRORLEVEL 1 so that a calling
;      batch file can tell the program failed
exit16error:
  mov ah,9
  int 21h  ;print error message (the one the caller put in dx)
  mov dx,offset msg_clean_boot
  mov ah,9
  int 21h  ;print clean boot message
  mov ax,4c01h  ;ah=4ch (terminate), al=1 (non-zero return code = failure)
  int 21h  ;exit program

;this will detect if the CPU is a 80386 or better
;It works by testing which bits of (E)FLAGS can be changed.
;In     : nothing
;Return : cl = processor type  (0=8086/8088 2=286 3=386 4=486 5=586), ch = 0
;Clobbers ax and bx (the upper halves of eax and ebx are preserved).
;The caller's (E)FLAGS are put back before returning, so the probing
;leaves no NT/IOPL/AC/ID changes behind.
detect_processor proc             ; get processor: 286, 386, 486, or 586
  xor cx,cx                       ; processor type 0 in case of exit

  pushf                           ; keep caller's FLAGS, restored at exit

  pushf                           ; transfer FLAGS to BX
  pop bx

  mov ax,bx                       ; try to clear high 4 bits of FLAGS
  and ah,0fh

  push ax                         ; transfer AX to FLAGS
  popf
  pushf                           ; transfer FLAGS back to AX
  pop ax

  and ah,0f0h                     ; isolate high 4 bits
  cmp ah,0f0h
  je short detect_processordone   ; if bits are set, CPU is 8086/8

  mov cl,2                        ; processor type 2 in case of exit

  or bh,0f0h                      ; try to set high 4 bits of FLAGS

  push bx                         ; transfer BX to FLAGS
  popf
  pushf                           ; transfer FLAGS to AX
  pop ax

  and ah,0f0h                     ; isolate high 4 bits
  jz short detect_processordone   ; if bits are not set, CPU is 80286

  inc cx                          ; processor type 3 in case of exit

  push eax                        ; preserve 32bit registers
  push ebx

  pushfd                          ; transfer EFLAGS to EBX
  pop ebx

  mov eax,ebx                     ; try to flip AC bit in EFLAGS
  xor eax,40000h

  push eax                        ; transfer EAX to EFLAGS
  popfd
  pushfd                          ; transfer EFLAGS back to EAX
  pop eax

  xor eax,ebx                     ; AC bit fliped?
  jz short detect_processordone2  ; if no, CPU is 386

  inc cx                          ; processor type 4 in case of exit

  mov eax,ebx                     ; try to flip ID bit in EFLAGS
  xor eax,200000h

  push eax                        ; transfer EAX to EFLAGS
  popfd
  pushfd                          ; transfer EFLAGS back to EAX
  pop eax

  xor eax,ebx                     ; ID bit fliped?
  jz short detect_processordone2  ; if no, CPU is 486

  inc cx                          ; processor type 5, CPU is 586

detect_processordone2:
  push ebx                        ; EBX still holds the unflipped EFLAGS:
  popfd                           ;  put the AC and ID bits back

  pop ebx                         ; restore 32bit registers
  pop eax

detect_processordone:
  popf                            ; restore caller's FLAGS (undo NT/IOPL test)
  ret                             ; return
detect_processor endp

;Replacement INT 15h handler (installed by start16).
;Function 88h (get extended memory size) is answered with xms_left so
;that the memory above it, which this program uses, is no longer reported
;as free.  All other functions go to the previous handler.
;In  : ah = function number
;Out : ah=88h : ax = xms_left (KBs), carry clear in the returned FLAGS
;      other  : whatever the previous handler returns
int15h proc
  cmp ah,88h
  jne chain_int15h
  mov ax,cs:xms_left              ; DS belongs to the caller, so use CS
  push bp
  mov bp,sp                       ; [bp+2]=IP [bp+4]=CS [bp+6]=FLAGS
  and bptr [bp+6],0feh            ; clear CF in the FLAGS image IRET will load
  pop bp                          ;  (function 88h reports success as CF=0)
  iret
chain_int15h:
  jmp cs:[old_int15h]             ; far jump, old handler returns to caller
int15h endp

;Enable the A20 address line so memory above 1MB can be reached.
;Tries, in order: nothing (A20 may already be on), the PS/2 port 92h
;method, then the AT keyboard controller (8042) method, and finally polls
;for up to 800h timer ticks for the line to come on.
;In     : nothing
;Return : ax = 0 if A20 is enabled, ax = ERROR if it could not be enabled
;Clobbers cx.  FS, GS and the caller's FLAGS (including the interrupt
;flag) are restored on BOTH exits.
enablea20 proc                    ; hardware enable gate A20
  pushf
  push fs
  push gs
  cli

  xor ax,ax                       ; set A20 test segments 0 and 0ffffh
  mov fs,ax
  dec ax
  mov gs,ax

  call enablea20test              ; is A20 already enabled?
  jz short enablea20done          ; if yes, done

  in al,92h                       ; PS/2 A20 enable
  or al,2
  jmp short $+2                   ; short I/O delays
  jmp short $+2
  jmp short $+2
  out 92h,al

  call enablea20test              ; is A20 enabled?
  jz short enablea20done          ; if yes, done

  call enablea20kbwait            ; AT A20 enable
  jnz short enablea20f0           ; 8042 never became ready, just poll

  mov al,0d1h                     ; 8042 command: write output port
  out 64h,al

  call enablea20kbwait
  jnz short enablea20f0

  mov al,0dfh                     ; output port value with A20 gate on
  out 60h,al

  call enablea20kbwait

enablea20f0:                      ; wait for A20 to enable
  mov cx,800h                     ; do 800h tries

enablea20l0:
  call enablea20test              ; is A20 enabled?
  jz enablea20done                ; if yes, done

  in al,40h                       ; get current tick counter
  jmp short $+2
  jmp short $+2
  jmp short $+2
  in al,40h
  mov ah,al

enablea20l1:                      ; wait a single tick
  in al,40h
  jmp short $+2
  jmp short $+2
  jmp short $+2
  in al,40h
  cmp al,ah
  je enablea20l1

  dec cx
  jnz enablea20l0                 ; loop for another try

  mov ax,ERROR                    ; error, A20 did not enable
  jmp short enablea20exit         ; leave through the common epilogue so the
                                  ;  saved GS/FS/FLAGS come off the stack

enablea20done:
  xor ax,ax                       ; success

enablea20exit:                    ; common exit, ax = result
  pop gs
  pop fs
  popf                            ; restores the caller's interrupt flag
  ret

;local helper: ZF=1 when the 8042 input buffer is empty, ZF=0 on timeout
enablea20kbwait:                  ; wait for safe to write to 8042
  xor cx,cx                       ; up to 65536 polls
enablea20kbwaitl0:
  jmp short $+2
  jmp short $+2
  jmp short $+2
  in al,64h                       ; read 8042 status
  test al,2                       ; buffer full?
  loopnz enablea20kbwaitl0        ; if yes, loop
  ret

;local helper: ZF=1 when A20 is enabled.  With A20 off, 0ffffh:10h wraps
;around to 0:0, so a write there shows up at 0:0.  Needs FS=0, GS=0ffffh.
enablea20test:                    ; test for enabled A20
  mov al,fs:[0]                   ; get byte from 0:0
  mov ah,al                       ; preserve old byte
  not al                          ; modify byte
  xchg al,gs:[10h]                ; put modified byte to 0ffffh:10h
  cmp ah,fs:[0]                   ; set zero if byte at 0:0 not modified
  mov gs:[10h],al                 ; put back old byte at 0ffffh:10h
  ret                             ; return, zero if A20 enabled
enablea20 endp

;Program entry point (16bit real mode).
;Verifies the machine can be taken over (386 or better, no XMS driver,
;not running in V86 mode), claims extended memory through INT 15h,
;enables A20, fills in the GDT/IDT bases, switches to protected mode and
;jumps to start32.  Every failure leaves through exit16error; on success
;this code never returns.
start16:            ;this is where our program starts after DOS loads it

  cld               ;string instructions count upwards from now on

  mov ax,cs
  mov ds,ax
  mov es,ax         ;set DS=ES=CS

  mov dx,offset msg_welcome
  mov ah,9
  int 21h     ;print Welcome message

;determine if CPU is a 80386 or better
  call detect_processor
  cmp cl,3
  jae ok_386
  mov dx,offset msg_80386
  jmp exit16error
ok_386:
  mov cpu,cl  ;save for later

;determine if XMS is loaded
  mov ax,4300h
  int 2fh
  cmp al,80h
  jnz XMS_not_installed
  mov dx,offset msg_XMS
  jmp exit16error
XMS_not_installed:
;no XMS driver loaded so we may continue

;determine if we are in V86 mode (if yes then most likely EMM386,QEMM or
; Windoze is loaded so we can not continue)
;The VM bit can not be read with PUSHFD (it is always stored as 0 in the
;pushed image), so test the PE bit of the machine status word instead:
;PE=1 while we are running real mode code means the CPU is in V86 mode.
  smsw ax
  test al,1         ;check bit 0 (PE)
  jz ok_v86
  mov dx,offset msg_v86
  jmp exit16error
ok_v86:

;alloc memory from INT 15h
;We take up to xms_max KBs from the TOP of extended memory and later make
;INT 15h report only what is left below it (see the int15h handler).
  mov ah,88h
  int 15h      ;get total XMS 1k blocks free
  cmp ax,xms_min
  jae ok_XMS
  mov dx,offset msg_XMS_low
  jmp exit16error
ok_XMS:   ;ok there is enough RAM
  mov cx,ax          ;cx = total KBs
  sub ax,xms_max
  jnc ok_XMS2
  mov ax,0    ;leave no memory
ok_XMS2:
  mov xms_left,ax    ;KBs that stay available to others
  sub cx,ax          ;size of our XMS RAM
  xor ebx,ebx
  mov bx,ax
  mov eax,1024*1024  ;1MB
  shl ebx,10         ;*1k
  add eax,ebx        ;eax = 1MB + XMS_left * 1k  => XMS_base
  mov xms_base,eax
  xor eax,eax
  mov ax,cx
  shl eax,10         ;*1k
  mov xms_size,eax

;enable the a20 so we can access RAM above 1MB.
;This is done BEFORE hooking int 15h: if it fails we exit to DOS, and the
;interrupt vector must not be left pointing into our (freed) program.
  call enablea20
  cmp ax,ERROR
  jnz ok_a20
  mov dx,offset msg_a20
  jmp exit16error
ok_a20:

;Install our own int 15h handler
  mov ax,3515h
  int 21h            ;get int 15h  (returned in es:bx)
  mov wptr[old_int15h+2],es
  mov wptr[old_int15h+0],bx
  push ds
  pop es             ;int 21h changed ES, make it equal DS again

  mov ax,2515h
  mov dx,offset int15h
  int 21h            ;set int 15h  (ds:dx = new handler)

;setup some other variables
  xor eax,eax
  mov ax,cs
  shl eax,4    ;linear addr
  mov prg_base16,eax

  xor eax,eax
  mov ax,segs32
  shl eax,4
  mov prg_base32,eax  ;we will need this later

;setup all our descriptors
  mov eax,prg_base16
  mov code16.base_lo,ax
  mov data16.base_lo,ax
  shr eax,16
  mov code16.base_mid,al
  mov data16.base_mid,al
  mov code16.base_hi,ah
  mov data16.base_hi,ah

  mov eax,prg_base32
  mov code32.base_lo,ax
  mov data32.base_lo,ax
  shr eax,16
  mov code32.base_mid,al
  mov data32.base_mid,al
  mov code32.base_hi,ah
  mov data32.base_hi,ah


;setup GDT and IDT base
  mov eax,prg_base16
  add eax,gdt_start
  mov gdt_addr,eax
  mov eax,prg_base16
  add eax,idt_start
  mov idt_addr,eax

;we are now ready to move to 16bit PMODE

  cli   ;no more IRQs allowed past this point

  lidt [idtr]
  lgdt [gdtr]

;clear NT and IOPL
;Only the HIGH byte of the pushed FLAGS word is changed.  This must be a
;byte operation: a word operation at [bp+1] would also zero the byte
;that lies above the FLAGS word on the stack.
  pushf
  mov bp,sp
  and bptr [bp+1],08fh ;40h = NT bit , 30h = IOPL bits
  popf

;goto PMODE!
  mov eax,cr0
  or al,1           ;set PM bit
  mov cr0,eax       ;we are now in 16bit PMODE
  db 0eah           ; JMP FAR PTR SELCODE:$+4
  dw $+4,selcode16  ;  (clear prefetch que)

  mov ax,seldata16  ;reload all values
  mov ds,ax
  mov fs,ax
  mov gs,ax
  mov es,ax

  mov ss,ax
  mov sp,TOS16

;load our LDT (which is empty)  (this is not necessary)

  mov ax,selldt
  lldt ax

;setup a IRETD that will jump into our 32bit segment
;(IRETD pops EIP, CS and EFLAGS, so push them in the reverse order)

  pushfd
  push dptr selcode32
  push dptr offset start32
  iretd   ;goto 32bit segment

;Common handler behind every gate in the IDT.
;It is entered for any exception or IRQ, but simply treats each entry as
;IRQ#0 (timer): the current value of 'timer' is stored in the first byte
;of video RAM, 'timer' is incremented, and the PIC is acknowledged.
;All registers are preserved.
exc_handler proc
  push eax
  push ds
  push es
  mov ax,seldata32        ;ds -> 32bit data (where 'timer' lives)
  mov ds,ax
  mov ax,selvid32         ;es -> video RAM
  mov es,ax
assume ds:segs32
  mov al,timer            ;al = value to show
  inc timer               ;next tick shows the next character
assume ds:segs16
  mov bptr es:[0],al      ;character cell at the top left of the screen
  mov al,20h
  out 20h,al              ;end of interrupt to the master PIC
  pop es
  pop ds
  pop eax
  iretd                   ;gates are 32bit, so return with a 32bit frame
exc_handler endp
code16seg ends

;everything from here on is 32bit and runs with CS=DS=SS=segs32
assume cs:segs32,ds:segs32,ss:segs32,es:NOTHING

data32seg segment use32
  x dd 0     ;coords of cursor on screen
  y dd 0     ;(x = column, y = row; print wraps them at 80 and 25)
  timer db 0 ;shown at the top left of the screen and incremented by
             ;exc_handler on every interrupt

;32bit messages
;(for print: 13 = go to the start of the next line, 0 = end of string;
; the purpose of the extra trailing zeros is not evident from this file)
  msg32_timer db ' = TIMER',13,0,0,0,0,0,0
  msg32_welcome db 'Welcome to 32bit PMODE!',13,0,0,0,0,0,0
data32seg ends

code32seg segment use32
  ;32bit protected mode entry point, reached from start16 through IRETD.
  ;Sets up the 32bit segment registers and stack, prints two messages,
  ;unmasks the timer IRQ and then idles forever.
  start32:
;video selector goes in ES, the 32bit data selector everywhere else
  mov ax,selvid32
  mov es,ax
  mov ax,seldata32
  mov ss,ax
  mov esp,TOS32        ;new stack = top of stack32seg
  mov ds,ax
  mov fs,ax
  mov gs,ax

;start with an empty screen, cursor at 0,0
  call clrscr

;first line: label for the timer character drawn by exc_handler
  mov esi,offset msg32_timer
  call print

;second line: greeting
  mov esi,offset msg32_welcome
  call print

;program the PIC masks so that only the timer can interrupt
  mov al,0ffh
  out 0a1h,al   ;slave PIC: everything masked
  mov al,0feh
  out 021h,al   ;master PIC: only IRQ#0 unmasked

  sti  ;from now on exc_handler runs on every timer tick

  jmp $  ;loop here forever; returning to RMODE is left for
         ;the next tutorial source

;Fill the 80x25 text screen with white-on-black spaces and move the
;cursor (x,y) to the top left corner.
;In  : es = video selector, direction flag clear
;Clobbers ax, ecx, edi
clrscr proc
  mov ax,0720h        ;attribute 07h (white on black), character 20h (space)
  mov ecx,80*25       ;number of character cells
  xor edi,edi         ;start of video RAM (selector base=0b8000h)
  rep stosw           ;ecx is 0 when this finishes
  mov x,ecx           ;cursor column = 0
  mov y,ecx           ;cursor row = 0
  ret
clrscr endp

;Write a string to the text screen at the cursor (x,y) and advance it.
;In  : ds:esi = string; a 0 byte ends it, a 13 byte moves to the start
;      of the next line
;      es = video selector
;Only the character bytes are written, the colour bytes are left alone.
;After column 79 the cursor goes to the next line, after row 24 it goes
;back to row 0.  Clobbers esi and edi.
print proc
p0:                    ;(re)compute the video offset of the cursor
  imul edi,y,80*2      ;80*2 = # bytes/row (in text mode)
  add edi,x
  add edi,x            ;2 bytes per cell (character + colour)
p1:                    ;per character loop
  cmp bptr[esi],13
  je p13               ;carriage return
  cmp bptr[esi],0
  je p3                ;end of string
  movsb                ;ds:esi => es:edi
  inc edi              ;skip over color byte
  inc x
  cmp x,80
  jne p1               ;still on the same line
  jmp p2               ;ran off the right edge
p13:
  inc esi              ;step over the 13 itself
p2:                    ;skip to next line
  mov x,0
  inc y
  cmp y,25
  jne p0
  mov y,0              ;past the last row: wrap to the top
  jmp p0
p3:
  ret
print endp

code32seg ends

end start16     ;end of source; DOS starts the program at start16 (in code16seg)
