comment ! 
FSU - ULTRA	The greatest random number generator that ever was
		or ever will be.  Way beyond Super-Duper.
		(Just kidding, but we think its a good one.)

Authors:	Arif Zaman (arif@stat.fsu.edu) and
		George Marsaglia (geo@stat.fsu.edu).

Date:		27 May 1992

Version:	1.05

Copyright:	To obtain permission to incorporate this program into
		any commercial product, please contact the authors at
		the e-mail address given above or at

		Department of Statistics and
		Supercomputer Computations Research Institute
		Florida State University
		Tallahassee, FL 32306.

See Also:	README		for a brief description
		ULTRA.DOC	for a detailed description

-----------------------------------------------------------------------
! 
;
; This file defines the CODE segment to be included in a main file
; see ultra.doc for details.
;
cong  macro     ; Congruential generator ==============================
;
; argument in dx:ax
; result   in dx:ax = low32bits of 69069 * arg
; clobbered   bx
;
; We need the lower 32 bits of 69069 * dx:ax
; Since we only have 16 bit multiplys with 32 bit answers,
; we use long multiplication. Let (a,b) represent a*2^16 + b.
;
;            69069 =                      (  1, 3533)
;            dx:ax =                 x    ( hi, lo  )
;                                 -------------------
;                     3533 * lo =         ( a1, a2  )
;                     3533 * hi =     ( b1, b2 )
;                        1 * lo =     (  0, lo )
;                                 -------------------
;            low 32 bits of answer = (a1+b2+lo, a2  )         
;
      mov  bx,ax
      mov  ax,3533
      mul  dx                     ; dx:ax = b1,b2 = 3533*hi
      xchg bx,ax
      add  bx,ax                  ; bx = b2+lo;     ax = lo;
      mov  dx,3533
      mul  dx                     ; dx:ax = a1,a2 = 3533*lo
      add  dx,bx                  ; dx = a1+b2+lo
      endm

shreg macro     ; Shift register sequence =============================
;
;  argument is dx:ax
;  result   in dx:ax
;  clobbered   bx
;
;  consider the ax register as one sign bit `as' and the rest `a..r'
;  similarly decompose dx. The operation we want to do is:
;
;          x = x xor (x shr 15);
;          x = x xor (x shl 17);
;
;  Which can be shown in terms of the decomposed registers as:
;
;     x        = ds, d..r, as, a..r
;     x shr 15 =       ds, d..r  as
;                ------------------
;     x        = ds, d..r, as, a..r
;     x shl 17 = a..r   0
;                ------------------
;
;  The above operation is equivalent to:
;
;           ax = ax xor [d..r,as]
;           dx = dx xor [a..r,ds]
;
      mov  bh,ah
      shl  bh,1                 ; mov as into carry
      mov  bx,dx
      rcl  bx,1                 ; bx = [d..r,as]
      pushf                     ; save carry flag (=ds)
      xor  ax,bx                ; ax = ax xor bx
      popf                      ; restore the carry
      mov  bx,ax
      rcl  bx,1                 ; bx = [a..r,ds]
      xor  dx,bx                ; dx = dx xor bx
      endm

rinit proc		; INITIALIZE SEED ARRAY =======================
	Enter2arg
;
; On exit:
;    The SWBseed array is set using sign bits from the McGill
;    Super-Duper generator, which is a mix of a congruential and
;    a shift-register sequence. Flags and counters are reset.
;
; Registers Clobbered:  AX,BX,CX,DX,SI,DI.
;
; Algorithm:
;    Starting from lsm of x[0] to the msb of x[N-1], each bit is
;    formed using the sign bit of the xor of a congruential generator
;    with seed ConX and a shift register sequence with seed ShrX.
;
; Register usage
;   cx contains count for two loops
;      ch counts the outer loop (for each byte of the array x)
;      cl counts for each bit of x[i]
;   bl contains the bits until eight of them are gathered
;   es:di point to where x[i] 
;   dx:ax are used for computations
;
    mov ch,N*4
    mov di,offset swbseed  ; do loop for x[0], ... , x[4*n] (bytes)
nextbyte:
	mov bl,0        ; 8 bits of bl are made bit by bit
	mov cl,8        
nextbit:    push bx                     ; compute the cong and the sh-reg
	    mov  ax,conglo              ; generators in place
	    mov  dx,conghi
	    cong
	    mov  conglo,ax
	    mov  conghi,dx
	    mov  ax,shrglo
	    mov  dx,shrghi
	    shreg
	    mov  shrglo,ax
	    mov  shrghi,dx
	    pop  bx 

	    xor  dh,byte ptr conghi+1   ; xor the top byte of arg1 and arg2
	    rcl  dh,1           ; move the sign bit into the carry
	    rcr  bl,1           ; shift it into bl
	dec cl
	jnz nextbit
	mov al,bl
	stosb               ; Store the answer in swbb[i]
    dec ch
    jnz nextbyte
;
; reset all counters, flags etc. and return.
;
	xor bx,bx
	mov swb32.c,bx
	mov swb16.c,bx
	mov swb8.c,bx
	mov swb1.c,bx
	mov flags,0
	Exit2arg
rinit	endp

SWBfill proc near	; SUBTRACT-WITH-BORROW ========================
;
;  On Entry:
;     DS should point to the data segment.
;     BX contains offset to swb??.x the data array for the answer
;
;  On Exit:
;     SWBseed contains all new values computed by the SWB generator.
;     carry   contains the carry flag from the last subtraction.
;     The array pointed to by [BX] contains the seed xor'ed with
;	a congruential sequence.
;
;  Registers Clobbered: AX,BX,CX,DX,SI,DI
;
;  Algorithm:
;     The following subtractions are performed from right to left.
;     The carry propagates as though these were two long numbers,
;     with each x[i] being a `digit' in base 2^32.
;
;        x[12] ...  x[ 0]      x[36] ...  x[13]
;       -x[36] ... -x[24]     -x[23] ... -x[ 0]
;       ---------------------------------------
;        x[36] ...  x[24]      x[23] ...  x[ 0]
;
;     The x's also could be considered as pairs of base 16 digits,
;     so that x[i] is the pair y[2i+1]y[2i]. This allows us to use
;     only 16 bit subtractions with carry, perfectly suited for all
;     80x86 processors. The same idea could be extended for machines
;     with only eight bit, or even only 1 bit arithmetic.
;
	EnterFill
	push bx			; Save the location for the results
	mov ah,byte ptr flags   ; set carry flag to what it was the
	sahf                    ; last time we exited this routine

	mov cx,24*2             ; will do first loop 48 times
	mov si,offset(swbseed)+13*4; set up ds:si -> x[13]
	mov di,offset(swbseed)     ; set up es:di -> x[0]

loop1:  ; On a 8086, the instructions `lodsw', `stosw' and `loop'
	; all change registers automatically as noted in parentheses

	lodsw                   ;    ax = y[i+26]     ( si = si+2 )
	sbb ax,[di]             ;    ax = ax-y[i]-carry
	stosw                   ;    y[i] = ax        ( di = di+2 )
	loop loop1              ; loop for i=0..47    ( cx = cx-1 )

	mov cx,13*2             ; will do next loop 26 times
	mov si,offset(swbseed)     ; set up ds:si -> y[0]
			        ; es:di is already set up
loop2:
	lodsw                   ;    ax = y[i-48]     ( si = si+2 )
	sbb ax,[di]             ;    ax = ax-y[i]-carry
	stosw                   ;    x[i] = ax        ( di = di+2 )
	loop loop2              ; loop for i=48..73   ( cx = cx-1 )

	lahf
	mov byte ptr flags,ah   ; save carry flag for next time
;
; XOR the elements of x with a congruential generator and put the
; result in swbx
;
	mov  si,offset(swbseed)	; the source is the seed
	pop  di			; the destination is one of swb??.x
	mov  [di-4],di		; set swb??.p = & swb??.x[0]
IFNDEF fast
	mov  cx,N
	mov  ax,conglo
	mov  dx,conghi
loopc:  cong
	mov  bx,ax
	lodsw
	xor  ax,bx
	stosw
	lodsw
	xor  ax,dx
	stosw
	mov  ax,bx
	loop loopc
	mov  conglo,ax
	mov  conghi,dx
ELSE
	mov  cx,2*N
	rep  movsw
ENDIF
	ExitFill
SWBfill	endp

; Random Number procedures ============================================
;
CheckFill MACRO bits,bytes,count
	local ok
	dec	bits.c
	jns	ok
	    mov   bx,offset(bits.x)
	    call  SWBfill
	    mov   bits.c,count-1
ok:	mov	bx,bits.p
	add	bits.p,bytes
ENDM

i32bit proc
	EnterProcedure
	CheckFill swb32,4,N
	mov	ax,[bx]
	mov	dx,[bx+2]
	DwordFn
	ExitProcedure
i32bit endp

i31bit proc
	EnterProcedure
	CheckFill swb32,4,N
	mov	ax,[bx]
	mov	dx,[bx+2]
	and	dh,7Fh
	DwordFn
	ExitProcedure
i31bit endp

i16bit proc
	EnterProcedure
	CheckFill swb16,2,2*N
	mov	ax,[bx]
	WordFn
	ExitProcedure
i16bit endp

i15bit proc
	EnterProcedure
	CheckFill swb16,2,2*N
	mov	ax,[bx]
	and	ah,7Fh
	WordFn
	ExitProcedure
i15bit endp

i8bit proc
	EnterProcedure
	CheckFill swb8,1,4*N
	mov	al,[bx]
	ByteFn
	ExitProcedure
i8bit endp

i7bit proc
	EnterProcedure
	CheckFill swb8,1,4*N
	mov	al,[bx]
	and	al,7Fh
	ByteFn
	ExitProcedure
i7bit endp

i1bit proc
	EnterProcedure di
	dec	swb1.c
	jns	ok1
	    CheckFill  swb32,4,N	; do an i32bit
	    mov   dx,[bx+2]
	    mov   bx,[bx]		; with answer in dx:bx
	    mov   swb1.c,31
	    mov   di,offset(swb1.x)
	    mov   swb1.p,di
	    mov   ax,ds
	    mov   es,ax

	    mov   cx,16
stosb1:	    mov   al,1
	    and   al,bl
	    stosb
	    shr   bx,1
	    loop  stosb1

	    mov	  cl,16
stosb2:	    mov   al,1
	    and   al,dl
	    stosb
	    shr   dx,1
	    loop  stosb2
ok1:	mov	bx,swb1.p
	inc	swb1.p
	mov	al,[bx]
	ByteFn
	ExitProcedure di
i1bit endp

uni proc
	EnterProcedure
	fild	neg31
	CheckFill  swb32,4,N
	and	byte ptr [bx+3],7Fh
	jnz	oku
	    mov	  ax,[bx]
	    mov   tmpw3,ax
	    mov	  ax,[bx+2]
	    mov	  tmpw4,ax
	    CheckFill  swb32,4,N
	    mov	  ax,[bx]
	    mov   tmpw1,ax
	    mov	  ax,[bx]
	    mov   tmpw2,ax
	    fild  tmpq
	    jmp   uxit
oku:	fild	dword ptr [bx]
uxit:	fscale			; scale the integer by the exponent
	fstp st(1)		; dump the exponent from the fpp
	RealFn
	ExitProcedure
uni endp

vni proc
	EnterProcedure
	fild	neg31
	CheckFill  swb32,4,N
	test	byte ptr [bx+3],0FFh
	jnz	okv
	    mov	  ax,[bx]
	    mov   tmpw3,ax
	    mov   ax,[bx+2]
	    mov   tmpw4,ax
	    CheckFill  swb32,4,N
	    mov   ax,[bx]
	    mov   tmpw1,ax
	    mov   ax,[bx+2]
	    mov   tmpw2,ax
	    fild  tmpq
	    jmp   vxit
okv:	fild	dword ptr [bx]
vxit:	fscale			; scale the integer by the exponent
	fstp	st(1)		; dump the exponent from the fpp
	RealFn
	ExitProcedure
vni endp

duni proc
	EnterProcedure
	fild	neg63
	CheckFill  swb32,4,N
	dec	swb32.c
	jns	okdu
	    mov	  ax,[bx]
	    mov   tmpw1,ax
	    mov	  ax,[bx+2]
	    mov   tmpw2,ax
	    mov   bx,offset(swb32.x)
	    call  SWBfill
	    mov   swb32.c,N-1
	    mov   ax,word ptr swb32.x
	    mov   tmpw3,ax
	    mov   ax,word ptr swb32.x+2
	    mov   tmpw4,ax
	    and   byte ptr [tmpq+7],7Fh
	    fild  tmpq
	    jmp   duxit
okdu:	and	byte ptr [bx+7],7Fh
	fild	qword ptr [bx]
duxit:	add	swb32.p,4
	fscale
	fstp	st(1)
	DoubleFn
	ExitProcedure
duni endp

dvni proc
	EnterProcedure
	fild	neg63
	CheckFill  swb32,4,N
	dec	swb32.c
	jns	okdv
	    mov   ax,[bx]
	    mov   tmpw1,ax
	    mov   ax,[bx+2]
	    mov   tmpw2,ax
	    mov   bx,offset(swb32.x)
	    call  SWBfill
	    mov   swb32.c,N-1
	    mov   ax,word ptr swb32.x
	    mov   tmpw3,ax
	    mov   ax,word ptr swb32.x+2
	    mov   tmpw4,ax
	    fild  tmpq
	    jmp   dvxit
okdv:	fild	qword ptr [bx]
dvxit:	add	swb32.p,4
	fscale
	fstp	st(1)
	DoubleFn
	ExitProcedure
dvni endp

