;    Copyright (C) 1989, 1992, 1993 Aladdin Enterprises.  All rights reserved.
;
; This file is part of Ghostscript.
;
; Ghostscript is distributed in the hope that it will be useful, but
; WITHOUT ANY WARRANTY.  No author or distributor accepts responsibility
; to anyone for the consequences of using it or for whether it serves any
; particular purpose or works at all, unless he says so in writing.  Refer
; to the Ghostscript General Public License for full details.
;
; Everyone is granted permission to copy, modify and redistribute
; Ghostscript, but only under the conditions described in the Ghostscript
; General Public License.  A copy of this license is supposed to have been
; given to you along with Ghostscript so you can know your rights and
; responsibilities.  It should be in a file named COPYING.  Among other
; things, the copyright notice and this notice must be preserved on all
; copies.

; iutilasm.asm
; Assembly code for Ghostscript interpreter on MS-DOS systems

	ifdef	FOR80386

; Only the 286 instruction set is enabled in the assembler, even for the
; 80386 build: the 32-bit instructions used in this file are produced by
; hand, with the OP32 prefix macro and explicit opcode bytes.
	.286c

	endif

; All the code in this file is placed in one word-aligned public code
; segment.
utilasm_TEXT	SEGMENT	WORD PUBLIC 'CODE'
	ASSUME	CS:utilasm_TEXT


	ifdef	FOR80386

; OP32 -- emit a 66h prefix byte in front of the instruction that follows.
; In this 16-bit code segment the prefix selects 32-bit operands, so
;	  OP32
;	mov	ax,dx
; is executed as "mov eax,edx".
OPSIZE32	equ	66h			; operand-size prefix byte
OP32	macro
	db	OPSIZE32
	endm

	endif					; FOR80386


	ifdef	FOR80386

; Replace the multiply and divide routines in the Turbo C library
; when assembling for an 80386 (FOR80386 defined).

; swap r32 -- exchange the upper and lower 16 bits of a 32-bit register
; by rotating it left 16 places.
; masm will not accept a rotate count of 16 here, so the instruction is
; assembled by hand: OP32 prefix, opcode 0C1h, a ModRM byte of 0C0h plus
; the register number, and the count.
; The argument must be one of the register numbers defined below.
swap	macro	r32
	  OP32
	db	0c1h
	db	0c0h+r32
	db	16
	endm

; Register numbers accepted by swap.
regax	equ	0				; (e)ax
regcx	equ	1				; (e)cx
regdx	equ	2				; (e)dx
regbx	equ	3				; (e)bx

; LXMUL@ / F_LXMUL@ -- 32-bit multiply helper.
; In:	(dx,ax) = first operand, (cx,bx) = second operand.
; Out:	(dx,ax) = low 32 bits of the product.
; cx and the upper halves of eax, ecx and edx are overwritten.
	PUBLIC	F_LXMUL@
	PUBLIC	LXMUL@
F_LXMUL@ proc	far
LXMUL@	proc	far
		; Pack (cx,bx) into ecx: move cx to the upper half,
		; then drop bx into the lower half.
	swap	regcx
	mov	cx,bx
		; Pack (dx,ax) into edx the same way.
	swap	regdx
	mov	dx,ax
		; edx = edx * ecx
	  OP32
	db	0fh,0afh,0d1h			; imul edx,ecx
		; Unpack: eax = edx gives the low word in ax,
		; then rotate edx so that dx holds the high word.
	  OP32
	mov	ax,dx
	swap	regdx
	ret
LXMUL@	endp
F_LXMUL@ endp

; Divide two stack operands, leave the result in (dx,ax).
; The four 80386 division routines share the entry and exit macros
; defined here.

; setup32 -- copy sp to bx and load the 32-bit dividend from ss:[bx+4]
; into eax.  bx is taken before anything is pushed, so the operand
; offsets are the same whether or not DEBUG is defined.
; When DEBUG is defined, a bp frame is set up as well.
setup32	macro
	mov	bx,sp
	ifdef	DEBUG
	push	bp
	mov	bp,sp
	endif					; DEBUG
	  OP32
	mov	ax,ss:[bx+4]			; dividend
	endm

; ret32 nbytes -- undo the bp frame if setup32 made one, then return,
; removing nbytes of arguments from the stack.
ret32	macro	nbytes
	ifdef	DEBUG
	mov	sp,bp
	pop	bp
	endif					; DEBUG
	ret	nbytes
	endm

	PUBLIC	LDIV@, LUDIV@, LMOD@, LUMOD@
	PUBLIC	F_LDIV@, F_LUDIV@, F_LMOD@, F_LUMOD@
; LDIV@ / F_LDIV@ -- signed 32-bit quotient.
; In:	32-bit dividend at ss:[sp+4] and 32-bit divisor at ss:[sp+8]
;	on entry.
; Out:	(dx,ax) = quotient; the 8 bytes of arguments are popped.
; bx and the upper halves of eax and edx are overwritten.
F_LDIV@	proc	far
LDIV@	proc	far
		; eax = dividend, bx = frame pointer for the divisor
	setup32
	  OP32
	cwd					; prefixed: sign-extend eax into edx
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	dx,ax				; prefixed: edx = quotient
		; rotate edx so that dx = high word of the quotient
	swap	regdx
	ret32	8
LDIV@	endp
F_LDIV@	endp
; LUDIV@ / F_LUDIV@ -- unsigned 32-bit quotient.
; In:	32-bit dividend at ss:[sp+4] and 32-bit divisor at ss:[sp+8]
;	on entry.
; Out:	(dx,ax) = quotient; the 8 bytes of arguments are popped.
; bx and the upper halves of eax and edx are overwritten.
F_LUDIV@ proc	far
LUDIV@	proc	far
		; eax = dividend, bx = frame pointer for the divisor
	setup32
	  OP32
	xor	dx,dx				; prefixed: edx = 0 (zero-extend)
	  OP32
	div	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	dx,ax				; prefixed: edx = quotient
		; rotate edx so that dx = high word of the quotient
	swap	regdx
	ret32	8
LUDIV@	endp
F_LUDIV@ endp
; LMOD@ / F_LMOD@ -- signed 32-bit remainder.
; In:	32-bit dividend at ss:[sp+4] and 32-bit divisor at ss:[sp+8]
;	on entry.
; Out:	(dx,ax) = remainder; the 8 bytes of arguments are popped.
; bx and the upper halves of eax and edx are overwritten.
F_LMOD@	proc	far
LMOD@	proc	far
		; eax = dividend, bx = frame pointer for the divisor
	setup32
	  OP32
	cwd					; prefixed: sign-extend eax into edx
	  OP32
	idiv	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	ax,dx				; prefixed: eax = remainder
		; rotate edx so that dx = high word of the remainder
	swap	regdx
	ret32	8
LMOD@	endp
F_LMOD@	endp
; LUMOD@ / F_LUMOD@ -- unsigned 32-bit remainder.
; In:	32-bit dividend at ss:[sp+4] and 32-bit divisor at ss:[sp+8]
;	on entry.
; Out:	(dx,ax) = remainder; the 8 bytes of arguments are popped.
; bx and the upper halves of eax and edx are overwritten.
F_LUMOD@ proc	far
LUMOD@	proc	far
		; eax = dividend, bx = frame pointer for the divisor
	setup32
	  OP32
	xor	dx,dx				; prefixed: edx = 0 (zero-extend)
	  OP32
	div	word ptr ss:[bx+8]		; divisor
	  OP32
	mov	ax,dx				; prefixed: eax = remainder
		; rotate edx so that dx = high word of the remainder
	swap	regdx
	ret32	8
LUMOD@	endp
F_LUMOD@ endp

	else					; !FOR80386

; Replace the divide routines in the Turbo C library,
; which do the division one bit at a time (!).
; Each routine is exported under two names, with and without an F_
; prefix; both names label the same entry point.

	PUBLIC	LDIV@, LMOD@, LUDIV@, LUMOD@
	PUBLIC	F_LDIV@, F_LMOD@, F_LUDIV@, F_LUMOD@

; negbp argofs -- negate, in place, the long stored at [bp+argofs].
; The high word is negated first.  Negating the low word then leaves the
; carry set exactly when the low word was non-zero, and that borrow is
; subtracted from the high word.
; The parameter name deliberately avoids the assembler keyword OFFSET.
negbp	macro	argofs
	neg	word ptr [bp+argofs+2]		; high word
	neg	word ptr [bp+argofs]		; low word, sets the borrow
	sbb	word ptr [bp+argofs+2],0	; take the borrow from the high word
	endm

; Negate a long in (dx,ax).
; The order matters: "neg ax" leaves the carry set when ax was non-zero,
; and the sbb then takes that borrow out of the already negated dx.
negr	macro
	neg	dx				; high word
	neg	ax				; low word, sets the borrow
	sbb	dx,0				; take the borrow from the high word
	endm

; Unsigned long division of two stack operands; either the quotient or
; the remainder is returned in (dx,ax).
;
; Frame layout once bp (and nothing else) has been pushed:
;	[bp+0]	saved bp
;	[bp+2]	far return address (4 bytes)
;	then the numerator and the denominator, low word first.
nlo	equ	2+4				; numerator, low word	 (6)
nhi	equ	nlo+2				; numerator, high word	 (8)
dlo	equ	nhi+2				; denominator, low word	 (10)
dhi	equ	dlo+2				; denominator, high word (12)

; bx carries a selector that says which result to return (quotient or
; remainder) and whether it has to be negated.  The values are byte
; offsets into the four-entry word jump tables used at the exits.
odiv	equ	0*2				; quotient
omod	equ	1*2				; remainder
odivneg	equ	2*2				; negated quotient
omodneg	equ	3*2				; negated remainder
; LMOD@ / F_LMOD@ -- signed remainder.
; Replaces both stack operands by their absolute values, then enters the
; common unsigned code at udiv with bx = omod, or omodneg when the
; numerator was negative (the remainder takes the sign of the numerator).
; The label negnum is also used by LDIV@.
F_LMOD@	proc	far
LMOD@	proc	far
	push	bp
	mov	bp,sp
	mov	bx,omod
			; Take abs of denominator
			; (the compare tests the sign of its top byte)
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	modpd
	negbp	dlo
modpd:			; Negate mod if numerator < 0
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
	mov	bx,omodneg
			; Take abs of numerator; bx is left unchanged here,
			; so LDIV@ can jump in with its own selector.
negnum:	negbp	nlo
	jmp	udiv
LMOD@	endp
F_LMOD@	endp
; LUMOD@ / F_LUMOD@ -- unsigned remainder.
; Selects the remainder in bx and joins LUDIV@ at udpush, where bp is
; pushed and the division is carried out.
F_LUMOD@ proc	far
LUMOD@	proc	far
	mov	bx,omod
	jmp	udpush
LUMOD@	endp
F_LUMOD@ endp
; LDIV@ / F_LDIV@ -- signed quotient.
; Chooses odivneg when the operands have different signs and odiv
; otherwise, replaces both stack operands by their absolute values, and
; enters the common unsigned code at udiv (through negnum in LMOD@ when
; the numerator has to be negated).
F_LDIV@	proc	far
LDIV@	proc	far
	push	bp
	mov	bp,sp
	mov	bx,odiv
			; Negate quo if num^den < 0
			; (xor leaves the sign flag = sign of num^den)
	mov	ax,[bp+nhi]
	xor	ax,[bp+dhi]
	jge	divabs
	mov	bx,odivneg
divabs:			; Take abs of denominator
	cmp	byte ptr [bp+dhi+1],bh		; bh = 0
	jge	divpd
	negbp	dlo
divpd:			; Take abs of numerator
	cmp	byte ptr [bp+nhi+1],bh		; bh = 0
	jge	udiv
	jmp	negnum
LDIV@	endp
F_LDIV@	endp
; LUDIV@ / F_LUDIV@ -- unsigned quotient, plus the common division code
; that LDIV@, LMOD@ and LUMOD@ jump into.
;
; Entry points:
;	LUDIV@	sets bx = odiv and falls into udpush.
;	udpush	bx already holds the result selector; bp is pushed here.
;	udiv	bp is already pushed and bx already set.
; In:	[bp+nlo],[bp+nhi] = numerator, [bp+dlo],[bp+dhi] = denominator,
;	both treated as unsigned.
; Out:	(dx,ax) = quotient (odiv, odivneg) or remainder (omod, omodneg),
;	negated for the two ...neg selectors.  bp is restored and the
;	8 bytes of arguments are popped.  bx and cx are overwritten.
;
; Three cases are handled:
;	denominator < 2^16, quotient < 2^16	one div
;	denominator < 2^16, quotient >= 2^16	two divs (div1)
;	denominator >= 2^16			trial quotient and
;						correction (div2)
; Each case ends by dispatching through a jump table in the code
; segment, indexed by the selector.
F_LUDIV@ proc	far
LUDIV@	proc	far
	mov	bx,odiv
udpush:	push	bp
	mov	bp,sp
udiv:	push	bx				; odiv, omod, odivneg, omodneg
	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	mov	bx,[bp+dlo]
	mov	cx,[bp+dhi]
; Now we are dividing dx:ax by cx:bx.
; Check to see whether this is really a 32/16 division.
	or	cx,cx
	jnz	div2
; 32/16, check for 16- vs. 32-bit quotient
	cmp	dx,bx
	jae	div1
; 32/16 with 16-bit quotient, just do it.
	div	bx				; ax = quo, dx = rem
	pop	bx				; result selector
	pop	bp
	jmp	cs:xx1[bx]
	even
xx1	dw	offset divx1
	dw	offset modx1
	dw	offset divx1neg
	dw	offset modx1neg
; 16-bit results are zero-extended into dx.
modx1:	mov	ax,dx
divx1:	xor	dx,dx
	ret	8
modx1neg: mov	ax,dx
divx1neg: xor	dx,dx
; Common exit for negated results.
rneg:	negr
	ret	8
; 32/16 with 32-bit quotient, do in 2 parts.
div1:	mov	cx,ax				; save lo num
	mov	ax,dx
	xor	dx,dx
	div	bx				; ax = hi quo
	xchg	cx,ax				; save hi quo, get lo num
	div	bx				; ax = lo quo, dx = rem
	pop	bx				; result selector
	pop	bp
	jmp	cs:xx1a[bx]
	even
xx1a	dw	offset divx1a
	dw	offset modx1
	dw	offset divx1aneg
	dw	offset modx1neg
divx1a:	mov	dx,cx				; hi quo
	ret	8
divx1aneg: mov	dx,cx
	jmp	rneg
; This is really a 32/32 bit division.
; (Note that the quotient cannot exceed 16 bits.)
; The following algorithm is taken from pp. 235-240 of Knuth, vol. 2
; (first edition).
; Start by normalizing the numerator and denominator:
; shift dx:ax and cx:bx right together until cx is zero.
div2:	or	ch,ch
	jz	div21				; ch == 0, but cl != 0
; Do 8 steps all at once.
	mov	bl,bh
	mov	bh,cl
	mov	cl,ch
	xor	ch,ch
	mov	al,ah
	mov	ah,dl
	mov	dl,dh
	xor	dh,dh
; The rol is undone by the rcr at div2a, so bx enters the loop unchanged.
	rol	bx,1				; faster than jmp
div2a:	rcr	bx,1				; finish previous shift
div21:	shr	dx,1
	rcr	ax,1
	shr	cx,1				; carry = bit to shift into bx
	jnz	div2a
	rcr	bx,1
; Now we can do a 32/16 divide.
div2x:	div	bx				; ax = quo, dx = rem
; Multiply by the denominator, and correct the result.
; The trial quotient is never too small, but it can be too large, and
; then trial * denominator may not fit in 32 bits (for example
; 0FFFFFFFFh / 55555556h yields a trial quotient of 3).  A product that
; overflows is certainly larger than the numerator, so the trial
; quotient is lowered and the multiplication repeated, rather than
; carrying on with a truncated product.  The loop ends at the latest
; when cx reaches the true quotient, whose product fits.
	mov	cx,ax				; save quotient
divmul:	mul	word ptr [bp+dhi]		; ax = cx on arrival here
	jc	divbig				; hi product needs > 16 bits
	mov	bx,ax				; save lo part of hi product
	mov	ax,cx
	mul	word ptr [bp+dlo]
	add	dx,bx
	jnc	divprod				; product fits in 32 bits
divbig:	dec	cx				; trial quotient was too large
	mov	ax,cx
	jmp	divmul
; Now cx = trial quotient, (dx,ax) = cx * denominator.
divprod: not	dx
	neg	ax
	cmc
	adc	dx,0				; double-precision neg
	jc	divz				; zero quotient
						; requires special handling
; (dx,ax) = numerator - product; a carry means it did not go negative.
	add	ax,[bp+nlo]
	adc	dx,[bp+nhi]
	jc	divx
; Quotient is too large, adjust it.
div3:	dec	cx
	add	ax,[bp+dlo]
	adc	dx,[bp+dhi]
	jnc	div3
; All done.  (dx,ax) = remainder, cx = lo quotient.
divx:	pop	bx				; result selector
	pop	bp
	jmp	cs:xx3[bx]
	even
xx3	dw	offset divx3
	dw	offset modx3
	dw	offset divx3neg
	dw	offset modx3neg
divx3:	mov	ax,cx
	xor	dx,dx
modx3:	ret	8
divx3neg: mov	ax,cx
	xor	dx,dx
modx3neg: jmp	rneg
; Handle zero quotient specially.
; (dx,ax) is 0 here, which is already the quotient; the remainder is
; the numerator itself, re-negated on the stack for omodneg.
; bp is still needed to reach the numerator, so it is popped later.
divz:	pop	bx				; result selector
	jmp	cs:xxz[bx]
	even
xxz	dw	offset divxz
	dw	offset modxz
	dw	offset divxz
	dw	offset modxzneg
divxz:	pop	bp
	ret	8
modxzneg: negbp	nlo
modxz:	mov	ax,[bp+nlo]
	mov	dx,[bp+nhi]
	pop	bp
	ret	8
LUDIV@	endp
F_LUDIV@ endp

	endif					; FOR80386

; Transpose an 8x8 bit matrix.  See gsmisc.c for the algorithm in C.
;
; Far routine taking, in stack order,
;	byte *inp (far), int line_size, byte *outp (far), int dist
; The eight source bytes are read from inp, inp+line_size, ...; the
; eight result bytes are written to outp, outp+dist, ...
; ds, si and di are preserved; ax, bx, cx and dx are overwritten.

; Offsets of the arguments from sp once ds, si and di have been pushed.
mfinp	equ	10				; byte *inp
mflsize	equ	14				; int line_size
mfoutp	equ	16				; byte *outp
mfdist	equ	20				; int dist

; rdrow dst -- step si on by di and load the byte found there.
rdrow	macro	dst
	add	si,di
	mov	dst,[si]
	endm

; wrrow src -- step si on by di and store a byte there.
wrrow	macro	src
	add	si,di
	mov	[si],src
	endm

; xbits ra,rb,cnt,msk -- exchange the bits of rb selected by msk with
; the bits of ra lying cnt places higher.  si is used as scratch.
; See the C code for an explanation.
xbits	macro	ra,rb,cnt,msk
	mov	si,ra
	shr	si,cnt
	xor	si,rb
	and	si,msk
	xor	rb,si
	shl	si,cnt
	xor	ra,si
	endm

	PUBLIC	_memflip8x8
_memflip8x8 proc far
	push	ds
	push	si
	push	di
	mov	si,sp
	mov	di,ss:[si+mflsize]		; di = input row stride
	lds	si,ss:[si+mfinp]		; ds:si = first input row
		; Call the eight input rows A..H.  They are loaded as
		; ax = AB, cx = CD, bx = EF, dx = GH
		; and regrouped below into
		; ax = AE, bx = BF, cx (or di) = CG, dx = DH.
	mov	ah,[si]
	rdrow	al
	rdrow	ch
	rdrow	cl
	rdrow	bh
	rdrow	bl
	rdrow	dh
	rdrow	dl
		; 4x4 stage.  cl has to hold the shift count,
		; so CD waits in di meanwhile.
	mov	di,cx
	mov	cl,4
	xbits	bx,ax,cl,0f0fh
	xbits	dx,di,cl,0f0fh
		; Regroup: exchange B with E and D with G.
	xchg	al,bh
	mov	cx,di
	xchg	cl,dh
		; 2x2 stage, with CG parked in di this time.
	mov	di,cx
	mov	cl,2
	xbits	di,ax,cl,3333h
	xbits	dx,bx,cl,3333h
	mov	cx,di				; no more counts above 1
		; 1x1 stage.
	xbits	bx,ax,1,5555h
	xbits	dx,cx,1,5555h
		; Write rows A..H to the output.
	mov	si,sp
	mov	di,ss:[si+mfdist]		; di = output row stride
	lds	si,ss:[si+mfoutp]		; ds:si = first output row
	mov	[si],ah
	wrrow	bh
	wrrow	ch
	wrrow	dh
	wrrow	al
	wrrow	bl
	wrrow	cl
	wrrow	dl
		; Restore the saved registers and return.
	pop	di
	pop	si
	pop	ds
	ret
_memflip8x8 ENDP


utilasm_TEXT ENDS
	END
