-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathasm_vecDiv_sse.s
70 lines (52 loc) · 1.05 KB
/
asm_vecDiv_sse.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
// +build sse
// +build amd64
#include "textflag.h"
// func divAsm(a, b []float32)
//
// divAsm performs an in-place element-wise division: a[i] = a[i] / b[i]
// for every i in [0, len(a)).
//
// Only len(a) is consulted; len(b) is never read. The caller must ensure
// len(b) >= len(a), otherwise memory past the end of b is read.
//
// Register roles:
//   SI     cursor into a (read and written)
//   DI     cursor into b (read only)
//   AX     element counter (biased by -16 while in the unrolled loop)
//   X0-X7  scratch
//
// Packed loads/stores use MOVUPS rather than MOVAPS: MOVAPS faults on
// addresses that are not 16-byte aligned, and a []float32 (for example a
// sub-slice such as a[1:]) is only guaranteed 4-byte alignment.
TEXT ·divAsm(SB), NOSPLIT, $0-48
	MOVQ a_base+0(FP), SI  // SI = &a[0]
	MOVQ b_base+24(FP), DI // DI = &b[0]
	MOVQ a_len+8(FP), AX   // AX = len(a)

	// Bias the counter by one full unrolled iteration (16 elements).
	// A negative result means fewer than 16 elements are available.
	SUBQ $16, AX
	JL   remainder

loop:
	// 16 elements per iteration: 4 XMM pairs, 4 float32 lanes each.
	MOVUPS (SI), X0
	MOVUPS (DI), X1
	DIVPS  X1, X0
	MOVUPS X0, (SI)

	MOVUPS 16(SI), X2
	MOVUPS 16(DI), X3
	DIVPS  X3, X2
	MOVUPS X2, 16(SI)

	MOVUPS 32(SI), X4
	MOVUPS 32(DI), X5
	DIVPS  X5, X4
	MOVUPS X4, 32(SI)

	MOVUPS 48(SI), X6
	MOVUPS 48(DI), X7
	DIVPS  X7, X6
	MOVUPS X6, 48(SI)

	// Advance both cursors: 16 elements * 4 bytes per element.
	ADDQ $64, SI
	ADDQ $64, DI

	// 16 fewer elements remain; keep going while a full batch is left.
	SUBQ $16, AX
	JGE  loop

remainder:
	// Undo the bias: AX is now the number of leftover elements (0..15).
	ADDQ $16, AX
	JE   done

remainderloop:
	// Scalar tail: one float32 at a time.
	MOVSS (SI), X0
	MOVSS (DI), X1
	DIVSS X1, X0
	MOVSS X0, (SI)

	// Step both cursors to the next element.
	ADDQ $4, SI
	ADDQ $4, DI

	DECQ AX
	JNE  remainderloop

done:
	RET