Matrix Inverse using SSE

I am trying to write a function to perform the inverse of a matrix NxN, using technology SSE. but do not know where to start, I managed to write only in the case of a 4x4 matrix .. how can I extend it to the case NxN?

This is the code:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
void inverse(float* src){
    __m128 minor0, minor1, minor2, minor3;
    __m128 row0,   row1,   row2,   row3;
    __m128 det,tmp1;
    
    tmp1= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
    row1= _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
    row0= _mm_shuffle_ps(tmp1, row1, 0x88);
    row1= _mm_shuffle_ps(row1, tmp1, 0xDD);
    tmp1= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
    row3= _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
    row2= _mm_shuffle_ps(tmp1, row3, 0x88);
    row3= _mm_shuffle_ps(row3, tmp1, 0xDD);
    
    tmp1= _mm_mul_ps(row2, row3);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0= _mm_mul_ps(row1, tmp1);
    minor1= _mm_mul_ps(row0, tmp1);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
    minor1= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
    minor1= _mm_shuffle_ps(minor1, minor1, 0x4E);
    
    tmp1= _mm_mul_ps(row1, row2);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor0= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
    minor3= _mm_mul_ps(row0, tmp1);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
    minor3= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
    minor3= _mm_shuffle_ps(minor3, minor3, 0x4E);
    
    tmp1= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    row2= _mm_shuffle_ps(row2, row2, 0x4E);
    minor0= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
    minor2= _mm_mul_ps(row0, tmp1);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor0= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
    minor2= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
    minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
    
    tmp1= _mm_mul_ps(row0, row1);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    
    minor2= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor2= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
    minor3= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
    
    tmp1= _mm_mul_ps(row0, row3);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
    minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
    minor2= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
    
    
    tmp1= _mm_mul_ps(row0, row2);
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
    minor1= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
    minor3= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
    tmp1= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
    minor1= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
    minor3= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
    
    det= _mm_mul_ps(row0, minor0);
    det= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
    det= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
    tmp1= _mm_rcp_ss(det);
    det= _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
    det= _mm_shuffle_ps(det, det, 0x00);
    
    minor0= _mm_mul_ps(det, minor0);
    _mm_storel_pi((__m64*)(src), minor0);
    _mm_storeh_pi((__m64*)(src+2), minor0);
    
    minor1 = _mm_mul_ps(det, minor1);
    _mm_storel_pi((__m64*)(src+4), minor1);
    _mm_storeh_pi((__m64*)(src+6), minor1);
    
    minor2 = _mm_mul_ps(det, minor2);
    _mm_storel_pi((__m64*)(src+ 8), minor2);
    _mm_storeh_pi((__m64*)(src+10), minor2);
    
    minor3 = _mm_mul_ps(det, minor3);
    _mm_storel_pi((__m64*)(src+12), minor3);
    _mm_storeh_pi((__m64*)(src+14), minor3);
}
Topic archived. No new replies allowed.