MUDA
Loading...
Searching...
No Matches
svd_impl.h
1#pragma once
2#include <cuda.h>
3
4namespace muda::details::eigen
5{
6
// Bit-level view of a 32-bit float, used by svd3x3 below to implement
// branch-free selects and sign flips: masks are built with AND/OR/XOR on
// the .ui member and the result is read back through the .f member.
// NOTE(review): reading the member that was not last written is type
// punning; nvcc/CUDA support this union idiom, though strict ISO C++
// leaves it undefined — confirm if this header must also build as pure C++.
union un
{
    float f;
    unsigned int ui;
};
12
/**
 * Branch-free 3x3 singular value decomposition: A = U * S * V^T.
 *
 * Implementation of McAdams et al., "Computing the Singular Value
 * Decomposition of 3x3 matrices with minimal branching and elementary
 * floating point operations" (University of Wisconsin tech report, 2011).
 * Every data-dependent choice is made with bit masks on the `un`
 * float/uint union instead of branches, so all threads of a warp follow
 * the same path regardless of input.
 *
 * Algorithm outline:
 *   1. Form the normal-equations matrix S = A^T * A.
 *   2. Run 4 fixed sweeps of cyclic Jacobi rotations on S, accumulating
 *      the total rotation as a quaternion (Sqvs, Sqvvx, Sqvvy, Sqvvz).
 *   3. Normalize the quaternion and expand it into the matrix V.
 *   4. Form B = A * V, then sort the columns of B (and V) by decreasing
 *      squared column norm, flipping a column sign after each swap so V
 *      remains a rotation.
 *   5. QR-factorize B with three Givens rotations; the accumulated Q is U
 *      and the diagonal of R gives the singular values.
 *
 * @param a11..a33 input matrix A; a_ij is the (i, j) entry.
 * @param u11..u33 output rotation U.
 * @param s11,s22,s33 output diagonal of S, sorted by decreasing magnitude.
 *        NOTE(review): as in the reference algorithm, the last entry can
 *        come out negative when det(A) < 0 — confirm callers expect that
 *        convention rather than strictly non-negative singular values.
 * @param v11..v33 output rotation V.
 *
 * Fixes vs. the imported reference listing:
 *   - `0.5` double literal in the second QR Givens rotation changed to
 *     `0.5f` (all parallel passages already use `0.5f`); avoids a silent
 *     float->double->float round-trip on the device.
 *   - removed the unused local constant `gone_half`.
 */
__device__ __forceinline__ void svd3x3(float a11,
                                       float a12,
                                       float a13,
                                       float a21,
                                       float a22,
                                       float a23,
                                       float a31,
                                       float a32,
                                       float a33, // input A
                                       float& u11,
                                       float& u12,
                                       float& u13,
                                       float& u21,
                                       float& u22,
                                       float& u23,
                                       float& u31,
                                       float& u32,
                                       float& u33, // output U
                                       float& s11,
                                       //float &s12, float &s13, float &s21,
                                       float& s22,
                                       //float &s23, float &s31, float &s32,
                                       float& s33, // output S
                                       float& v11,
                                       float& v12,
                                       float& v13,
                                       float& v21,
                                       float& v22,
                                       float& v23,
                                       float& v31,
                                       float& v32,
                                       float& v33 // output V
)
{
    // IEEE-754 bit pattern of 1.0f (0x3F800000), used to OR-in a literal
    // 1.0f through the integer view of the union.
    constexpr auto gone = 1065353216;
    // Bit patterns of sin(pi/8) and cos(pi/8): the clamped Jacobi rotation
    // used when the unclamped angle would exceed pi/4 (see the paper).
    constexpr auto gsine_pi_over_eight = 1053028117;
    constexpr auto gcosine_pi_over_eight = 1064076127;
    constexpr float gsmall_number = 1.e-12;
    constexpr float gtiny_number = 1.e-20;
    // 4*gamma^2 = (1 + sqrt(2))^2 threshold from the paper's rotation clamp.
    constexpr float gfour_gamma_squared = 5.8284273147583007813;

    // Working copies of A, U, V and of the symmetric matrix S = A^T A.
    un Sa11, Sa21, Sa31, Sa12, Sa22, Sa32, Sa13, Sa23, Sa33;
    un Su11, Su21, Su31, Su12, Su22, Su32, Su13, Su23, Su33;
    un Sv11, Sv21, Sv31, Sv12, Sv22, Sv32, Sv13, Sv23, Sv33;
    un Sc, Ss, Sch, Ssh;                 // Givens cos/sin and half-angle cosh/sinh
    un Stmp1, Stmp2, Stmp3, Stmp4, Stmp5;
    un Ss11, Ss21, Ss31, Ss22, Ss32, Ss33;
    un Sqvs, Sqvvx, Sqvvy, Sqvvz;        // accumulated rotation quaternion -> V

    Sa11.f = a11;
    Sa12.f = a12;
    Sa13.f = a13;
    Sa21.f = a21;
    Sa22.f = a22;
    Sa23.f = a23;
    Sa31.f = a31;
    Sa32.f = a32;
    Sa33.f = a33;

    //###########################################################
    // Compute normal equations matrix S = A^T A (symmetric,
    // only the lower triangle is stored/updated)
    //###########################################################

    Ss11.f = Sa11.f * Sa11.f;
    Stmp1.f = Sa21.f * Sa21.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);
    Stmp1.f = Sa31.f * Sa31.f;
    Ss11.f = __fadd_rn(Stmp1.f, Ss11.f);

    Ss21.f = Sa12.f * Sa11.f;
    Stmp1.f = Sa22.f * Sa21.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);
    Stmp1.f = Sa32.f * Sa31.f;
    Ss21.f = __fadd_rn(Stmp1.f, Ss21.f);

    Ss31.f = Sa13.f * Sa11.f;
    Stmp1.f = Sa23.f * Sa21.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);
    Stmp1.f = Sa33.f * Sa31.f;
    Ss31.f = __fadd_rn(Stmp1.f, Ss31.f);

    Ss22.f = Sa12.f * Sa12.f;
    Stmp1.f = Sa22.f * Sa22.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);
    Stmp1.f = Sa32.f * Sa32.f;
    Ss22.f = __fadd_rn(Stmp1.f, Ss22.f);

    Ss32.f = Sa13.f * Sa12.f;
    Stmp1.f = Sa23.f * Sa22.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);
    Stmp1.f = Sa33.f * Sa32.f;
    Ss32.f = __fadd_rn(Stmp1.f, Ss32.f);

    Ss33.f = Sa13.f * Sa13.f;
    Stmp1.f = Sa23.f * Sa23.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);
    Stmp1.f = Sa33.f * Sa33.f;
    Ss33.f = __fadd_rn(Stmp1.f, Ss33.f);

    // Identity quaternion: accumulates V's rotation across the sweeps.
    Sqvs.f = 1.f;
    Sqvvx.f = 0.f;
    Sqvvy.f = 0.f;
    Sqvvz.f = 0.f;

    //###########################################################
    // Solve symmetric eigenproblem using Jacobi iteration:
    // 4 fixed sweeps, each sweep handling the (1,2), (2,3) and
    // (3,1) off-diagonal pairs in turn.
    //###########################################################
    for(int i = 0; i < 4; i++)
    {
        // --- rotation annihilating s21 ---
        Ssh.f = Ss21.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss11.f, Ss22.f);

        // If the off-diagonal term is negligible, select (sh, ch) = (0, 1)
        // via the mask instead of branching.
        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        // Normalize (sh, ch).
        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        // Clamp the rotation angle to pi/8 when it would exceed pi/4.
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        // Full-angle cosine/sine from the half-angle pair.
        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif
        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        // Stmp3 = sh^2 + ch^2 (~1); the diagonal entry is deliberately
        // scaled twice, matching the reference implementation.
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss33.f = Ss33.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss33.f = Ss33.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss31.f;
        Stmp2.f = Ss.f * Ss32.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss31.f = __fadd_rn(Stmp2.f, Ss31.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss22.f * Stmp2.f;
        Stmp3.f = Ss11.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp1.f);
        Ss22.f = __fadd_rn(Ss22.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss21.f, Ss21.f);
        Ss21.f = Ss21.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss11.f = __fadd_rn(Ss11.f, Stmp2.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp5.f);
        Ss22.f = __fsub_rn(Ss22.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        // This (1,2) rotation multiplies the quaternion by (ch, 0, 0, sh).
        Sqvvz.f = __fadd_rn(Sqvvz.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp3.f);
        Sqvvx.f = __fadd_rn(Sqvvx.f, Stmp2.f);
        Sqvvy.f = __fsub_rn(Sqvvy.f, Stmp1.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f, Sqvs.f);
#endif

        // --- rotation annihilating s32 (cyclic pair 2,3) ---
        Ssh.f = Ss32.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss22.f, Ss33.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss11.f = Ss11.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss31.f = Ss31.f * Stmp3.f;
        Ss11.f = Ss11.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss21.f;
        Stmp2.f = Ss.f * Ss31.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss31.f = Sc.f * Ss31.f;
        Ss21.f = __fadd_rn(Stmp2.f, Ss21.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss33.f * Stmp2.f;
        Stmp3.f = Ss22.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss22.f = Ss22.f * Stmp4.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp1.f);
        Ss33.f = __fadd_rn(Ss33.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss32.f, Ss32.f);
        Ss32.f = Ss32.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss22.f = __fadd_rn(Ss22.f, Stmp2.f);
        Ss32.f = __fsub_rn(Ss32.f, Stmp5.f);
        Ss33.f = __fsub_rn(Ss33.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        // This (2,3) rotation multiplies the quaternion by (ch, sh, 0, 0).
        Sqvvx.f = __fadd_rn(Sqvvx.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp1.f);
        Sqvvy.f = __fadd_rn(Sqvvy.f, Stmp3.f);
        Sqvvz.f = __fsub_rn(Sqvvz.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU q %.20g %.20g %.20g %.20g\n", Sqvvx.f, Sqvvy.f, Sqvvz.f, Sqvs.f);
#endif
#if 1
        // --- rotation annihilating s31 (cyclic pair 3,1) ---

        Ssh.f = Ss31.f * 0.5f;
        Stmp5.f = __fsub_rn(Ss33.f, Ss11.f);

        Stmp2.f = Ssh.f * Ssh.f;
        Stmp1.ui = (Stmp2.f >= gtiny_number) ? 0xffffffff : 0;
        Ssh.ui = Stmp1.ui & Ssh.ui;
        Sch.ui = Stmp1.ui & Stmp5.ui;
        Stmp2.ui = ~Stmp1.ui & gone;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Stmp4.f = __frsqrt_rn(Stmp3.f);

        Ssh.f = Stmp4.f * Ssh.f;
        Sch.f = Stmp4.f * Sch.f;
        Stmp1.f = gfour_gamma_squared * Stmp1.f;
        Stmp1.ui = (Stmp2.f <= Stmp1.f) ? 0xffffffff : 0;

        Stmp2.ui = gsine_pi_over_eight & Stmp1.ui;
        Ssh.ui = ~Stmp1.ui & Ssh.ui;
        Ssh.ui = Ssh.ui | Stmp2.ui;
        Stmp2.ui = gcosine_pi_over_eight & Stmp1.ui;
        Sch.ui = ~Stmp1.ui & Sch.ui;
        Sch.ui = Sch.ui | Stmp2.ui;

        Stmp1.f = Ssh.f * Ssh.f;
        Stmp2.f = Sch.f * Sch.f;
        Sc.f = __fsub_rn(Stmp2.f, Stmp1.f);
        Ss.f = Sch.f * Ssh.f;
        Ss.f = __fadd_rn(Ss.f, Ss.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("GPU s %.20g, c %.20g, sh %.20g, ch %.20g\n", Ss.f, Sc.f, Ssh.f, Sch.f);
#endif

        //###########################################################
        // Perform the actual Givens conjugation
        //###########################################################

        Stmp3.f = __fadd_rn(Stmp1.f, Stmp2.f);
        Ss22.f = Ss22.f * Stmp3.f;
        Ss32.f = Ss32.f * Stmp3.f;
        Ss21.f = Ss21.f * Stmp3.f;
        Ss22.f = Ss22.f * Stmp3.f;

        Stmp1.f = Ss.f * Ss32.f;
        Stmp2.f = Ss.f * Ss21.f;
        Ss32.f = Sc.f * Ss32.f;
        Ss21.f = Sc.f * Ss21.f;
        Ss32.f = __fadd_rn(Stmp2.f, Ss32.f);
        Ss21.f = __fsub_rn(Ss21.f, Stmp1.f);

        Stmp2.f = Ss.f * Ss.f;
        Stmp1.f = Ss11.f * Stmp2.f;
        Stmp3.f = Ss33.f * Stmp2.f;
        Stmp4.f = Sc.f * Sc.f;
        Ss33.f = Ss33.f * Stmp4.f;
        Ss11.f = Ss11.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp1.f);
        Ss11.f = __fadd_rn(Ss11.f, Stmp3.f);
        Stmp4.f = __fsub_rn(Stmp4.f, Stmp2.f);
        Stmp2.f = __fadd_rn(Ss31.f, Ss31.f);
        Ss31.f = Ss31.f * Stmp4.f;
        Stmp4.f = Sc.f * Ss.f;
        Stmp2.f = Stmp2.f * Stmp4.f;
        Stmp5.f = Stmp5.f * Stmp4.f;
        Ss33.f = __fadd_rn(Ss33.f, Stmp2.f);
        Ss31.f = __fsub_rn(Ss31.f, Stmp5.f);
        Ss11.f = __fsub_rn(Ss11.f, Stmp2.f);

#ifdef DEBUG_JACOBI_CONJUGATE
        printf("%.20g\n", Ss11.f);
        printf("%.20g %.20g\n", Ss21.f, Ss22.f);
        printf("%.20g %.20g %.20g\n", Ss31.f, Ss32.f, Ss33.f);
#endif

        //###########################################################
        // Compute the cumulative rotation, in quaternion form
        //###########################################################

        Stmp1.f = Ssh.f * Sqvvx.f;
        Stmp2.f = Ssh.f * Sqvvy.f;
        Stmp3.f = Ssh.f * Sqvvz.f;
        Ssh.f = Ssh.f * Sqvs.f;

        Sqvs.f = Sch.f * Sqvs.f;
        Sqvvx.f = Sch.f * Sqvvx.f;
        Sqvvy.f = Sch.f * Sqvvy.f;
        Sqvvz.f = Sch.f * Sqvvz.f;

        // This (3,1) rotation multiplies the quaternion by (ch, 0, sh, 0).
        Sqvvy.f = __fadd_rn(Sqvvy.f, Ssh.f);
        Sqvs.f = __fsub_rn(Sqvs.f, Stmp2.f);
        Sqvvz.f = __fadd_rn(Sqvvz.f, Stmp1.f);
        Sqvvx.f = __fsub_rn(Sqvvx.f, Stmp3.f);
#endif
    }

    //###########################################################
    // Normalize quaternion for matrix V
    //###########################################################

    Stmp2.f = Sqvs.f * Sqvs.f;
    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvy.f * Sqvvy.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = Sqvvz.f * Sqvvz.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);

    // One Newton-Raphson step refines the hardware rsqrt estimate.
    Stmp1.f = __frsqrt_rn(Stmp2.f);
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sqvs.f = Sqvs.f * Stmp1.f;
    Sqvvx.f = Sqvvx.f * Stmp1.f;
    Sqvvy.f = Sqvvy.f * Stmp1.f;
    Sqvvz.f = Sqvvz.f * Stmp1.f;

    //###########################################################
    // Transform quaternion to matrix V
    //###########################################################

    Stmp1.f = Sqvvx.f * Sqvvx.f;
    Stmp2.f = Sqvvy.f * Sqvvy.f;
    Stmp3.f = Sqvvz.f * Sqvvz.f;
    Sv11.f = Sqvs.f * Sqvs.f;
    Sv22.f = __fsub_rn(Sv11.f, Stmp1.f);
    Sv33.f = __fsub_rn(Sv22.f, Stmp2.f);
    Sv33.f = __fadd_rn(Sv33.f, Stmp3.f);
    Sv22.f = __fadd_rn(Sv22.f, Stmp2.f);
    Sv22.f = __fsub_rn(Sv22.f, Stmp3.f);
    Sv11.f = __fadd_rn(Sv11.f, Stmp1.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp2.f);
    Sv11.f = __fsub_rn(Sv11.f, Stmp3.f);
    Stmp1.f = __fadd_rn(Sqvvx.f, Sqvvx.f);
    Stmp2.f = __fadd_rn(Sqvvy.f, Sqvvy.f);
    Stmp3.f = __fadd_rn(Sqvvz.f, Sqvvz.f);
    Sv32.f = Sqvs.f * Stmp1.f;
    Sv13.f = Sqvs.f * Stmp2.f;
    Sv21.f = Sqvs.f * Stmp3.f;
    Stmp1.f = Sqvvy.f * Stmp1.f;
    Stmp2.f = Sqvvz.f * Stmp2.f;
    Stmp3.f = Sqvvx.f * Stmp3.f;
    Sv12.f = __fsub_rn(Stmp1.f, Sv21.f);
    Sv23.f = __fsub_rn(Stmp2.f, Sv32.f);
    Sv31.f = __fsub_rn(Stmp3.f, Sv13.f);
    Sv21.f = __fadd_rn(Stmp1.f, Sv21.f);
    Sv32.f = __fadd_rn(Stmp2.f, Sv32.f);
    Sv13.f = __fadd_rn(Stmp3.f, Sv13.f);

    //###########################################################
    // Multiply (from the right) with V: Sa <- A * V
    //###########################################################

    Stmp2.f = Sa12.f;
    Stmp3.f = Sa13.f;
    Sa12.f = Sv12.f * Sa11.f;
    Sa13.f = Sv13.f * Sa11.f;
    Sa11.f = Sv11.f * Sa11.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp1.f);

    Stmp2.f = Sa22.f;
    Stmp3.f = Sa23.f;
    Sa22.f = Sv12.f * Sa21.f;
    Sa23.f = Sv13.f * Sa21.f;
    Sa21.f = Sv11.f * Sa21.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp1.f);

    Stmp2.f = Sa32.f;
    Stmp3.f = Sa33.f;
    Sa32.f = Sv12.f * Sa31.f;
    Sa33.f = Sv13.f * Sa31.f;
    Sa31.f = Sv11.f * Sa31.f;
    Stmp1.f = Sv21.f * Stmp2.f;
    Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv31.f * Stmp3.f;
    Sa31.f = __fadd_rn(Sa31.f, Stmp1.f);
    Stmp1.f = Sv22.f * Stmp2.f;
    Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv32.f * Stmp3.f;
    Sa32.f = __fadd_rn(Sa32.f, Stmp1.f);
    Stmp1.f = Sv23.f * Stmp2.f;
    Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);
    Stmp1.f = Sv33.f * Stmp3.f;
    Sa33.f = __fadd_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Permute columns such that the singular values are sorted
    // (branch-free XOR-swap driven by comparison masks)
    //###########################################################

    Stmp1.f = Sa11.f * Sa11.f;
    Stmp4.f = Sa21.f * Sa21.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp4.f = Sa31.f * Sa31.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);

    Stmp2.f = Sa12.f * Sa12.f;
    Stmp4.f = Sa22.f * Sa22.f;
    Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);
    Stmp4.f = Sa32.f * Sa32.f;
    Stmp2.f = __fadd_rn(Stmp2.f, Stmp4.f);

    Stmp3.f = Sa13.f * Sa13.f;
    Stmp4.f = Sa23.f * Sa23.f;
    Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);
    Stmp4.f = Sa33.f * Sa33.f;
    Stmp3.f = __fadd_rn(Stmp3.f, Stmp4.f);

    // Swap columns 1-2 if necessary

    Stmp4.ui = (Stmp1.f < Stmp2.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv12.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv22.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv32.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp2.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;

    // If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f); // +1 if no swap, -1 if swapped

    Sa12.f = Sa12.f * Stmp4.f;
    Sa22.f = Sa22.f * Stmp4.f;
    Sa32.f = Sa32.f * Stmp4.f;

    Sv12.f = Sv12.f * Stmp4.f;
    Sv22.f = Sv22.f * Stmp4.f;
    Sv32.f = Sv32.f * Stmp4.f;

    // Swap columns 1-3 if necessary

    Stmp4.ui = (Stmp1.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa11.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa11.ui = Sa11.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa21.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa21.ui = Sa21.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa31.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa31.ui = Sa31.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv11.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv11.ui = Sv11.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv21.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv21.ui = Sv21.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv31.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv31.ui = Sv31.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp1.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp1.ui = Stmp1.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);

    Sa11.f = Sa11.f * Stmp4.f;
    Sa21.f = Sa21.f * Stmp4.f;
    Sa31.f = Sa31.f * Stmp4.f;

    Sv11.f = Sv11.f * Stmp4.f;
    Sv21.f = Sv21.f * Stmp4.f;
    Sv31.f = Sv31.f * Stmp4.f;

    // Swap columns 2-3 if necessary

    Stmp4.ui = (Stmp2.f < Stmp3.f) ? 0xffffffff : 0;
    Stmp5.ui = Sa12.ui ^ Sa13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa12.ui = Sa12.ui ^ Stmp5.ui;
    Sa13.ui = Sa13.ui ^ Stmp5.ui;

    Stmp5.ui = Sa22.ui ^ Sa23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa22.ui = Sa22.ui ^ Stmp5.ui;
    Sa23.ui = Sa23.ui ^ Stmp5.ui;

    Stmp5.ui = Sa32.ui ^ Sa33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sa32.ui = Sa32.ui ^ Stmp5.ui;
    Sa33.ui = Sa33.ui ^ Stmp5.ui;

    Stmp5.ui = Sv12.ui ^ Sv13.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv12.ui = Sv12.ui ^ Stmp5.ui;
    Sv13.ui = Sv13.ui ^ Stmp5.ui;

    Stmp5.ui = Sv22.ui ^ Sv23.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv22.ui = Sv22.ui ^ Stmp5.ui;
    Sv23.ui = Sv23.ui ^ Stmp5.ui;

    Stmp5.ui = Sv32.ui ^ Sv33.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Sv32.ui = Sv32.ui ^ Stmp5.ui;
    Sv33.ui = Sv33.ui ^ Stmp5.ui;

    Stmp5.ui = Stmp2.ui ^ Stmp3.ui;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp2.ui = Stmp2.ui ^ Stmp5.ui;
    Stmp3.ui = Stmp3.ui ^ Stmp5.ui;

    // If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation

    Stmp5.f = -2.f;
    Stmp5.ui = Stmp5.ui & Stmp4.ui;
    Stmp4.f = 1.f;
    Stmp4.f = __fadd_rn(Stmp4.f, Stmp5.f);

    Sa13.f = Sa13.f * Stmp4.f;
    Sa23.f = Sa23.f * Stmp4.f;
    Sa33.f = Sa33.f * Stmp4.f;

    Sv13.f = Sv13.f * Stmp4.f;
    Sv23.f = Sv23.f * Stmp4.f;
    Sv33.f = Sv33.f * Stmp4.f;

    //###########################################################
    // Construct QR factorization of A*V (=U*D) using Givens rotations
    //###########################################################

    Su11.f = 1.f;
    Su12.f = 0.f;
    Su13.f = 0.f;
    Su21.f = 0.f;
    Su22.f = 1.f;
    Su23.f = 0.f;
    Su31.f = 0.f;
    Su32.f = 0.f;
    Su33.f = 1.f;

    // First Givens rotation: zero out a21.
    Ssh.f = Sa21.f * Sa21.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa21.ui;

    // ch = max(|a11|, eps); remember a11's sign for the select below.
    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
    Sch.f = ::max(Sch.f, Sa11.f);
    Sch.f = ::max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    // Newton-Raphson refinement, then scale back to sqrt(ch^2 + sh^2).
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    // If a11 < 0, exchange (sh, ch) so the rotation has the right sign.
    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa21.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
    Sa21.f = __fsub_rn(Sa21.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa22.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
    Sa22.f = __fsub_rn(Sa22.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa23.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
    Sa23.f = __fsub_rn(Sa23.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su12.f;
    Su11.f = Sc.f * Su11.f;
    Su12.f = Sc.f * Su12.f;
    Su11.f = __fadd_rn(Su11.f, Stmp2.f);
    Su12.f = __fsub_rn(Su12.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su22.f;
    Su21.f = Sc.f * Su21.f;
    Su22.f = Sc.f * Su22.f;
    Su21.f = __fadd_rn(Su21.f, Stmp2.f);
    Su22.f = __fsub_rn(Su22.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su32.f;
    Su31.f = Sc.f * Su31.f;
    Su32.f = Sc.f * Su32.f;
    Su31.f = __fadd_rn(Su31.f, Stmp2.f);
    Su32.f = __fsub_rn(Su32.f, Stmp1.f);

    // Second Givens rotation: zero out a31.

    Ssh.f = Sa31.f * Sa31.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa31.ui;

    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa11.f);
    Sch.f = ::max(Sch.f, Sa11.f);
    Sch.f = ::max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa11.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    // FIX: was `0.5` (double literal) in the imported listing; all sibling
    // Newton-Raphson steps use the float literal.
    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa11.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa11.f = Sc.f * Sa11.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa11.f = __fadd_rn(Sa11.f, Stmp2.f);
    Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa12.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa12.f = Sc.f * Sa12.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa12.f = __fadd_rn(Sa12.f, Stmp2.f);
    Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa13.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa13.f = Sc.f * Sa13.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa13.f = __fadd_rn(Sa13.f, Stmp2.f);
    Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su11.f;
    Stmp2.f = Ss.f * Su13.f;
    Su11.f = Sc.f * Su11.f;
    Su13.f = Sc.f * Su13.f;
    Su11.f = __fadd_rn(Su11.f, Stmp2.f);
    Su13.f = __fsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su21.f;
    Stmp2.f = Ss.f * Su23.f;
    Su21.f = Sc.f * Su21.f;
    Su23.f = Sc.f * Su23.f;
    Su21.f = __fadd_rn(Su21.f, Stmp2.f);
    Su23.f = __fsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su31.f;
    Stmp2.f = Ss.f * Su33.f;
    Su31.f = Sc.f * Su31.f;
    Su33.f = Sc.f * Su33.f;
    Su31.f = __fadd_rn(Su31.f, Stmp2.f);
    Su33.f = __fsub_rn(Su33.f, Stmp1.f);

    // Third Givens rotation: zero out a32.

    Ssh.f = Sa32.f * Sa32.f;
    Ssh.ui = (Ssh.f >= gsmall_number) ? 0xffffffff : 0;
    Ssh.ui = Ssh.ui & Sa32.ui;

    Stmp5.f = 0.f;
    Sch.f = __fsub_rn(Stmp5.f, Sa22.f);
    Sch.f = ::max(Sch.f, Sa22.f);
    Sch.f = ::max(Sch.f, gsmall_number);
    Stmp5.ui = (Sa22.f >= Stmp5.f) ? 0xffffffff : 0;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);
    Stmp1.f = Stmp1.f * Stmp2.f;

    Sch.f = __fadd_rn(Sch.f, Stmp1.f);

    Stmp1.ui = ~Stmp5.ui & Ssh.ui;
    Stmp2.ui = ~Stmp5.ui & Sch.ui;
    Sch.ui = Stmp5.ui & Sch.ui;
    Ssh.ui = Stmp5.ui & Ssh.ui;
    Sch.ui = Sch.ui | Stmp1.ui;
    Ssh.ui = Ssh.ui | Stmp2.ui;

    Stmp1.f = Sch.f * Sch.f;
    Stmp2.f = Ssh.f * Ssh.f;
    Stmp2.f = __fadd_rn(Stmp1.f, Stmp2.f);
    Stmp1.f = __frsqrt_rn(Stmp2.f);

    Stmp4.f = Stmp1.f * 0.5f;
    Stmp3.f = Stmp1.f * Stmp4.f;
    Stmp3.f = Stmp1.f * Stmp3.f;
    Stmp3.f = Stmp2.f * Stmp3.f;
    Stmp1.f = __fadd_rn(Stmp1.f, Stmp4.f);
    Stmp1.f = __fsub_rn(Stmp1.f, Stmp3.f);

    Sch.f = Sch.f * Stmp1.f;
    Ssh.f = Ssh.f * Stmp1.f;

    Sc.f = Sch.f * Sch.f;
    Ss.f = Ssh.f * Ssh.f;
    Sc.f = __fsub_rn(Sc.f, Ss.f);
    Ss.f = Ssh.f * Sch.f;
    Ss.f = __fadd_rn(Ss.f, Ss.f);

    //###########################################################
    // Rotate matrix A
    //###########################################################

    Stmp1.f = Ss.f * Sa21.f;
    Stmp2.f = Ss.f * Sa31.f;
    Sa21.f = Sc.f * Sa21.f;
    Sa31.f = Sc.f * Sa31.f;
    Sa21.f = __fadd_rn(Sa21.f, Stmp2.f);
    Sa31.f = __fsub_rn(Sa31.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa22.f;
    Stmp2.f = Ss.f * Sa32.f;
    Sa22.f = Sc.f * Sa22.f;
    Sa32.f = Sc.f * Sa32.f;
    Sa22.f = __fadd_rn(Sa22.f, Stmp2.f);
    Sa32.f = __fsub_rn(Sa32.f, Stmp1.f);

    Stmp1.f = Ss.f * Sa23.f;
    Stmp2.f = Ss.f * Sa33.f;
    Sa23.f = Sc.f * Sa23.f;
    Sa33.f = Sc.f * Sa33.f;
    Sa23.f = __fadd_rn(Sa23.f, Stmp2.f);
    Sa33.f = __fsub_rn(Sa33.f, Stmp1.f);

    //###########################################################
    // Update matrix U
    //###########################################################

    Stmp1.f = Ss.f * Su12.f;
    Stmp2.f = Ss.f * Su13.f;
    Su12.f = Sc.f * Su12.f;
    Su13.f = Sc.f * Su13.f;
    Su12.f = __fadd_rn(Su12.f, Stmp2.f);
    Su13.f = __fsub_rn(Su13.f, Stmp1.f);

    Stmp1.f = Ss.f * Su22.f;
    Stmp2.f = Ss.f * Su23.f;
    Su22.f = Sc.f * Su22.f;
    Su23.f = Sc.f * Su23.f;
    Su22.f = __fadd_rn(Su22.f, Stmp2.f);
    Su23.f = __fsub_rn(Su23.f, Stmp1.f);

    Stmp1.f = Ss.f * Su32.f;
    Stmp2.f = Ss.f * Su33.f;
    Su32.f = Sc.f * Su32.f;
    Su33.f = Sc.f * Su33.f;
    Su32.f = __fadd_rn(Su32.f, Stmp2.f);
    Su33.f = __fsub_rn(Su33.f, Stmp1.f);

    // Write results back to the caller's references.
    v11 = Sv11.f;
    v12 = Sv12.f;
    v13 = Sv13.f;
    v21 = Sv21.f;
    v22 = Sv22.f;
    v23 = Sv23.f;
    v31 = Sv31.f;
    v32 = Sv32.f;
    v33 = Sv33.f;

    u11 = Su11.f;
    u12 = Su12.f;
    u13 = Su13.f;
    u21 = Su21.f;
    u22 = Su22.f;
    u23 = Su23.f;
    u31 = Su31.f;
    u32 = Su32.f;
    u33 = Su33.f;

    // After the QR step A holds R; its diagonal is the (sorted) S.
    s11 = Sa11.f;
    //s12 = Sa12.f; s13 = Sa13.f; s21 = Sa21.f;
    s22 = Sa22.f;
    //s23 = Sa23.f; s31 = Sa31.f; s32 = Sa32.f;
    s33 = Sa33.f;
}
1079} // namespace muda::details::eigen
Definition svd_impl.h:8