11#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
12#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
14#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
15#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
18#include "MatrixProductCommon.h"
22#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY)
24#define EIGEN_ALTIVEC_MMA_ONLY
26#define EIGEN_ALTIVEC_DISABLE_MMA
32#define EIGEN_ALTIVEC_DISABLE_MMA
36#if __has_builtin(__builtin_mma_assemble_acc)
37 #define ALTIVEC_MMA_SUPPORT
41#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
42 #include "MatrixProductMMA.h"
57template<
typename Scalar>
60 typedef typename packet_traits<Scalar>::type vectortype;
61 typedef PacketBlock<vectortype,4> type;
62 typedef vectortype rhstype;
65 vectorsize = packet_traits<Scalar>::size,
72struct quad_traits<double>
74 typedef Packet2d vectortype;
75 typedef PacketBlock<vectortype,4> type;
76 typedef PacketBlock<Packet2d,2> rhstype;
79 vectorsize = packet_traits<double>::size,
89const static Packet16uc p16uc_GETREAL32 = { 0, 1, 2, 3,
94const static Packet16uc p16uc_GETIMAG32 = { 4, 5, 6, 7,
98const static Packet16uc p16uc_GETREAL64 = { 0, 1, 2, 3, 4, 5, 6, 7,
99 16, 17, 18, 19, 20, 21, 22, 23};
102const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15,
103 24, 25, 26, 27, 28, 29, 30, 31};
123template<
typename Scalar,
typename Index,
int StorageOrder>
124EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt)
126 std::complex<Scalar> v;
129 v.real( dt(j,i).
real());
130 v.imag(-dt(j,i).
imag());
133 v.real( dt(i,j).
real());
134 v.imag( dt(i,j).
imag());
136 v.real( dt(i,j).
real());
// Packs a panel of a complex selfadjoint RHS into blockB, splitting the data
// into a real region (index rir) and an imaginary region (index rii,
// vectorDelta apart); off-triangle elements are fetched via getAdjointVal.
// NOTE(review): braces, per-row index increments and the scalar-tail loop
// header are missing from this extraction; code lines kept verbatim.
template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
143EIGEN_STRONG_INLINE
void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB,
const std::complex<Scalar>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
145 const Index depth = k2 + rows;
146 const_blas_data_mapper<std::complex<Scalar>,
Index, StorageOrder> rhs(_rhs, rhsStride);
147 const Index vectorSize = N*quad_traits<Scalar>::vectorsize;
148 const Index vectorDelta = vectorSize * rows;
// Reinterpret the complex block as a flat scalar buffer (real/imag split).
149 Scalar* blockBf =
reinterpret_cast<Scalar *
>(blockB);
151 Index rir = 0, rii, j = 0;
// Full vector-wide column blocks.
152 for(; j + vectorSize <= cols; j+=vectorSize)
154 rii = rir + vectorDelta;
156 for(Index i = k2; i < depth; i++)
158 for(Index k = 0; k < vectorSize; k++)
160 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, j + k, rhs);
162 blockBf[rir + k] = v.real();
163 blockBf[rii + k] = v.imag();
// Scalar tail for the remaining (cols - j) columns.
173 rii = rir + ((cols - j) * rows);
175 for(Index i = k2; i < depth; i++)
180 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(i, k, rhs);
182 blockBf[rir] = v.real();
183 blockBf[rii] = v.imag();
// Packs a panel of a complex selfadjoint LHS into blockA, with real parts at
// index rir and imaginary parts at rii (vectorDelta apart); note the
// transposed access (j+k, i) relative to the RHS helper.
// NOTE(review): braces, index increments and the scalar-tail loop header are
// missing from this extraction; code lines kept verbatim.
template<
typename Scalar,
typename Index,
int StorageOrder>
193EIGEN_STRONG_INLINE
void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA,
const std::complex<Scalar>* _lhs, Index lhsStride, Index cols, Index rows)
195 const Index depth = cols;
196 const_blas_data_mapper<std::complex<Scalar>,
Index, StorageOrder> lhs(_lhs, lhsStride);
197 const Index vectorSize = quad_traits<Scalar>::vectorsize;
198 const Index vectorDelta = vectorSize * depth;
// C-style cast to a flat scalar buffer (real/imag split).
199 Scalar* blockAf = (Scalar *)(blockA);
201 Index rir = 0, rii, j = 0;
// Full vector-wide row blocks.
202 for(; j + vectorSize <= rows; j+=vectorSize)
204 rii = rir + vectorDelta;
206 for(Index i = 0; i < depth; i++)
208 for(Index k = 0; k < vectorSize; k++)
210 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(j+k, i, lhs);
212 blockAf[rir + k] = v.real();
213 blockAf[rii + k] = v.imag();
// Scalar tail for the remaining (rows - j) rows.
224 rii = rir + ((rows - j) * depth);
226 for(Index i = 0; i < depth; i++)
231 std::complex<Scalar> v = getAdjointVal<Scalar, Index, StorageOrder>(k, i, lhs);
233 blockAf[rir] = v.real();
234 blockAf[rii] = v.imag();
// Packs a panel of a real selfadjoint RHS into blockB; the two rhs(...)
// accesses per store correspond to the RowMajor/ColMajor mirrored reads of
// the stored triangle.
// NOTE(review): braces, the if/else selecting between the two access orders,
// and the tail loop header are missing from this extraction; kept verbatim.
template<
typename Scalar,
typename Index,
int StorageOrder,
int N>
244EIGEN_STRONG_INLINE
void symm_pack_rhs_helper(Scalar* blockB,
const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
246 const Index depth = k2 + rows;
247 const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
248 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// N*vectorSize-wide column blocks.
251 for(; j + N*vectorSize <= cols; j+=N*vectorSize)
254 for(; i < depth; i++)
256 for(Index k = 0; k < N*vectorSize; k++)
259 blockB[ri + k] = rhs(j+k, i);
261 blockB[ri + k] = rhs(i, j+k);
// Scalar tail columns.
269 for(Index i = k2; i < depth; i++)
275 blockB[ri] = rhs(i, k);
277 blockB[ri] = rhs(k, i);
// Packs a panel of a real selfadjoint LHS into blockA; mirrored accesses to
// the stored triangle as in symm_pack_rhs_helper, transposed.
// NOTE(review): braces, the order-selecting if/else and tail loop header are
// missing from this extraction; code lines kept verbatim.
template<
typename Scalar,
typename Index,
int StorageOrder>
285EIGEN_STRONG_INLINE
void symm_pack_lhs_helper(Scalar* blockA,
const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
287 const Index depth = cols;
288 const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
289 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Vector-wide row blocks.
292 for(; j + vectorSize <= rows; j+=vectorSize)
296 for(; i < depth; i++)
298 for(Index k = 0; k < vectorSize; k++)
301 blockA[ri + k] = lhs(j+k, i);
303 blockA[ri + k] = lhs(i, j+k);
// Scalar tail rows.
311 for(Index i = 0; i < depth; i++)
317 blockA[ri] = lhs(k, i);
319 blockA[ri] = lhs(i, k);
326template<
typename Index,
int nr,
int StorageOrder>
327struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder>
329 void operator()(std::complex<float>* blockB,
const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
331 symm_pack_complex_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
335template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
336struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder>
338 void operator()(std::complex<float>* blockA,
const std::complex<float>* _lhs, Index lhsStride, Index cols, Index rows)
340 symm_pack_complex_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
346template<
typename Index,
int nr,
int StorageOrder>
347struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder>
349 void operator()(std::complex<double>* blockB,
const std::complex<double>* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
351 symm_pack_complex_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
355template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
356struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder>
358 void operator()(std::complex<double>* blockA,
const std::complex<double>* _lhs, Index lhsStride, Index cols, Index rows)
360 symm_pack_complex_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
365template<
typename Index,
int nr,
int StorageOrder>
366struct symm_pack_rhs<float, Index, nr, StorageOrder>
368 void operator()(
float* blockB,
const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
370 symm_pack_rhs_helper<float, Index, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
374template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
375struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder>
377 void operator()(
float* blockA,
const float* _lhs, Index lhsStride, Index cols, Index rows)
379 symm_pack_lhs_helper<float, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
384template<
typename Index,
int nr,
int StorageOrder>
385struct symm_pack_rhs<double, Index, nr, StorageOrder>
387 void operator()(
double* blockB,
const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
389 symm_pack_rhs_helper<double, Index, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
393template<
typename Index,
int Pack1,
int Pack2_dummy,
int StorageOrder>
394struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder>
396 void operator()(
double* blockA,
const double* _lhs, Index lhsStride, Index cols, Index rows)
398 symm_pack_lhs_helper<double, Index, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
413template<
typename Scalar,
typename Packet,
typename Index>
414EIGEN_ALWAYS_INLINE
void storeBlock(Scalar* to, PacketBlock<Packet,4>& block)
416 const Index size = 16 /
sizeof(Scalar);
417 pstore<Scalar>(to + (0 * size), block.packet[0]);
418 pstore<Scalar>(to + (1 * size), block.packet[1]);
419 pstore<Scalar>(to + (2 * size), block.packet[2]);
420 pstore<Scalar>(to + (3 * size), block.packet[3]);
423template<
typename Scalar,
typename Packet,
typename Index>
424EIGEN_ALWAYS_INLINE
void storeBlock(Scalar* to, PacketBlock<Packet,2>& block)
426 const Index size = 16 /
sizeof(Scalar);
427 pstore<Scalar>(to + (0 * size), block.packet[0]);
428 pstore<Scalar>(to + (1 * size), block.packet[1]);
// Generic complex LHS/RHS packing kernel (dhs_cpack): de-interleaves complex
// input into a real block (offset rir) and an imaginary block (offset rii,
// vectorDelta apart), conjugating the imaginary part when Conjugate is set.
// NOTE(review): the enclosing 'struct dhs_cpack' header, braces, and the
// if/else skeletons around the storage-order branches are missing from this
// extraction; code lines kept verbatim.
template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode,
bool UseLhs>
434 EIGEN_STRONG_INLINE
void operator()(std::complex<Scalar>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
436 const Index vectorSize = quad_traits<Scalar>::vectorsize;
437 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
438 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
// Flat scalar view of the complex destination block.
439 Scalar* blockAt =
reinterpret_cast<Scalar *
>(blockA);
// Vector-wide row blocks.
442 for(; j + vectorSize <= rows; j+=vectorSize)
446 rii = rir + vectorDelta;
448 for(; i + vectorSize <= depth; i+=vectorSize)
450 PacketBlock<Packet,4> blockr, blocki;
451 PacketBlock<PacketC,8> cblock;
454 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, j, i);
456 bload<DataMapper, PacketC, Index, 2, 0, StorageOrder>(cblock, lhs, i, j);
// Split interleaved complex data into real and imaginary packets.
459 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32);
460 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETREAL32);
461 blockr.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETREAL32);
462 blockr.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETREAL32);
464 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETIMAG32);
465 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, p16uc_GETIMAG32);
466 blocki.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, p16uc_GETIMAG32);
467 blocki.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, p16uc_GETIMAG32);
// Conjugate: negate the imaginary packets.
471 blocki.packet[0] = -blocki.packet[0];
472 blocki.packet[1] = -blocki.packet[1];
473 blocki.packet[2] = -blocki.packet[2];
474 blocki.packet[3] = -blocki.packet[3];
477 if(((StorageOrder ==
RowMajor) && UseLhs) || (((StorageOrder ==
ColMajor) && !UseLhs)))
483 storeBlock<Scalar, Packet, Index>(blockAt + rir, blockr);
484 storeBlock<Scalar, Packet, Index>(blockAt + rii, blocki);
// Depth remainder: one column/row at a time.
489 for(; i < depth; i++)
491 PacketBlock<Packet,1> blockr, blocki;
492 PacketBlock<PacketC,2> cblock;
494 if(((StorageOrder ==
ColMajor) && UseLhs) || (((StorageOrder ==
RowMajor) && !UseLhs)))
497 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
498 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 2, i);
500 cblock.packet[0] = lhs.template loadPacket<PacketC>(i, j + 0);
501 cblock.packet[1] = lhs.template loadPacket<PacketC>(i, j + 2);
// Strided access path: gather scalars pairwise into packets.
504 std::complex<Scalar> lhs0, lhs1;
506 lhs0 = lhs(j + 0, i);
507 lhs1 = lhs(j + 1, i);
508 cblock.packet[0] = pload2(&lhs0, &lhs1);
509 lhs0 = lhs(j + 2, i);
510 lhs1 = lhs(j + 3, i);
511 cblock.packet[1] = pload2(&lhs0, &lhs1);
513 lhs0 = lhs(i, j + 0);
514 lhs1 = lhs(i, j + 1);
515 cblock.packet[0] = pload2(&lhs0, &lhs1);
516 lhs0 = lhs(i, j + 2);
517 lhs1 = lhs(i, j + 3);
518 cblock.packet[1] = pload2(&lhs0, &lhs1);
522 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
523 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
527 blocki.packet[0] = -blocki.packet[0];
530 pstore<Scalar>(blockAt + rir, blockr.packet[0]);
531 pstore<Scalar>(blockAt + rii, blocki.packet[0]);
// Advance past this panel (imaginary region included).
537 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
// Row remainder: scalar packing of the last (rows - j) rows.
542 if(PanelMode) rir += (offset*(rows - j - vectorSize));
543 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
545 for(Index i = 0; i < depth; i++)
551 blockAt[rir] = lhs(k, i).real();
554 blockAt[rii] = -lhs(k, i).imag();
556 blockAt[rii] = lhs(k, i).imag();
558 blockAt[rir] = lhs(i, k).real();
561 blockAt[rii] = -lhs(i, k).imag();
563 blockAt[rii] = lhs(i, k).imag();
// Generic real LHS/RHS packing kernel (dhs_pack): copies vector-wide blocks
// via bload/storeBlock and falls back to scalar copies for the remainders.
// NOTE(review): the enclosing 'struct dhs_pack' header, braces and the
// if/else skeletons around the storage-order branches are missing from this
// extraction; code lines kept verbatim.
template<
typename Scalar,
typename Index,
typename DataMapper,
typename Packet,
int StorageOrder,
bool PanelMode,
bool UseLhs>
577 EIGEN_STRONG_INLINE
void operator()(Scalar* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
579 const Index vectorSize = quad_traits<Scalar>::vectorsize;
// Vector-wide row blocks.
582 for(; j + vectorSize <= rows; j+=vectorSize)
586 if(PanelMode) ri += vectorSize*offset;
588 for(; i + vectorSize <= depth; i+=vectorSize)
590 PacketBlock<Packet,4> block;
593 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, j, i);
595 bload<DataMapper, Packet, Index, 4, 0, StorageOrder>(block, lhs, i, j);
597 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
602 storeBlock<Scalar, Packet, Index>(blockA + ri, block);
// Depth remainder.
606 for(; i < depth; i++)
608 if(((StorageOrder ==
RowMajor) && UseLhs) || ((StorageOrder ==
ColMajor) && !UseLhs))
611 blockA[ri+0] = lhs(j+0, i);
612 blockA[ri+1] = lhs(j+1, i);
613 blockA[ri+2] = lhs(j+2, i);
614 blockA[ri+3] = lhs(j+3, i);
616 blockA[ri+0] = lhs(i, j+0);
617 blockA[ri+1] = lhs(i, j+1);
618 blockA[ri+2] = lhs(i, j+2);
619 blockA[ri+3] = lhs(i, j+3);
624 lhsV = lhs.template loadPacket<Packet>(j, i);
626 lhsV = lhs.template loadPacket<Packet>(i, j);
628 pstore<Scalar>(blockA + ri, lhsV);
634 if(PanelMode) ri += vectorSize*(stride - offset - depth);
// Row remainder: scalar packing of the last (rows - j) rows.
639 if(PanelMode) ri += offset*(rows - j);
641 for(Index i = 0; i < depth; i++)
647 blockA[ri] = lhs(k, i);
649 blockA[ri] = lhs(i, k);
// dhs_pack specialization: packs a real double LHS with Packet2d (2 doubles
// per packet); the two load orders correspond to Row/ColMajor input.
// NOTE(review): braces and the storage-order if/else skeleton are missing
// from this extraction; code lines kept verbatim.
template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
660struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, true>
662 EIGEN_STRONG_INLINE
void operator()(
double* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
664 const Index vectorSize = quad_traits<double>::vectorsize;
// Vector-wide (2-row) blocks.
667 for(; j + vectorSize <= rows; j+=vectorSize)
671 if(PanelMode) ri += vectorSize*offset;
673 for(; i + vectorSize <= depth; i+=vectorSize)
675 PacketBlock<Packet2d,2> block;
678 block.packet[0] = lhs.template loadPacket<Packet2d>(j + 0, i);
679 block.packet[1] = lhs.template loadPacket<Packet2d>(j + 1, i);
683 block.packet[0] = lhs.template loadPacket<Packet2d>(j, i + 0);
684 block.packet[1] = lhs.template loadPacket<Packet2d>(j, i + 1);
687 storeBlock<double, Packet2d, Index>(blockA + ri, block);
// Depth remainder.
691 for(; i < depth; i++)
695 blockA[ri+0] = lhs(j+0, i);
696 blockA[ri+1] = lhs(j+1, i);
698 Packet2d lhsV = lhs.template loadPacket<Packet2d>(j, i);
699 pstore<double>(blockA + ri, lhsV);
705 if(PanelMode) ri += vectorSize*(stride - offset - depth);
// Row remainder: scalar packing.
710 if(PanelMode) ri += offset*(rows - j);
712 for(Index i = 0; i < depth; i++)
717 blockA[ri] = lhs(k, i);
// dhs_pack specialization: packs a real double RHS two column-pairs at a time
// (2*vectorSize = 4 columns); the ColMajor path transposes via four pstores,
// the RowMajor path loads a 4-packet block directly.
// NOTE(review): braces and the storage-order if/else skeleton are missing
// from this extraction; code lines kept verbatim.
template<
typename Index,
typename DataMapper,
int StorageOrder,
bool PanelMode>
727struct dhs_pack<double, Index, DataMapper, Packet2d, StorageOrder, PanelMode, false>
729 EIGEN_STRONG_INLINE
void operator()(
double* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
731 const Index vectorSize = quad_traits<double>::vectorsize;
// 4-column blocks.
734 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
738 if(PanelMode) ri += offset*(2*vectorSize);
740 for(; i + vectorSize <= depth; i+=vectorSize)
742 PacketBlock<Packet2d,4> block;
745 PacketBlock<Packet2d,2> block1, block2;
746 block1.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 0);
747 block1.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 1);
748 block2.packet[0] = rhs.template loadPacket<Packet2d>(i, j + 2);
749 block2.packet[1] = rhs.template loadPacket<Packet2d>(i, j + 3);
// Interleaved stores implement the in-register transpose.
754 pstore<double>(blockB + ri , block1.packet[0]);
755 pstore<double>(blockB + ri + 2, block2.packet[0]);
756 pstore<double>(blockB + ri + 4, block1.packet[1]);
757 pstore<double>(blockB + ri + 6, block2.packet[1]);
759 block.packet[0] = rhs.template loadPacket<Packet2d>(i + 0, j + 0);
760 block.packet[1] = rhs.template loadPacket<Packet2d>(i + 0, j + 2);
761 block.packet[2] = rhs.template loadPacket<Packet2d>(i + 1, j + 0);
762 block.packet[3] = rhs.template loadPacket<Packet2d>(i + 1, j + 2);
764 storeBlock<double, Packet2d, Index>(blockB + ri, block);
// Depth remainder.
769 for(; i < depth; i++)
773 blockB[ri+0] = rhs(i, j+0);
774 blockB[ri+1] = rhs(i, j+1);
778 blockB[ri+0] = rhs(i, j+2);
779 blockB[ri+1] = rhs(i, j+3);
781 Packet2d rhsV = rhs.template loadPacket<Packet2d>(i, j);
782 pstore<double>(blockB + ri, rhsV);
786 rhsV = rhs.template loadPacket<Packet2d>(i, j + 2);
787 pstore<double>(blockB + ri, rhsV);
792 if(PanelMode) ri += (2*vectorSize)*(stride - offset - depth);
// Column remainder: scalar packing.
797 if(PanelMode) ri += offset*(cols - j);
799 for(Index i = 0; i < depth; i++)
804 blockB[ri] = rhs(i, k);
// dhs_cpack specialization: packs a complex double LHS, splitting real and
// imaginary parts (vec_perm with the 64-bit masks) into two regions
// vectorDelta apart, negating the imaginary part when Conjugate is set.
// NOTE(review): braces and the storage-order if/else skeletons are missing
// from this extraction; code lines kept verbatim.
template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
814struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true>
816 EIGEN_STRONG_INLINE
void operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
818 const Index vectorSize = quad_traits<double>::vectorsize;
819 const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
820 Index rir = ((PanelMode) ? (vectorSize*offset) : 0), rii;
// Flat scalar view of the complex destination block.
821 double* blockAt =
reinterpret_cast<double *
>(blockA);
// Vector-wide (2-row) blocks.
824 for(; j + vectorSize <= rows; j+=vectorSize)
828 rii = rir + vectorDelta;
830 for(; i + vectorSize <= depth; i+=vectorSize)
832 PacketBlock<Packet,2> blockr, blocki;
833 PacketBlock<PacketC,4> cblock;
837 cblock.packet[0] = lhs.template loadPacket<PacketC>(j, i + 0);
838 cblock.packet[1] = lhs.template loadPacket<PacketC>(j, i + 1);
840 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 1, i + 0);
841 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
843 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETREAL64);
844 blockr.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETREAL64);
846 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[2].v, p16uc_GETIMAG64);
847 blocki.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[3].v, p16uc_GETIMAG64);
849 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
850 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
852 cblock.packet[2] = lhs.template loadPacket<PacketC>(j + 0, i + 1);
853 cblock.packet[3] = lhs.template loadPacket<PacketC>(j + 1, i + 1);
855 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
856 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
858 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
859 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
// Conjugate: negate the imaginary packets.
864 blocki.packet[0] = -blocki.packet[0];
865 blocki.packet[1] = -blocki.packet[1];
868 storeBlock<double, Packet, Index>(blockAt + rir, blockr);
869 storeBlock<double, Packet, Index>(blockAt + rii, blocki);
// Depth remainder.
874 for(; i < depth; i++)
876 PacketBlock<Packet,1> blockr, blocki;
877 PacketBlock<PacketC,2> cblock;
879 cblock.packet[0] = lhs.template loadPacket<PacketC>(j + 0, i);
880 cblock.packet[1] = lhs.template loadPacket<PacketC>(j + 1, i);
882 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
883 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
887 blocki.packet[0] = -blocki.packet[0];
890 pstore<double>(blockAt + rir, blockr.packet[0]);
891 pstore<double>(blockAt + rii, blocki.packet[0]);
897 rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta);
// Row remainder: scalar packing.
902 if(PanelMode) rir += (offset*(rows - j - vectorSize));
903 rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
905 for(Index i = 0; i < depth; i++)
910 blockAt[rir] = lhs(k, i).real();
913 blockAt[rii] = -lhs(k, i).imag();
915 blockAt[rii] = lhs(k, i).imag();
// dhs_cpack specialization: packs a complex double RHS two column-pairs at a
// time, splitting real/imaginary parts via the 64-bit permute masks and
// negating imaginary parts when Conjugate is set.
// NOTE(review): braces and conditional skeletons are missing from this
// extraction; code lines kept verbatim.
template<
typename Index,
typename DataMapper,
typename Packet,
typename PacketC,
int StorageOrder,
bool Conjugate,
bool PanelMode>
927struct dhs_cpack<double, Index, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false>
929 EIGEN_STRONG_INLINE
void operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
931 const Index vectorSize = quad_traits<double>::vectorsize;
932 const Index vectorDelta = 2*vectorSize * ((PanelMode) ? stride : depth);
933 Index rir = ((PanelMode) ? (2*vectorSize*offset) : 0), rii;
// Flat scalar view of the complex destination block.
934 double* blockBt =
reinterpret_cast<double *
>(blockB);
// 4-column blocks.
937 for(; j + 2*vectorSize <= cols; j+=2*vectorSize)
941 rii = rir + vectorDelta;
943 for(; i < depth; i++)
945 PacketBlock<PacketC,4> cblock;
946 PacketBlock<Packet,2> blockr, blocki;
948 bload<DataMapper, PacketC, Index, 2, 0, ColMajor>(cblock, rhs, i, j);
950 blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64);
951 blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64);
953 blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG64);
954 blocki.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETIMAG64);
// Conjugate: negate the imaginary packets.
958 blocki.packet[0] = -blocki.packet[0];
959 blocki.packet[1] = -blocki.packet[1];
962 storeBlock<double, Packet, Index>(blockBt + rir, blockr);
963 storeBlock<double, Packet, Index>(blockBt + rii, blocki);
969 rir += ((PanelMode) ? (2*vectorSize*(2*stride - depth)) : vectorDelta);
// Column remainder: scalar packing.
974 if(PanelMode) rir += (offset*(cols - j - 2*vectorSize));
975 rii = rir + (((PanelMode) ? stride : depth) * (cols - j));
977 for(Index i = 0; i < depth; i++)
982 blockBt[rir] = rhs(i, k).real();
985 blockBt[rii] = -rhs(i, k).imag();
987 blockBt[rii] = rhs(i, k).imag();
1002template<
typename Packet,
bool NegativeAccumulate>
1003EIGEN_ALWAYS_INLINE
void pger_common(PacketBlock<Packet,4>* acc,
const Packet& lhsV,
const Packet* rhsV)
1005 if(NegativeAccumulate)
1007 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1008 acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
1009 acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
1010 acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
1012 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1013 acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
1014 acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
1015 acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
1019template<
typename Packet,
bool NegativeAccumulate>
1020EIGEN_ALWAYS_INLINE
void pger_common(PacketBlock<Packet,1>* acc,
const Packet& lhsV,
const Packet* rhsV)
1022 if(NegativeAccumulate)
1024 acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
1026 acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
1030template<
int N,
typename Scalar,
typename Packet,
bool NegativeAccumulate>
1031EIGEN_ALWAYS_INLINE
void pger(PacketBlock<Packet,N>* acc,
const Scalar* lhs,
const Packet* rhsV)
1033 Packet lhsV = pload<Packet>(lhs);
1035 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
// Loads only the first remaining_rows scalars of lhs into lhsV (tail of a
// row block that is narrower than a full packet).
// NOTE(review): this extraction lost the preprocessor skeleton — upstream
// this is an '#ifdef _ARCH_PWR9' vec_xl_len path with an '#else' scalar
// do/while copy loop ('Index i = 0; do { lhsV[i] = lhs[i]; } ...') — and the
// loop body line; code lines kept verbatim. Verify against upstream.
template<
typename Scalar,
typename Packet,
typename Index>
1039EIGEN_ALWAYS_INLINE
void loadPacketRemaining(
const Scalar* lhs, Packet &lhsV, Index remaining_rows)
// POWER9 path: length-limited vector load.
1042 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows *
sizeof(Scalar));
1047 }
while (++i < remaining_rows);
1051template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool NegativeAccumulate>
1052EIGEN_ALWAYS_INLINE
void pger(PacketBlock<Packet,N>* acc,
const Scalar* lhs,
const Packet* rhsV, Index remaining_rows)
1055 loadPacketRemaining<Scalar, Packet, Index>(lhs, lhsV, remaining_rows);
1057 pger_common<Packet, NegativeAccumulate>(acc, lhsV, rhsV);
1061template<
int N,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1062EIGEN_ALWAYS_INLINE
void pgerc_common(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Packet &lhsV,
const Packet &lhsVi,
const Packet* rhsV,
const Packet* rhsVi)
1064 pger_common<Packet, false>(accReal, lhsV, rhsV);
1067 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
1068 EIGEN_UNUSED_VARIABLE(lhsVi);
1071 pger_common<Packet, ConjugateLhs == ConjugateRhs>(accReal, lhsVi, rhsVi);
1072 pger_common<Packet, ConjugateRhs>(accImag, lhsV, rhsVi);
1074 EIGEN_UNUSED_VARIABLE(rhsVi);
1076 pger_common<Packet, ConjugateLhs>(accImag, lhsVi, rhsV);
1080template<
int N,
typename Scalar,
typename Packet,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1081EIGEN_ALWAYS_INLINE
void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag,
const Packet* rhsV,
const Packet* rhsVi)
1083 Packet lhsV = ploadLhs<Scalar, Packet>(lhs_ptr);
1085 if(!LhsIsReal) lhsVi = ploadLhs<Scalar, Packet>(lhs_ptr_imag);
1086 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1088 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
// Loads the first remaining_rows scalars of the real (and, unless LhsIsReal,
// imaginary) LHS pointers into lhsV/lhsVi for a partial row block.
// NOTE(review): this extraction lost the preprocessor skeleton — upstream
// the vec_xl_len lines sit under '#ifdef _ARCH_PWR9' and the do/while scalar
// copy under '#else' (with 'Index i = 0; do {') — code lines kept verbatim.
template<
typename Scalar,
typename Packet,
typename Index,
bool LhsIsReal>
1092EIGEN_ALWAYS_INLINE
void loadPacketRemaining(
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows)
// POWER9 path: length-limited vector loads.
1095 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows *
sizeof(Scalar));
1096 if(!LhsIsReal) lhsVi = vec_xl_len((Scalar *)lhs_ptr_imag, remaining_rows *
sizeof(Scalar));
1097 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
// Scalar fallback: copy lane by lane.
1101 lhsV[i] = lhs_ptr[i];
1102 if(!LhsIsReal) lhsVi[i] = lhs_ptr_imag[i];
1103 }
while (++i < remaining_rows);
1104 if(LhsIsReal) EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1108template<
int N,
typename Scalar,
typename Packet,
typename Index,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1109EIGEN_ALWAYS_INLINE
void pgerc(PacketBlock<Packet,N>* accReal, PacketBlock<Packet,N>* accImag,
const Scalar* lhs_ptr,
const Scalar* lhs_ptr_imag,
const Packet* rhsV,
const Packet* rhsVi, Index remaining_rows)
1112 loadPacketRemaining<Scalar, Packet, Index, LhsIsReal>(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows);
1114 pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
1117template<
typename Scalar,
typename Packet>
1118EIGEN_ALWAYS_INLINE Packet ploadLhs(
const Scalar* lhs)
1120 return ploadu<Packet>(lhs);
1124template<
typename Scalar,
typename Packet>
1125EIGEN_ALWAYS_INLINE
void bsetzero(PacketBlock<Packet,4>& acc)
1127 acc.packet[0] = pset1<Packet>((Scalar)0);
1128 acc.packet[1] = pset1<Packet>((Scalar)0);
1129 acc.packet[2] = pset1<Packet>((Scalar)0);
1130 acc.packet[3] = pset1<Packet>((Scalar)0);
1133template<
typename Scalar,
typename Packet>
1134EIGEN_ALWAYS_INLINE
void bsetzero(PacketBlock<Packet,1>& acc)
1136 acc.packet[0] = pset1<Packet>((Scalar)0);
1140template<
typename Packet>
1141EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha)
1143 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1144 acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
1145 acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
1146 acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
1149template<
typename Packet>
1150EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ,
const Packet& pAlpha)
1152 acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
1155template<
typename Packet>
1156EIGEN_ALWAYS_INLINE
void bscalec_common(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha)
1158 acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
1159 acc.packet[1] = pmul<Packet>(accZ.packet[1], pAlpha);
1160 acc.packet[2] = pmul<Packet>(accZ.packet[2], pAlpha);
1161 acc.packet[3] = pmul<Packet>(accZ.packet[3], pAlpha);
1164template<
typename Packet>
1165EIGEN_ALWAYS_INLINE
void bscalec_common(PacketBlock<Packet,1>& acc, PacketBlock<Packet,1>& accZ,
const Packet& pAlpha)
1167 acc.packet[0] = pmul<Packet>(accZ.packet[0], pAlpha);
1171template<
typename Packet,
int N>
1172EIGEN_ALWAYS_INLINE
void bscalec(PacketBlock<Packet,N>& aReal, PacketBlock<Packet,N>& aImag,
const Packet& bReal,
const Packet& bImag, PacketBlock<Packet,N>& cReal, PacketBlock<Packet,N>& cImag)
1174 bscalec_common<Packet>(cReal, aReal, bReal);
1176 bscalec_common<Packet>(cImag, aImag, bReal);
1178 pger_common<Packet, true>(&cReal, bImag, aImag.packet);
1180 pger_common<Packet, false>(&cImag, bImag, aReal.packet);
1183template<
typename Packet>
1184EIGEN_ALWAYS_INLINE
void band(PacketBlock<Packet,4>& acc,
const Packet& pMask)
1186 acc.packet[0] = pand(acc.packet[0], pMask);
1187 acc.packet[1] = pand(acc.packet[1], pMask);
1188 acc.packet[2] = pand(acc.packet[2], pMask);
1189 acc.packet[3] = pand(acc.packet[3], pMask);
1192template<
typename Packet>
1193EIGEN_ALWAYS_INLINE
void bscalec(PacketBlock<Packet,4>& aReal, PacketBlock<Packet,4>& aImag,
const Packet& bReal,
const Packet& bImag, PacketBlock<Packet,4>& cReal, PacketBlock<Packet,4>& cImag,
const Packet& pMask)
1195 band<Packet>(aReal, pMask);
1196 band<Packet>(aImag, pMask);
1198 bscalec<Packet,4>(aReal, aImag, bReal, bImag, cReal, cImag);
1202template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1203EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,4>& acc,
const DataMapper& res, Index row, Index col)
1206 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1207 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1208 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1209 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1211 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1212 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1213 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1214 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1219template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1220EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,8>& acc,
const DataMapper& res, Index row, Index col)
1223 acc.packet[0] = res.template loadPacket<Packet>(row + 0, col + N*accCols);
1224 acc.packet[1] = res.template loadPacket<Packet>(row + 1, col + N*accCols);
1225 acc.packet[2] = res.template loadPacket<Packet>(row + 2, col + N*accCols);
1226 acc.packet[3] = res.template loadPacket<Packet>(row + 3, col + N*accCols);
1227 acc.packet[4] = res.template loadPacket<Packet>(row + 0, col + (N+1)*accCols);
1228 acc.packet[5] = res.template loadPacket<Packet>(row + 1, col + (N+1)*accCols);
1229 acc.packet[6] = res.template loadPacket<Packet>(row + 2, col + (N+1)*accCols);
1230 acc.packet[7] = res.template loadPacket<Packet>(row + 3, col + (N+1)*accCols);
1232 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1233 acc.packet[1] = res.template loadPacket<Packet>(row + N*accCols, col + 1);
1234 acc.packet[2] = res.template loadPacket<Packet>(row + N*accCols, col + 2);
1235 acc.packet[3] = res.template loadPacket<Packet>(row + N*accCols, col + 3);
1236 acc.packet[4] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1237 acc.packet[5] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 1);
1238 acc.packet[6] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 2);
1239 acc.packet[7] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 3);
1243template<
typename DataMapper,
typename Packet,
typename Index, const Index accCols,
int N,
int StorageOrder>
1244EIGEN_ALWAYS_INLINE
void bload(PacketBlock<Packet,2>& acc,
const DataMapper& res, Index row, Index col)
1246 acc.packet[0] = res.template loadPacket<Packet>(row + N*accCols, col + 0);
1247 acc.packet[1] = res.template loadPacket<Packet>(row + (N+1)*accCols, col + 0);
1250const static Packet4i mask41 = { -1, 0, 0, 0 };
1251const static Packet4i mask42 = { -1, -1, 0, 0 };
1252const static Packet4i mask43 = { -1, -1, -1, 0 };
1254const static Packet2l mask21 = { -1, 0 };
1256template<
typename Packet>
1257EIGEN_ALWAYS_INLINE Packet bmask(
const int remaining_rows)
1259 if (remaining_rows == 0) {
1260 return pset1<Packet>(
float(0.0));
1262 switch (remaining_rows) {
1263 case 1:
return Packet(mask41);
1264 case 2:
return Packet(mask42);
1265 default:
return Packet(mask43);
1271EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(
const int remaining_rows)
1273 if (remaining_rows == 0) {
1274 return pset1<Packet2d>(
double(0.0));
1276 return Packet2d(mask21);
1280template<
typename Packet>
1281EIGEN_ALWAYS_INLINE
void bscale(PacketBlock<Packet,4>& acc, PacketBlock<Packet,4>& accZ,
const Packet& pAlpha,
const Packet& pMask)
1283 band<Packet>(accZ, pMask);
1285 bscale<Packet>(acc, accZ, pAlpha);
1288template<
typename Packet>
1289EIGEN_ALWAYS_INLINE
void pbroadcast4_old(
const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3)
1291 pbroadcast4<Packet>(a, a0, a1, a2, a3);
1295EIGEN_ALWAYS_INLINE
void pbroadcast4_old<Packet2d>(
const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
1297 a1 = pload<Packet2d>(a);
1298 a3 = pload<Packet2d>(a + 2);
1299 a0 = vec_splat(a1, 0);
1300 a1 = vec_splat(a1, 1);
1301 a2 = vec_splat(a3, 0);
1302 a3 = vec_splat(a3, 1);
1308template<
typename Scalar,
typename Packet,
typename Index>
1309EIGEN_ALWAYS_INLINE
void MICRO_EXTRA_COL(
1310 const Scalar* &lhs_ptr,
1311 const Scalar* &rhs_ptr,
1312 PacketBlock<Packet,1> &accZero,
1313 Index remaining_rows,
1314 Index remaining_cols)
1317 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1318 pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1319 lhs_ptr += remaining_rows;
1320 rhs_ptr += remaining_cols;
1323template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows>
1324EIGEN_STRONG_INLINE
void gemm_extra_col(
1325 const DataMapper& res,
1326 const Scalar* lhs_base,
1327 const Scalar* rhs_base,
1333 Index remaining_rows,
1334 Index remaining_cols,
1335 const Packet& pAlpha)
1337 const Scalar* rhs_ptr = rhs_base;
1338 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1339 PacketBlock<Packet,1> accZero;
1341 bsetzero<Scalar, Packet>(accZero);
1343 Index remaining_depth = (depth & -accRows);
1345 for(; k + PEEL <= remaining_depth; k+= PEEL)
1347 EIGEN_POWER_PREFETCH(rhs_ptr);
1348 EIGEN_POWER_PREFETCH(lhs_ptr);
1349 for (
int l = 0; l < PEEL; l++) {
1350 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1353 for(; k < remaining_depth; k++)
1355 MICRO_EXTRA_COL<Scalar, Packet, Index>(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols);
1357 for(; k < depth; k++)
1360 rhsV[0] = pset1<Packet>(rhs_ptr[0]);
1361 pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1362 lhs_ptr += remaining_rows;
1363 rhs_ptr += remaining_cols;
1366 accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]);
1367 for(Index i = 0; i < remaining_rows; i++) {
1368 res(row + i, col) += accZero.packet[0][i];
1372template<
typename Scalar,
typename Packet,
typename Index, const Index accRows>
1373EIGEN_ALWAYS_INLINE
void MICRO_EXTRA_ROW(
1374 const Scalar* &lhs_ptr,
1375 const Scalar* &rhs_ptr,
1376 PacketBlock<Packet,4> &accZero,
1377 Index remaining_rows)
1380 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1381 pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
1382 lhs_ptr += remaining_rows;
1386template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accRows, const Index accCols>
1387EIGEN_STRONG_INLINE
void gemm_extra_row(
1388 const DataMapper& res,
1389 const Scalar* lhs_base,
1390 const Scalar* rhs_base,
1398 Index remaining_rows,
1399 const Packet& pAlpha,
1400 const Packet& pMask)
1402 const Scalar* rhs_ptr = rhs_base;
1403 const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA;
1404 PacketBlock<Packet,4> accZero, acc;
1406 bsetzero<Scalar, Packet>(accZero);
1408 Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
1410 for(; k + PEEL <= remaining_depth; k+= PEEL)
1412 EIGEN_POWER_PREFETCH(rhs_ptr);
1413 EIGEN_POWER_PREFETCH(lhs_ptr);
1414 for (
int l = 0; l < PEEL; l++) {
1415 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1418 for(; k < remaining_depth; k++)
1420 MICRO_EXTRA_ROW<Scalar, Packet, Index, accRows>(lhs_ptr, rhs_ptr, accZero, remaining_rows);
1423 if ((remaining_depth == depth) && (rows >= accCols))
1425 for(Index j = 0; j < 4; j++) {
1426 acc.packet[j] = res.template loadPacket<Packet>(row, col + j);
1428 bscale<Packet>(acc, accZero, pAlpha, pMask);
1429 res.template storePacketBlock<Packet,4>(row, col, acc);
1431 for(; k < depth; k++)
1434 pbroadcast4<Packet>(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1435 pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows);
1436 lhs_ptr += remaining_rows;
1440 for(Index j = 0; j < 4; j++) {
1441 accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]);
1443 for(Index j = 0; j < 4; j++) {
1444 for(Index i = 0; i < remaining_rows; i++) {
1445 res(row + i, col + j) += accZero.packet[j][i];
// Helper macros for the real-valued unrolled GEMM micro kernel. unroll_factor,
// lhs_ptr#/accZero#, rhs_ptr, accRows/accCols, PEEL, pAlpha, res, row and col
// are names expected to exist at each expansion site.
#define MICRO_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)

#define MICRO_UNROLL_WORK(func, func2, peel) \
    MICRO_UNROLL(func2); \
    func(0,peel) func(1,peel) func(2,peel) func(3,peel) \
    func(4,peel) func(5,peel) func(6,peel) func(7,peel)

// Load one lhs packet for accumulator `iter` (no-op past unroll_factor).
#define MICRO_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr##iter); \
    lhs_ptr##iter += accCols; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
  }

// Rank-1 update of accumulator `iter` with the broadcast rhs of step `peel`.
#define MICRO_WORK_ONE(iter, peel) \
  if (unroll_factor > iter) { \
    pger_common<Packet, false>(&accZero##iter, lhsV##iter, rhsV##peel); \
  }

#define MICRO_TYPE_PEEL4(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    pbroadcast4<Packet>(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

#define MICRO_TYPE_PEEL1(func, func2, peel) \
  if (PEEL > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr[remaining_cols * peel]); \
    MICRO_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
  }

#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M]; \
  func(func1,func2,0);

#define MICRO_ONE_PEEL4 \
  MICRO_UNROLL_TYPE_PEEL(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (accRows * PEEL);

#define MICRO_ONE4 \
  MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += accRows;

#define MICRO_ONE_PEEL1 \
  MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += (remaining_cols * PEEL);

#define MICRO_ONE1 \
  MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \
  rhs_ptr += remaining_cols;

// Zero-initialize (or mark unused) each accumulator.
#define MICRO_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accZero##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accZero##iter); \
  }

#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)

// Point each lhs pointer at its packed strip.
#define MICRO_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \
  }

#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)

#define MICRO_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
  }

#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)

// Read-modify-write four result packets: res += alpha * accZero.
#define MICRO_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    acc.packet[1] = res.template loadPacket<Packet>(row + iter*accCols, col + 1); \
    acc.packet[2] = res.template loadPacket<Packet>(row + iter*accCols, col + 2); \
    acc.packet[3] = res.template loadPacket<Packet>(row + iter*accCols, col + 3); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,4>(row + iter*accCols, col, acc); \
  }

#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)

// Single-column variant of MICRO_STORE.
#define MICRO_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    acc.packet[0] = res.template loadPacket<Packet>(row + iter*accCols, col + 0); \
    bscale<Packet>(acc, accZero##iter, pAlpha); \
    res.template storePacketBlock<Packet,1>(row + iter*accCols, col, acc); \
  }

#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE)
// One fully-unrolled pass over `depth` for unroll_factor row strips of
// accCols rows against accRows columns. Advances `row` past the strips it
// consumed (note: row is taken by reference).
template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accRows, const Index accCols>
EIGEN_STRONG_INLINE void gemm_unrolled_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index& row,
  Index col,
  const Packet& pAlpha)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
  PacketBlock<Packet,4> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
  PacketBlock<Packet,4> acc;

  MICRO_SRC_PTR
  MICRO_DST_PTR

  Index k = 0;
  for(; k + PEEL <= depth; k+= PEEL)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    MICRO_PREFETCH
    MICRO_ONE_PEEL4
  }
  for(; k < depth; k++)
  {
    MICRO_ONE4
  }
  MICRO_STORE

  row += unroll_factor*accCols;
}
// Single-column counterpart of gemm_unrolled_iteration, used for the
// leftover (cols % accRows) columns. Advances `row` by reference.
template<int unroll_factor, typename Scalar, typename Packet, typename DataMapper, typename Index, const Index accCols>
EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index& row,
  Index col,
  Index remaining_cols,
  const Packet& pAlpha)
{
  const Scalar* rhs_ptr = rhs_base;
  const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL;
  PacketBlock<Packet,1> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
  PacketBlock<Packet,1> acc;

  MICRO_SRC_PTR
  MICRO_DST_PTR

  Index k = 0;
  for(; k + PEEL <= depth; k+= PEEL)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr);
    MICRO_PREFETCH
    MICRO_ONE_PEEL1
  }
  for(; k < depth; k++)
  {
    MICRO_ONE1
  }
  MICRO_COL_STORE

  row += unroll_factor*accCols;
}
1637template<
typename Scalar,
typename Packet,
typename DataMapper,
typename Index, const Index accCols>
1638EIGEN_STRONG_INLINE
void gemm_unrolled_col(
1639 const DataMapper& res,
1640 const Scalar* lhs_base,
1641 const Scalar* rhs_base,
1648 Index remaining_cols,
1649 const Packet& pAlpha)
1652 while(row + MAX_UNROLL*accCols <= rows) {
1653 gemm_unrolled_col_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1655 switch( (rows-row)/accCols ) {
1658 gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1663 gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1668 gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1673 gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1678 gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1683 gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1688 gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha);
1700template<
typename Scalar,
typename Index,
typename Packet,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols>
1701EIGEN_STRONG_INLINE
void gemm(
const DataMapper& res,
const Scalar* blockA,
const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
1703 const Index remaining_rows = rows % accCols;
1704 const Index remaining_cols = cols % accRows;
1706 if( strideA == -1 ) strideA = depth;
1707 if( strideB == -1 ) strideB = depth;
1709 const Packet pAlpha = pset1<Packet>(alpha);
1710 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
1713 for(; col + accRows <= cols; col += accRows)
1715 const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB;
1716 const Scalar* lhs_base = blockA;
1720 while(row + MAX_UNROLL*accCols <= rows) {
1721 gemm_unrolled_iteration<MAX_UNROLL, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1723 switch( (rows-row)/accCols ) {
1726 gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1731 gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1736 gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1741 gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1746 gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1751 gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1756 gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha);
1764 if(remaining_rows > 0)
1766 gemm_extra_row<Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask);
1770 if(remaining_cols > 0)
1772 const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB;
1773 const Scalar* lhs_base = blockA;
1775 for(; col < cols; col++)
1779 gemm_unrolled_col<Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha);
1781 if (remaining_rows > 0)
1783 gemm_extra_col<Scalar, Packet, DataMapper, Index, accRows>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha);
// Complex-kernel helpers: a complex packet holds half as many values as a
// real one; advanceRows/advanceCols account for the separate real/imag
// planes in the packed operands.
#define accColsC (accCols / 2)
#define advanceRows ((LhsIsReal) ? 1 : 2)
#define advanceCols ((RhsIsReal) ? 1 : 2)

// Depth-peeling factor for the complex micro kernels.
#define PEEL_COMPLEX 3
1797template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1798EIGEN_ALWAYS_INLINE
void MICRO_COMPLEX_EXTRA_COL(
1799 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1800 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1801 PacketBlock<Packet,1> &accReal, PacketBlock<Packet,1> &accImag,
1802 Index remaining_rows,
1803 Index remaining_cols)
1805 Packet rhsV[1], rhsVi[1];
1806 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1807 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1808 pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1809 lhs_ptr_real += remaining_rows;
1810 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1811 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1812 rhs_ptr_real += remaining_cols;
1813 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1814 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1817template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1818EIGEN_STRONG_INLINE
void gemm_complex_extra_col(
1819 const DataMapper& res,
1820 const Scalar* lhs_base,
1821 const Scalar* rhs_base,
1828 Index remaining_rows,
1829 Index remaining_cols,
1830 const Packet& pAlphaReal,
1831 const Packet& pAlphaImag)
1833 const Scalar* rhs_ptr_real = rhs_base;
1834 const Scalar* rhs_ptr_imag;
1835 if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB;
1836 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1837 const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1838 const Scalar* lhs_ptr_imag;
1839 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1840 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1841 PacketBlock<Packet,1> accReal, accImag;
1842 PacketBlock<Packet,1> taccReal, taccImag;
1843 PacketBlock<Packetc,1> acc0, acc1;
1845 bsetzero<Scalar, Packet>(accReal);
1846 bsetzero<Scalar, Packet>(accImag);
1848 Index remaining_depth = (depth & -accRows);
1850 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1852 EIGEN_POWER_PREFETCH(rhs_ptr_real);
1854 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1856 EIGEN_POWER_PREFETCH(lhs_ptr_real);
1858 EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1860 for (
int l = 0; l < PEEL_COMPLEX; l++) {
1861 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1864 for(; k < remaining_depth; k++)
1866 MICRO_COMPLEX_EXTRA_COL<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols);
1869 for(; k < depth; k++)
1871 Packet rhsV[1], rhsVi[1];
1872 rhsV[0] = pset1<Packet>(rhs_ptr_real[0]);
1873 if(!RhsIsReal) rhsVi[0] = pset1<Packet>(rhs_ptr_imag[0]);
1874 pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1875 lhs_ptr_real += remaining_rows;
1876 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1877 rhs_ptr_real += remaining_cols;
1878 if(!RhsIsReal) rhs_ptr_imag += remaining_cols;
1881 bscalec<Packet,1>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1882 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1884 if ((
sizeof(Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1886 res(row + 0, col + 0) += pfirst<Packetc>(acc0.packet[0]);
1888 acc0.packet[0] += res.template loadPacket<Packetc>(row + 0, col + 0);
1889 res.template storePacketBlock<Packetc,1>(row + 0, col + 0, acc0);
1890 if(remaining_rows > accColsC) {
1891 res(row + accColsC, col + 0) += pfirst<Packetc>(acc1.packet[0]);
1896template<
typename Scalar,
typename Packet,
typename Index, const Index accRows,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1897EIGEN_ALWAYS_INLINE
void MICRO_COMPLEX_EXTRA_ROW(
1898 const Scalar* &lhs_ptr_real,
const Scalar* &lhs_ptr_imag,
1899 const Scalar* &rhs_ptr_real,
const Scalar* &rhs_ptr_imag,
1900 PacketBlock<Packet,4> &accReal, PacketBlock<Packet,4> &accImag,
1901 Index remaining_rows)
1903 Packet rhsV[4], rhsVi[4];
1904 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1905 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1906 pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi);
1907 lhs_ptr_real += remaining_rows;
1908 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1909 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1910 rhs_ptr_real += accRows;
1911 if(!RhsIsReal) rhs_ptr_imag += accRows;
1912 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1915template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
1916EIGEN_STRONG_INLINE
void gemm_complex_extra_row(
1917 const DataMapper& res,
1918 const Scalar* lhs_base,
1919 const Scalar* rhs_base,
1928 Index remaining_rows,
1929 const Packet& pAlphaReal,
1930 const Packet& pAlphaImag,
1931 const Packet& pMask)
1933 const Scalar* rhs_ptr_real = rhs_base;
1934 const Scalar* rhs_ptr_imag;
1935 if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB;
1936 else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
1937 const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA;
1938 const Scalar* lhs_ptr_imag;
1939 if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA;
1940 else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
1941 PacketBlock<Packet,4> accReal, accImag;
1942 PacketBlock<Packet,4> taccReal, taccImag;
1943 PacketBlock<Packetc,4> acc0, acc1;
1944 PacketBlock<Packetc,8> tRes;
1946 bsetzero<Scalar, Packet>(accReal);
1947 bsetzero<Scalar, Packet>(accImag);
1949 Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows);
1951 for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX)
1953 EIGEN_POWER_PREFETCH(rhs_ptr_real);
1955 EIGEN_POWER_PREFETCH(rhs_ptr_imag);
1957 EIGEN_POWER_PREFETCH(lhs_ptr_real);
1959 EIGEN_POWER_PREFETCH(lhs_ptr_imag);
1961 for (
int l = 0; l < PEEL_COMPLEX; l++) {
1962 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1965 for(; k < remaining_depth; k++)
1967 MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, Index, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows);
1970 if ((remaining_depth == depth) && (rows >= accCols))
1972 bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row, col);
1973 bscalec<Packet>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
1974 bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1);
1975 res.template storePacketBlock<Packetc,4>(row + 0, col, acc0);
1976 res.template storePacketBlock<Packetc,4>(row + accColsC, col, acc1);
1978 for(; k < depth; k++)
1980 Packet rhsV[4], rhsVi[4];
1981 pbroadcast4_old<Packet>(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]);
1982 if(!RhsIsReal) pbroadcast4_old<Packet>(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]);
1983 pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows);
1984 lhs_ptr_real += remaining_rows;
1985 if(!LhsIsReal) lhs_ptr_imag += remaining_rows;
1986 rhs_ptr_real += accRows;
1987 if(!RhsIsReal) rhs_ptr_imag += accRows;
1990 bscalec<Packet,4>(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag);
1991 bcouple_common<Packet, Packetc>(taccReal, taccImag, acc0, acc1);
1993 if ((
sizeof(Scalar) ==
sizeof(
float)) && (remaining_rows == 1))
1995 for(Index j = 0; j < 4; j++) {
1996 res(row + 0, col + j) += pfirst<Packetc>(acc0.packet[j]);
1999 for(Index j = 0; j < 4; j++) {
2000 PacketBlock<Packetc,1> acc2;
2001 acc2.packet[0] = res.template loadPacket<Packetc>(row + 0, col + j) + acc0.packet[j];
2002 res.template storePacketBlock<Packetc,1>(row + 0, col + j, acc2);
2003 if(remaining_rows > accColsC) {
2004 res(row + accColsC, col + j) += pfirst<Packetc>(acc1.packet[j]);
// Helper macros for the complex unrolled kernels (5-way unroll). The same
// name conventions apply as for the real MICRO_* macros, with separate
// real/imag pointer and accumulator pairs.
#define MICRO_COMPLEX_UNROLL(func) \
  func(0) func(1) func(2) func(3) func(4)

#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
    MICRO_COMPLEX_UNROLL(func2); \
    func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel)

#define MICRO_COMPLEX_LOAD_ONE(iter) \
  if (unroll_factor > iter) { \
    lhsV##iter = ploadLhs<Scalar, Packet>(lhs_ptr_real##iter); \
    lhs_ptr_real##iter += accCols; \
    if(!LhsIsReal) { \
      lhsVi##iter = ploadLhs<Scalar, Packet>(lhs_ptr_imag##iter); \
      lhs_ptr_imag##iter += accCols; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhsV##iter); \
    EIGEN_UNUSED_VARIABLE(lhsVi##iter); \
  }

#define MICRO_COMPLEX_WORK_ONE4(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \
  if (unroll_factor > iter) { \
    pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
  }

#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    pbroadcast4_old<Packet>(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \
    if(!RhsIsReal) { \
      pbroadcast4_old<Packet>(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \
  if (PEEL_COMPLEX > peel) { \
    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \
    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \
    rhsV##peel[0] = pset1<Packet>(rhs_ptr_real[remaining_cols * peel]); \
    if(!RhsIsReal) { \
      rhsVi##peel[0] = pset1<Packet>(rhs_ptr_imag[remaining_cols * peel]); \
    } else { \
      EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
    } \
    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
  } else { \
    EIGEN_UNUSED_VARIABLE(rhsV##peel); \
    EIGEN_UNUSED_VARIABLE(rhsVi##peel); \
  }

#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \
  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \
  func(func1,func2,0); func(func1,func2,1); \
  func(func1,func2,2); func(func1,func2,3); \
  func(func1,func2,4); func(func1,func2,5); \
  func(func1,func2,6); func(func1,func2,7); \
  func(func1,func2,8); func(func1,func2,9);

#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
  Packet rhsV0[M], rhsVi0[M];\
  func(func1,func2,0);

#define MICRO_COMPLEX_ONE_PEEL4 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (accRows * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX);

#define MICRO_COMPLEX_ONE4 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += accRows; \
  if(!RhsIsReal) rhs_ptr_imag += accRows;

#define MICRO_COMPLEX_ONE_PEEL1 \
  MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \
  if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX);

#define MICRO_COMPLEX_ONE1 \
  MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \
  rhs_ptr_real += remaining_cols; \
  if(!RhsIsReal) rhs_ptr_imag += remaining_cols;

#define MICRO_COMPLEX_DST_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    bsetzero<Scalar, Packet>(accReal##iter); \
    bsetzero<Scalar, Packet>(accImag##iter); \
  } else { \
    EIGEN_UNUSED_VARIABLE(accReal##iter); \
    EIGEN_UNUSED_VARIABLE(accImag##iter); \
  }

#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)

#define MICRO_COMPLEX_SRC_PTR_ONE(iter) \
  if (unroll_factor > iter) { \
    lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \
    if(!LhsIsReal) { \
      lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \
    } else { \
      EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
    } \
  } else { \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \
    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \
  }

#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)

#define MICRO_COMPLEX_PREFETCH_ONE(iter) \
  if (unroll_factor > iter) { \
    EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \
    if(!LhsIsReal) { \
      EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \
    } \
  }

#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)

#define MICRO_COMPLEX_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,4>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,4>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)

#define MICRO_COMPLEX_COL_STORE_ONE(iter) \
  if (unroll_factor > iter) { \
    bload<DataMapper, Packetc, Index, accColsC, 0, ColMajor>(tRes, res, row + iter*accCols, col); \
    bscalec<Packet,1>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \
    bcouple<Packet, Packetc>(taccReal, taccImag, tRes, acc0, acc1); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + 0, col, acc0); \
    res.template storePacketBlock<Packetc,1>(row + iter*accCols + accColsC, col, acc1); \
  }

#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE)
// One fully-unrolled complex pass over `depth` for unroll_factor strips of
// accCols rows against accRows columns. Advances `row` by reference.
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index& row,
  Index col,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag)
{
  const Scalar* rhs_ptr_real = rhs_base;
  const Scalar* rhs_ptr_imag;
  if(!RhsIsReal) {
    rhs_ptr_imag = rhs_base + accRows*strideB;
  } else {
    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  }
  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
  PacketBlock<Packet,4> accReal0, accImag0, accReal1, accImag1;
  PacketBlock<Packet,4> accReal2, accImag2, accReal3, accImag3;
  PacketBlock<Packet,4> accReal4, accImag4;
  PacketBlock<Packet,4> taccReal, taccImag;
  PacketBlock<Packetc,4> acc0, acc1;
  PacketBlock<Packetc,8> tRes;

  MICRO_COMPLEX_SRC_PTR
  MICRO_COMPLEX_DST_PTR

  Index k = 0;
  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr_real);
    if(!RhsIsReal) {
      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
    }
    MICRO_COMPLEX_PREFETCH
    MICRO_COMPLEX_ONE_PEEL4
  }
  for(; k < depth; k++)
  {
    MICRO_COMPLEX_ONE4
  }
  MICRO_COMPLEX_STORE

  row += unroll_factor*accCols;
}
// Single-column complex counterpart of gemm_complex_unrolled_iteration,
// used for the leftover (cols % accRows) columns. Advances `row` by reference.
template<int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper, typename Index, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration(
  const DataMapper& res,
  const Scalar* lhs_base,
  const Scalar* rhs_base,
  Index depth,
  Index strideA,
  Index offsetA,
  Index strideB,
  Index& row,
  Index col,
  Index remaining_cols,
  const Packet& pAlphaReal,
  const Packet& pAlphaImag)
{
  const Scalar* rhs_ptr_real = rhs_base;
  const Scalar* rhs_ptr_imag;
  if(!RhsIsReal) {
    rhs_ptr_imag = rhs_base + remaining_cols*strideB;
  } else {
    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag);
  }
  const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL;
  const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL;
  const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL;
  PacketBlock<Packet,1> accReal0, accImag0, accReal1, accImag1;
  PacketBlock<Packet,1> accReal2, accImag2, accReal3, accImag3;
  PacketBlock<Packet,1> accReal4, accImag4;
  PacketBlock<Packet,1> taccReal, taccImag;
  PacketBlock<Packetc,1> acc0, acc1;
  PacketBlock<Packetc,2> tRes;

  MICRO_COMPLEX_SRC_PTR
  MICRO_COMPLEX_DST_PTR

  Index k = 0;
  for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX)
  {
    EIGEN_POWER_PREFETCH(rhs_ptr_real);
    if(!RhsIsReal) {
      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
    }
    MICRO_COMPLEX_PREFETCH
    MICRO_COMPLEX_ONE_PEEL1
  }
  for(; k < depth; k++)
  {
    MICRO_COMPLEX_ONE1
  }
  MICRO_COMPLEX_COL_STORE

  row += unroll_factor*accCols;
}
// Drives the single-column complex kernel down the row dimension: runs the
// maximally-unrolled iteration while >= MAX_COMPLEX_UNROLL*accCols rows
// remain, then dispatches one final iteration sized by the leftover
// (rows-row)/accCols panels.
// NOTE(review): the extraction has dropped the `case N:`/`break;` lines of the
// switch and closing braces; code kept byte-identical.
 2273template<
typename Scalar,
typename Packet,
typename Packetc,
typename DataMapper,
typename Index, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
 2274EIGEN_STRONG_INLINE
void gemm_complex_unrolled_col(
 2275 const DataMapper& res,
 2276 const Scalar* lhs_base,
 2277 const Scalar* rhs_base,
 2285 Index remaining_cols,
 2286 const Packet& pAlphaReal,
 2287 const Packet& pAlphaImag)
 2289#define MAX_COMPLEX_UNROLL 3
 2290 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
 2291 gemm_complex_unrolled_col_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
// Tail dispatch: each guarded arm handles one possible leftover panel count.
 2293 switch( (rows-row)/accCols ) {
 2294#if MAX_COMPLEX_UNROLL > 4
 2296 gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
 2299#if MAX_COMPLEX_UNROLL > 3
 2301 gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
 2304#if MAX_COMPLEX_UNROLL > 2
 2306 gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
 2309#if MAX_COMPLEX_UNROLL > 1
 2311 gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag);
 2317#undef MAX_COMPLEX_UNROLL
// Top-level complex GEMM driver (non-MMA path): tiles the output into
// accRows x accCols panels, dispatching the unrolled panel kernel for full
// tiles and the extra_row / extra_col / unrolled_col kernels for the
// remainder rows and columns.
// NOTE(review): extraction has dropped braces, `case`/`break` lines and loop
// headers (e.g. the `row` reset per column panel); code kept byte-identical.
 2320template<
typename LhsScalar,
typename RhsScalar,
typename Scalarc,
typename Scalar,
typename Index,
typename Packet,
typename Packetc,
typename RhsPacket,
typename DataMapper, const Index accRows, const Index accCols,
bool ConjugateLhs,
bool ConjugateRhs,
bool LhsIsReal,
bool RhsIsReal>
 2321EIGEN_STRONG_INLINE
void gemm_complex(
const DataMapper& res,
const LhsScalar* blockAc,
const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB)
// Remainders after tiling rows by accCols and cols by accRows.
 2323 const Index remaining_rows = rows % accCols;
 2324 const Index remaining_cols = cols % accRows;
// strideA/strideB == -1 is the sentinel for "packed contiguously by depth".
 2326 if( strideA == -1 ) strideA = depth;
 2327 if( strideB == -1 ) strideB = depth;
// Broadcast alpha's real/imag parts once; pMask guards partial-row stores.
 2329 const Packet pAlphaReal = pset1<Packet>(alpha.real());
 2330 const Packet pAlphaImag = pset1<Packet>(alpha.imag());
 2331 const Packet pMask = bmask<Packet>((
const int)(remaining_rows));
 2333 const Scalar* blockA = (Scalar *) blockAc;
 2334 const Scalar* blockB = (Scalar *) blockBc;
// Main loop over full accRows-wide column panels.
 2337 for(; col + accRows <= cols; col += accRows)
 2339 const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB;
 2340 const Scalar* lhs_base = blockA;
 2343#define MAX_COMPLEX_UNROLL 3
 2344 while(row + MAX_COMPLEX_UNROLL*accCols <= rows) {
 2345 gemm_complex_unrolled_iteration<MAX_COMPLEX_UNROLL, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
// Tail dispatch on the number of leftover accCols row-panels.
 2347 switch( (rows-row)/accCols ) {
 2348#if MAX_COMPLEX_UNROLL > 4
 2350 gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
 2353#if MAX_COMPLEX_UNROLL > 3
 2355 gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
 2358#if MAX_COMPLEX_UNROLL > 2
 2360 gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
 2363#if MAX_COMPLEX_UNROLL > 1
 2365 gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag);
 2371#undef MAX_COMPLEX_UNROLL
// Partial row tail of the current full column panel.
 2373 if(remaining_rows > 0)
 2375 gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask);
// Partial column tail: handle the last cols % accRows columns one at a time.
 2379 if(remaining_cols > 0)
 2381 const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB;
 2382 const Scalar* lhs_base = blockA;
 2384 for(; col < cols; col++)
 2388 gemm_complex_unrolled_col<Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag);
 2390 if (remaining_rows > 0)
 2392 gemm_complex_extra_col<Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag);
2406template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2407struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2409 void operator()(
double* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2412template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2413void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2414 ::operator()(
double* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2416 dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
2417 pack(blockA, lhs, depth, rows, stride, offset);
2420template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2421struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2423 void operator()(
double* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2426template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2427void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2428 ::operator()(
double* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2430 dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
2431 pack(blockA, lhs, depth, rows, stride, offset);
2434#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2435template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2436struct gemm_pack_rhs<double, Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2438 void operator()(
double* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2441template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2442void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2443 ::operator()(
double* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2445 dhs_pack<double, Index, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
2446 pack(blockB, rhs, depth, cols, stride, offset);
2449template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2450struct gemm_pack_rhs<double, Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2452 void operator()(
double* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2455template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2456void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2457 ::operator()(
double* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2459 dhs_pack<double, Index, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
2460 pack(blockB, rhs, depth, cols, stride, offset);
2464template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2465struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2467 void operator()(
float* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2470template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2471void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
2472 ::operator()(
float* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2474 dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
2475 pack(blockA, lhs, depth, rows, stride, offset);
2478template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2479struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2481 void operator()(
float* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2484template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2485void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
2486 ::operator()(
float* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2488 dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
2489 pack(blockA, lhs, depth, rows, stride, offset);
2492template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2493struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2495 void operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2498template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2499void gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2500 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2502 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
2503 pack(blockA, lhs, depth, rows, stride, offset);
2506template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2507struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2509 void operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2512template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2513void gemm_pack_lhs<std::complex<float>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2514 ::operator()(std::complex<float>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2516 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
2517 pack(blockA, lhs, depth, rows, stride, offset);
2520#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
2521template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2522struct gemm_pack_rhs<float, Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2524 void operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2527template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2528void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>
2529 ::operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2531 dhs_pack<float, Index, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
2532 pack(blockB, rhs, depth, cols, stride, offset);
2535template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2536struct gemm_pack_rhs<float, Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2538 void operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2541template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2542void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
2543 ::operator()(
float* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2545 dhs_pack<float, Index, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
2546 pack(blockB, rhs, depth, cols, stride, offset);
2550template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2551struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2553 void operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2556template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2557void gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2558 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2560 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
2561 pack(blockB, rhs, depth, cols, stride, offset);
2564template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2565struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2567 void operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2570template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2571void gemm_pack_rhs<std::complex<float>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2572 ::operator()(std::complex<float>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2574 dhs_cpack<float, Index, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
2575 pack(blockB, rhs, depth, cols, stride, offset);
2578template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2579struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2581 void operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2584template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2585void gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
RowMajor, Conjugate, PanelMode>
2586 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2588 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
2589 pack(blockA, lhs, depth, rows, stride, offset);
2592template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2593struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2595 void operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
2598template<
typename Index,
typename DataMapper,
int Pack1,
int Pack2,
typename Packet,
bool Conjugate,
bool PanelMode>
2599void gemm_pack_lhs<std::complex<double>,
Index, DataMapper, Pack1, Pack2, Packet,
ColMajor, Conjugate, PanelMode>
2600 ::operator()(std::complex<double>* blockA,
const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
2602 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
2603 pack(blockA, lhs, depth, rows, stride, offset);
2606template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2607struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2609 void operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2612template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2613void gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
ColMajor, Conjugate, PanelMode>
2614 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2616 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
2617 pack(blockB, rhs, depth, cols, stride, offset);
2620template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2621struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2623 void operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
2626template<
typename Index,
typename DataMapper,
int nr,
bool Conjugate,
bool PanelMode>
2627void gemm_pack_rhs<std::complex<double>,
Index, DataMapper, nr,
RowMajor, Conjugate, PanelMode>
2628 ::operator()(std::complex<double>* blockB,
const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset)
2630 dhs_cpack<double, Index, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
2631 pack(blockB, rhs, depth, cols, stride, offset);
// gebp_kernel specialization: real single-precision micro-kernel entry point.
// Selects at compile time and/or run time between the Power10 MMA kernel
// (gemmMMA) and the classic VSX kernel (gemm), then invokes it.
// NOTE(review): extraction has dropped brace and `} else {` lines; code is
// kept byte-identical and only comments are added.
 2635template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2636struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2638 typedef typename quad_traits<float>::vectortype Packet;
 2639 typedef typename quad_traits<float>::rhstype RhsPacket;
 2641 void operator()(
const DataMapper& res,
const float* blockA,
const float* blockB,
 2642 Index rows, Index depth, Index cols,
float alpha,
 2643 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2646template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2647void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2648 ::operator()(
const DataMapper& res,
const float* blockA,
const float* blockB,
 2649 Index rows, Index depth, Index cols,
float alpha,
 2650 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2652 const Index accRows = quad_traits<float>::rows;
 2653 const Index accCols = quad_traits<float>::size;
// Kernel chosen below is stored in this function pointer, then called once.
 2654 void (*gemm_function)(
const DataMapper&,
const float*,
const float*,
Index,
Index,
Index, float,
Index,
Index,
Index,
Index);
 2656 #ifdef EIGEN_ALTIVEC_MMA_ONLY
// Build is MMA-only: no runtime CPU feature check needed.
 2658 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2659 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2660 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2661 gemm_function = &Eigen::internal::gemmMMA<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2664 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2667 gemm_function = &Eigen::internal::gemm<float, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2669 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// gebp_kernel specialization: complex-float x complex-float micro-kernel.
// Dispatches between gemm_complexMMA (Power10) and gemm_complex (VSX);
// LhsIsReal/RhsIsReal template args are both false here.
// NOTE(review): extraction has dropped braces, `} else {` lines and part of
// the gemm_function pointer signature; code kept byte-identical.
 2672template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2673struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2675 typedef Packet4f Packet;
 2676 typedef Packet2cf Packetc;
 2677 typedef Packet4f RhsPacket;
 2679 void operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
 2680 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2681 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2684template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2685void gebp_kernel<std::complex<float>, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2686 ::operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const std::complex<float>* blockB,
 2687 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2688 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2690 const Index accRows = quad_traits<float>::rows;
 2691 const Index accCols = quad_traits<float>::size;
 2692 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const std::complex<float>*,
 2695 #ifdef EIGEN_ALTIVEC_MMA_ONLY
 2697 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2698 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2699 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2700 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2703 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2706 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2708 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// gebp_kernel specialization: real-float LHS x complex-float RHS.
// Same MMA/VSX dispatch as the other complex kernels, with LhsIsReal=true,
// RhsIsReal=false in the template arguments.
// NOTE(review): extraction has dropped braces, `} else {` lines and part of
// the gemm_function pointer signature; code kept byte-identical.
 2711template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2712struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2714 typedef Packet4f Packet;
 2715 typedef Packet2cf Packetc;
 2716 typedef Packet4f RhsPacket;
 2718 void operator()(
const DataMapper& res,
const float* blockA,
const std::complex<float>* blockB,
 2719 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2720 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2723template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2724void gebp_kernel<float, std::complex<float>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2725 ::operator()(
const DataMapper& res,
const float* blockA,
const std::complex<float>* blockB,
 2726 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2727 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2729 const Index accRows = quad_traits<float>::rows;
 2730 const Index accCols = quad_traits<float>::size;
 2731 void (*gemm_function)(
const DataMapper&,
const float*,
const std::complex<float>*,
 2733 #ifdef EIGEN_ALTIVEC_MMA_ONLY
 2735 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
 2736 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2737 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2738 gemm_function = &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
 2741 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
 2744 gemm_function = &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
 2746 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// gebp_kernel specialization: complex-float LHS x real-float RHS.
// Same MMA/VSX dispatch, with LhsIsReal=false, RhsIsReal=true.
// NOTE(review): extraction has dropped braces, `} else {` lines and part of
// the gemm_function pointer signature; code kept byte-identical.
 2749template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2750struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2752 typedef Packet4f Packet;
 2753 typedef Packet2cf Packetc;
 2754 typedef Packet4f RhsPacket;
 2756 void operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const float* blockB,
 2757 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2758 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2761template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2762void gebp_kernel<std::complex<float>, float,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2763 ::operator()(
const DataMapper& res,
const std::complex<float>* blockA,
const float* blockB,
 2764 Index rows, Index depth, Index cols, std::complex<float> alpha,
 2765 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2767 const Index accRows = quad_traits<float>::rows;
 2768 const Index accCols = quad_traits<float>::size;
 2769 void (*gemm_function)(
const DataMapper&,
const std::complex<float>*,
const float*,
 2771 #ifdef EIGEN_ALTIVEC_MMA_ONLY
 2773 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
 2774 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2775 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2776 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
 2779 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
 2782 gemm_function = &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
 2784 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// gebp_kernel specialization: real double-precision micro-kernel entry point.
// Mirrors the float version: compile-time and/or runtime dispatch between
// gemmMMA (Power10) and the classic VSX gemm.
// NOTE(review): extraction has dropped brace and `} else {` lines; code is
// kept byte-identical and only comments are added.
 2787template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2788struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2790 typedef typename quad_traits<double>::vectortype Packet;
 2791 typedef typename quad_traits<double>::rhstype RhsPacket;
 2793 void operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
 2794 Index rows, Index depth, Index cols,
double alpha,
 2795 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2798template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2799void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2800 ::operator()(
const DataMapper& res,
const double* blockA,
const double* blockB,
 2801 Index rows, Index depth, Index cols,
double alpha,
 2802 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2804 const Index accRows = quad_traits<double>::rows;
 2805 const Index accCols = quad_traits<double>::size;
// Kernel chosen below is stored in this function pointer, then called once.
 2806 void (*gemm_function)(
const DataMapper&,
const double*,
const double*,
Index,
Index,
Index, double,
Index,
Index,
Index,
Index);
 2808 #ifdef EIGEN_ALTIVEC_MMA_ONLY
// Build is MMA-only: no runtime CPU feature check needed.
 2810 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2811 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2812 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2813 gemm_function = &Eigen::internal::gemmMMA<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2816 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2819 gemm_function = &Eigen::internal::gemm<double, Index, Packet, RhsPacket, DataMapper, accRows, accCols>;
 2821 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
// gebp_kernel specialization: complex-double x complex-double micro-kernel.
// Same MMA/VSX dispatch as the complex-float kernel, using Packet2d/Packet1cd;
// LhsIsReal/RhsIsReal are both false.
// NOTE(review): extraction has dropped braces, `} else {` lines and part of
// the gemm_function pointer signature; code kept byte-identical.
 2824template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2825struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2827 typedef quad_traits<double>::vectortype Packet;
 2828 typedef Packet1cd Packetc;
 2829 typedef quad_traits<double>::rhstype RhsPacket;
 2831 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
 2832 Index rows, Index depth, Index cols, std::complex<double> alpha,
 2833 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 2836template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
 2837void gebp_kernel<std::complex<double>, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
 2838 ::operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const std::complex<double>* blockB,
 2839 Index rows, Index depth, Index cols, std::complex<double> alpha,
 2840 Index strideA, Index strideB, Index offsetA, Index offsetB)
 2842 const Index accRows = quad_traits<double>::rows;
 2843 const Index accCols = quad_traits<double>::size;
 2844 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const std::complex<double>*,
 2846 #ifdef EIGEN_ALTIVEC_MMA_ONLY
 2848 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2849 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
// Runtime dispatch: MMA only on ISA 3.1 (Power10) CPUs reporting "mma".
 2850 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
 2851 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2854 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2857 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
false>;
 2859 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2862template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2863struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2865 typedef quad_traits<double>::vectortype Packet;
2866 typedef Packet1cd Packetc;
2867 typedef quad_traits<double>::rhstype RhsPacket;
2869 void operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const double* blockB,
2870 Index rows, Index depth, Index cols, std::complex<double> alpha,
2871 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2874template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2875void gebp_kernel<std::complex<double>, double,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2876 ::operator()(
const DataMapper& res,
const std::complex<double>* blockA,
const double* blockB,
2877 Index rows, Index depth, Index cols, std::complex<double> alpha,
2878 Index strideA, Index strideB, Index offsetA, Index offsetB)
2880 const Index accRows = quad_traits<double>::rows;
2881 const Index accCols = quad_traits<double>::size;
2882 void (*gemm_function)(
const DataMapper&,
const std::complex<double>*,
const double*,
2884 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2886 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2887 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2888 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2889 gemm_function = &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2892 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2895 gemm_function = &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
false,
true>;
2897 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
2900template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2901struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2903 typedef quad_traits<double>::vectortype Packet;
2904 typedef Packet1cd Packetc;
2905 typedef quad_traits<double>::rhstype RhsPacket;
2907 void operator()(
const DataMapper& res,
const double* blockA,
const std::complex<double>* blockB,
2908 Index rows, Index depth, Index cols, std::complex<double> alpha,
2909 Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
2912template<
typename Index,
typename DataMapper,
int mr,
int nr,
bool ConjugateLhs,
bool ConjugateRhs>
2913void gebp_kernel<double, std::complex<double>,
Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
2914 ::operator()(
const DataMapper& res,
const double* blockA,
const std::complex<double>* blockB,
2915 Index rows, Index depth, Index cols, std::complex<double> alpha,
2916 Index strideA, Index strideB, Index offsetA, Index offsetB)
2918 const Index accRows = quad_traits<double>::rows;
2919 const Index accCols = quad_traits<double>::size;
2920 void (*gemm_function)(
const DataMapper&,
const double*,
const std::complex<double>*,
2922 #ifdef EIGEN_ALTIVEC_MMA_ONLY
2924 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2925 #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA)
2926 if (__builtin_cpu_supports (
"arch_3_1") && __builtin_cpu_supports (
"mma")){
2927 gemm_function = &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2930 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2933 gemm_function = &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double,
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs,
true,
false>;
2935 gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
@ ColMajor
Definition Constants.h:319
@ RowMajor
Definition Constants.h:321
Namespace containing all symbols from the Eigen library.
Definition Core:141
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_real_op< typename Derived::Scalar >, const Derived > real(const Eigen::ArrayBase< Derived > &x)
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition Meta.h:74
const Eigen::CwiseUnaryOp< Eigen::internal::scalar_imag_op< typename Derived::Scalar >, const Derived > imag(const Eigen::ArrayBase< Derived > &x)