40#ifndef NMBLAS_SGEMM_H_
41#define NMBLAS_SGEMM_H_
/*
 * ALL_FPU(instr): builds one assembler string by pasting the string literal
 * `instr` after an "fpu <n> " prefix for each FPU index, terminating every
 * copy with "\n\t" so the copies land on separate assembler lines.
 * The visible part covers FPU 0, 1 and 2; the definition continues on
 * further continuation lines that are not shown here (presumably an
 * "fpu 3" entry - TODO confirm against the full header).
 */
44#define ALL_FPU( instr ) "fpu 0 " instr "\n\t" \
45 "fpu 1 " instr "\n\t" \
46 "fpu 2 " instr "\n\t" \
/*
 * loadCFromMemory: emits one vector load into vreg7 of the FPU selected by
 * `fpu`.
 *
 *   pc             base pointer of the C tile; the address actually handed
 *                  to the instruction is pc + fpu*2
 *   ldc            value bound to the "RG0" operand and used as the
 *                  post-increment step of the address (%2)
 *   fpu            FPU index, passed as an immediate ("i") and substituted
 *                  into the instruction text (%4) - must be a compile-time
 *                  constant after inlining
 *   dummy_to_link  read-write memory operand ("+m"); presumably exists only
 *                  to create a dependency that keeps the separate asm
 *                  statements in program order - TODO confirm
 *
 * NOTE(review): the address operand is input-only ("RA0") although the
 * template uses post-increment addressing [%1++%2]. If the instruction
 * really updates the address register, GCC's extended-asm rules require an
 * in-out operand here (loadAFromMemory uses "+RA0" for the same pattern) -
 * confirm.
 */
52static inline __attribute__((always_inline))
void
53loadCFromMemory(
const float* pc,
int ldc,
const int fpu,
int* dummy_to_link )
/* Template: on FPU %4, load vreg7 from [%1], stepping the address by %2.
 * "rep vlen" presumably takes its repeat count from a vlen register that is
 * set up elsewhere - verify against the caller. */
56 "fpu %4 rep vlen vreg7= [%1++%2];\n\t"
57 :
/* %0: ordering token (read-write memory) */
"+m"(*dummy_to_link)
58 :
/* %1: start address, offset by two floats per FPU index */
"RA0"( pc + fpu*2 ),
/* %2: address step */
"RG0"(ldc),
/* %3: tells the compiler the memory behind pc is read by the asm */
"m" (*(
const float (*)[])pc),
/* %4: FPU index as an immediate */
"i"(fpu) );
/*
 * loadAFromMemory: loads vector register vreg<vrNum> on FPU 0 from memory
 * and then copies it along the chain FPU 0 -> 1 -> 2 -> 3, so that all four
 * FPUs end up with the same register contents.
 *
 *   pa             source pointer; bound as an in-out "+RA0" operand because
 *                  the load post-increments it. pa is a by-value parameter,
 *                  so the advanced address is not visible to the caller.
 *   lda            value bound to the "RG0" operand and used as the address
 *                  step (%2)
 *   vrNum          destination vector register number, passed as an
 *                  immediate ("i") and substituted into the register name
 *                  (vreg%3) - must be a compile-time constant after inlining
 *   dummy_to_link  read-write memory operand ("+m"); presumably an ordering
 *                  token shared by the asm statements - TODO confirm
 */
62static inline __attribute__((always_inline))
void
63loadAFromMemory(
const float* pa,
int lda,
const int vrNum,
int* dummy_to_link )
/* Load on FPU 0 only; "rep vlen" presumably repeats per a vlen register set
 * elsewhere - verify against the caller. */
66 "fpu 0 rep vlen vreg%3 = [%1++%2];\n\t"
/* Forward the freshly loaded register FPU-to-FPU instead of re-reading
 * memory three more times. */
68 "fpu 1 vreg%3 = fpu 0 vreg%3;\n\t"
69 "fpu 2 vreg%3 = fpu 1 vreg%3;\n\t"
70 "fpu 3 vreg%3 = fpu 2 vreg%3;\n\t"
71 :
/* %0: ordering token, %1: source address (updated by the load) */
"+m" (*dummy_to_link),
"+RA0" (pa)
72 :
/* %2: address step, %3: register number */
"RG0"(lda),
"i"(vrNum),
/* %4: tells the compiler the memory behind pa is read by the asm */
"m"(*(
const float (*)[])pa) );
/*
 * loadBAndMAdd: for each of FPU 0..3 loads one element group into vreg4 from
 * the pb stream and one into vreg5 from the pb1 stream, then issues on every
 * FPU (via ALL_FPU) the accumulate step
 *     vreg7 = vreg<vrNum> * .retrieve(vreg4, vreg5) + vreg7
 *
 *   pb, pb1        source pointers, bound as in-out address-register
 *                  operands ("+a") because every load post-increments them;
 *                  the four FPUs therefore read consecutive locations
 *   ldb            not referenced in the visible lines of this function
 *   vrNum          number of the vector register used as the left factor,
 *                  passed as an immediate ("i") - must be a compile-time
 *                  constant after inlining
 *   dummy_to_link  read-write memory operand ("+m"); presumably an ordering
 *                  token shared by the asm statements - TODO confirm
 *
 * NOTE(review): presumably pb and pb1 address two consecutive rows of B
 * (the caller sets pb1 = B + (k+1)*ldb + j) - confirm against the caller.
 */
75static inline __attribute__((always_inline))
void
76loadBAndMAdd(
const float* pb,
const float* pb1,
int ldb,
const int vrNum,
int* dummy_to_link )
/* Per-FPU loads: %1 is pb, %2 is pb1; both advance after every load. */
79 "fpu 0 rep 1 vreg4 = [%1++];\n\t"
80 "fpu 0 rep 1 vreg5 = [%2++];\n\t"
81 "fpu 1 rep 1 vreg4 = [%1++];\n\t"
82 "fpu 1 rep 1 vreg5 = [%2++];\n\t"
83 "fpu 2 rep 1 vreg4 = [%1++];\n\t"
84 "fpu 2 rep 1 vreg5 = [%2++];\n\t"
85 "fpu 3 rep 1 vreg4 = [%1++];\n\t"
86 "fpu 3 rep 1 vreg5 = [%2++];\n\t"
/* Multiply-accumulate into vreg7 on every FPU covered by ALL_FPU. */
87 ALL_FPU (
".matrix vreg7= vreg%3 * .retrieve (vreg4,vreg5) + vreg7;")
88 :
/* %0: ordering token, %1/%2: source addresses (updated by the loads) */
"+m" (*dummy_to_link),
"+a" (pb),
"+a" (pb1)
89 :
/* %3: register number; %4/%5: mark the memory behind pb and pb1 as read */
"i"(vrNum),
"m"(*(const float (*)[])pb),
"m"(*(const float (*)[])pb1) );
/*
 * loadBAndMultiply: same load sequence as loadBAndMAdd, but the final step
 * overwrites the accumulator instead of adding to it:
 *     vreg7 = vreg<vrNum> * .retrieve(vreg4, vreg5)
 * In the visible caller it is the first multiply of a sequence, followed by
 * loadBAndMAdd calls.
 *
 *   pb, pb1        source pointers, bound as in-out address-register
 *                  operands ("+a") because every load post-increments them
 *   ldb            not referenced in the visible lines of this function
 *   vrNum          number of the vector register used as the left factor,
 *                  passed as an immediate ("i") - must be a compile-time
 *                  constant after inlining
 *   dummy_to_link  read-write memory operand ("+m"); presumably an ordering
 *                  token shared by the asm statements - TODO confirm
 */
93static inline __attribute__((always_inline))
void
94loadBAndMultiply(
const float* pb,
const float* pb1,
int ldb,
const int vrNum,
int* dummy_to_link )
/* Per-FPU loads: %1 is pb, %2 is pb1; both advance after every load. */
97 "fpu 0 rep 1 vreg4 = [%1++];\n\t"
98 "fpu 0 rep 1 vreg5 = [%2++];\n\t"
99 "fpu 1 rep 1 vreg4 = [%1++];\n\t"
100 "fpu 1 rep 1 vreg5 = [%2++];\n\t"
101 "fpu 2 rep 1 vreg4 = [%1++];\n\t"
102 "fpu 2 rep 1 vreg5 = [%2++];\n\t"
103 "fpu 3 rep 1 vreg4 = [%1++];\n\t"
104 "fpu 3 rep 1 vreg5 = [%2++];\n\t"
/* Plain product (no accumulate) written to vreg7 on every FPU covered by
 * ALL_FPU. */
105 ALL_FPU (
".matrix vreg7= vreg%3 * .retrieve (vreg4,vreg5);")
106 :
/* %0: ordering token, %1/%2: source addresses (updated by the loads) */
"+m" (*dummy_to_link),
"+a" (pb),
"+a" (pb1)
107 :
/* %3: register number; %4/%5: mark the memory behind pb and pb1 as read */
"i"(vrNum),
"m"(*(const float (*)[])pb),
"m"(*(const float (*)[])pb1) );
/*
 * storeCToMemory: emits one vector store of vreg7 from the FPU selected by
 * `fpu` to the memory at pc.
 *
 *   pc             destination pointer, bound to the "RA0" operand; unlike
 *                  loadCFromMemory no per-FPU offset is added here - the
 *                  visible caller passes C + i*ldc + j + 2*fpu itself
 *   ldc            value bound to the "RG0" operand
 *   fpu            FPU index, passed as an immediate ("i") and substituted
 *                  into the instruction text (%4) - must be a compile-time
 *                  constant after inlining
 *   dummy_to_link  input memory operand ("m"); presumably makes the store
 *                  depend on the preceding asm statements - TODO confirm
 *
 * NOTE(review): the template names ar0/gr0 literally rather than %1/%2, so
 * it relies on the "RA0"/"RG0" constraints placing pc and ldc in exactly
 * those registers - presumably that is what these constraints mean; confirm
 * against the target's constraint definitions.
 * NOTE(review): pc is an input-only operand although the template uses
 * post-increment addressing; if the instruction updates ar0, GCC's
 * extended-asm rules call for an in-out operand - confirm.
 */
112static inline __attribute__((always_inline))
void
113storeCToMemory(
float* pc,
int ldc,
const int fpu,
int* dummy_to_link )
/* "rep vlen" presumably takes its repeat count from a vlen register that is
 * set up elsewhere - verify against the caller. */
116 "fpu %4 rep vlen [ar0++gr0] = vreg7;\n\t"
117 :
/* %0: tells the compiler the memory behind pc is written by the asm */
"=m"(*(float (*)[])pc)
118 :
/* %1: destination address, %2: address step, %3: ordering token,
 * %4: FPU index */
"RA0"(pc),
"RG0"(ldc),
"m"(*dummy_to_link),
"i"(fpu) );
124nmblas_sgemm(
const enum nm_trans TransA,
125 const enum nm_trans TransB,
140 if( TransA != nm_n || TransB != nm_n ){}
145 int beta0 = beta ==0.0f;
146 int alpha1 = alpha==1.0f;
149 if ( !alpha1 && alpha !=0.0f )
161 for(i=0; i<I; i+=32){
165 :
"g"( I-i-1 >= 31 ? 31 : I-i-1 ) );
173 float bufScalar[2] __attribute__ ((aligned (8)));
175 asm(
"":
"=m"(*dummy_to_link),
"=a"(dummy_to_link));
179 loadCFromMemory( C + i *ldc +j, ldc, 0, dummy_to_link );
180 loadCFromMemory( C + i *ldc +j, ldc, 1, dummy_to_link );
181 loadCFromMemory( C + i *ldc +j, ldc, 2, dummy_to_link );
182 loadCFromMemory( C + i *ldc +j, ldc, 3, dummy_to_link );
187 float* pbeta= bufScalar;
189 "fpu 0 rep 1 vreg4 = [%1++];\n\t"
190 "fpu 1 vreg4 = fpu 0 vreg4;\n\t"
191 "fpu 2 vreg4 = fpu 1 vreg4;\n\t"
192 "fpu 3 vreg4 = fpu 2 vreg4;\n\t"
193 ALL_FPU (
".float vreg7= vreg7 * .retrieve (vreg4);")
194 :
"+m" (*dummy_to_link),
"+a" (pbeta)
201 pb1 = B +(k+1)*ldb +j;
203 asm(
"":
"=m"(*dummy_to_link));
204 loadAFromMemory ( pa, lda, 0, dummy_to_link );
206 loadBAndMultiply( pb, pb1, ldb, 0, dummy_to_link );
216 pb1 = B +(k+1)*ldb +j;
218 loadAFromMemory( pa, lda, 3, dummy_to_link );
220 loadBAndMAdd( pb, pb1, ldb, 3, dummy_to_link );
227 pb1 = B +(k+1)*ldb +j;
229 loadAFromMemory( pa, lda, 0, dummy_to_link );
231 loadBAndMAdd( pb, pb1, ldb, 0, dummy_to_link );
238 float* palpha= bufScalar;
240 "fpu 0 rep 1 vreg4 = [%1++];\n\t"
241 "fpu 1 vreg4 = fpu 0 vreg4;\n\t"
242 "fpu 2 vreg4 = fpu 1 vreg4;\n\t"
243 "fpu 3 vreg4 = fpu 2 vreg4;\n\t"
244 ALL_FPU (
".float vreg7= vreg7 * .retrieve (vreg4);")
245 :
"+m" (*dummy_to_link),
"+a" (palpha)
250 storeCToMemory( C + i *ldc +j+0, ldc, 0, dummy_to_link );
253 storeCToMemory( C + i *ldc +j+2, ldc, 1, dummy_to_link );
256 storeCToMemory( C + i *ldc +j+4, ldc, 2, dummy_to_link );
259 storeCToMemory( C + i *ldc +j+6, ldc, 3, dummy_to_link );