/* bvhip.hip.cxx */
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: HIP-related code common to several BV impls
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepccupmblas.h>
17: #define BLOCKSIZE 64
19: /*
20: C := alpha*A*B + beta*C
21: */
22: PetscErrorCode BVMult_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar beta,PetscScalar *d_C,PetscInt ldc_)
23: {
24: PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
25: hipblasHandle_t hipblashandle;
27: PetscFunctionBegin;
28: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
29: PetscCall(PetscHipBLASIntCast(m_,&m));
30: PetscCall(PetscHipBLASIntCast(n_,&n));
31: PetscCall(PetscHipBLASIntCast(k_,&k));
32: PetscCall(PetscHipBLASIntCast(lda_,&lda));
33: PetscCall(PetscHipBLASIntCast(ldb_,&ldb));
34: PetscCall(PetscHipBLASIntCast(ldc_,&ldc));
35: PetscCall(PetscLogGpuTimeBegin());
36: PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,m,n,k,&alpha,d_A,lda,d_B,ldb,&beta,d_C,ldc));
37: PetscCall(PetscLogGpuTimeEnd());
38: PetscCall(PetscLogGpuFlops(2.0*m*n*k));
39: PetscFunctionReturn(PETSC_SUCCESS);
40: }
42: /*
43: y := alpha*A*x + beta*y
44: */
45: PetscErrorCode BVMultVec_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar beta,PetscScalar *d_y)
46: {
47: PetscHipBLASInt n=0,k=0,lda=0,one=1;
48: hipblasHandle_t hipblashandle;
50: PetscFunctionBegin;
51: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
52: PetscCall(PetscHipBLASIntCast(n_,&n));
53: PetscCall(PetscHipBLASIntCast(k_,&k));
54: PetscCall(PetscHipBLASIntCast(lda_,&lda));
55: PetscCall(PetscLogGpuTimeBegin());
56: PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,d_x,one,&beta,d_y,one));
57: PetscCall(PetscLogGpuTimeEnd());
58: PetscCall(PetscLogGpuFlops(2.0*n*k));
59: PetscFunctionReturn(PETSC_SUCCESS);
60: }
/*
   A(:,s:e-1) := A*B(:,s:e-1)

   In-place update of a slice of columns of A. Since GEMM output may not
   alias its input, the product is formed in a temporary GPU workspace and
   copied back. If btrans=PETSC_TRUE the (conjugate-)transpose of B is
   applied, so the slice is taken from rows of B instead of columns.
*/
PetscErrorCode BVMultInPlace_BLAS_HIP(BV,PetscInt m_,PetscInt k_,PetscInt s,PetscInt e,PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscBool btrans)
{
  const PetscScalar *d_B1;
  PetscScalar *d_work,sone=1.0,szero=0.0;
  PetscHipBLASInt m=0,n=0,k=0,l=0,lda=0,ldb=0,bs=BLOCKSIZE;
  size_t freemem,totmem;
  hipblasHandle_t hipblashandle;
  hipblasOperation_t bt;

  PetscFunctionBegin;
  PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
  PetscCall(PetscHipBLASIntCast(m_,&m));
  PetscCall(PetscHipBLASIntCast(e-s,&n));  /* n = number of columns being updated */
  PetscCall(PetscHipBLASIntCast(k_,&k));
  PetscCall(PetscHipBLASIntCast(lda_,&lda));
  PetscCall(PetscHipBLASIntCast(ldb_,&ldb));
  PetscCall(PetscLogGpuTimeBegin());
  if (PetscUnlikely(btrans)) {
    d_B1 = d_B+s;      /* transposed case: slice starts at row s of B */
    bt = HIPBLAS_OP_C;
  } else {
    d_B1 = d_B+s*ldb;  /* normal case: slice starts at column s of B */
    bt = HIPBLAS_OP_N;
  }
  /* try to allocate the whole matrix */
  PetscCallHIP(hipMemGetInfo(&freemem,&totmem));
  if (freemem>=lda*n*sizeof(PetscScalar)) {
    /* enough free device memory: single GEMM into the workspace, then copy
       the m x n result back into columns s..e-1 of A */
    PetscCallHIP(hipMalloc((void**)&d_work,lda*n*sizeof(PetscScalar)));
    PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,m,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,lda));
    PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,lda*sizeof(PetscScalar),m*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice));
  } else {
    /* low memory: process A in chunks of bs rows at a time.
       NOTE(review): bs is derived as freemem/(m*sizeof(PetscScalar)) while the
       workspace holds bs*n scalars -- confirm the divisor (m vs n) is intended */
    PetscCall(PetscHipBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallHIP(hipMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscHipBLASIntCast(m % bs,&l));  /* handle the remainder chunk first */
    if (l) {
      PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,l,n,k,&sone,d_A,lda,d_B1,ldb,&szero,d_work,l));
      PetscCallHIP(hipMemcpy2D(d_A+s*lda,lda*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice));
    }
    /* remaining full chunks of bs rows */
    for (;l<m;l+=bs) {
      PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_N,bt,bs,n,k,&sone,d_A+l,lda,d_B1,ldb,&szero,d_work,bs));
      PetscCallHIP(hipMemcpy2D(d_A+l+s*lda,lda*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,hipMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCallHIP(hipFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
114: /*
115: B := alpha*A + beta*B
116: */
117: PetscErrorCode BVAXPY_BLAS_HIP(BV,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscInt lda_,PetscScalar beta,PetscScalar *d_B,PetscInt ldb_)
118: {
119: PetscHipBLASInt n=0,k=0,lda=0,ldb=0;
120: hipblasHandle_t hipblashandle;
122: PetscFunctionBegin;
123: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
124: PetscCall(PetscHipBLASIntCast(n_,&n));
125: PetscCall(PetscHipBLASIntCast(k_,&k));
126: PetscCall(PetscHipBLASIntCast(lda_,&lda));
127: PetscCall(PetscHipBLASIntCast(ldb_,&ldb));
128: PetscCall(PetscLogGpuTimeBegin());
129: PetscCallHIPBLAS(hipblasXgeam(hipblashandle,HIPBLAS_OP_N,HIPBLAS_OP_N,n,k,&alpha,d_A,lda,&beta,d_B,ldb,d_B,ldb));
130: PetscCall(PetscLogGpuTimeEnd());
131: PetscCall(PetscLogGpuFlops((beta==(PetscScalar)1.0)?2.0*n*k:3.0*n*k));
132: PetscFunctionReturn(PETSC_SUCCESS);
133: }
/*
   C := A'*B

   C is a CPU array: the conjugate-transpose GEMM runs on the GPU and the
   m x n result is downloaded to the host. With mpi=PETSC_TRUE the local
   products of all processes are summed with an Allreduce over the BV's
   communicator.
*/
PetscErrorCode BVDot_BLAS_HIP(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_B,PetscInt ldb_,PetscScalar *C,PetscInt ldc_,PetscBool mpi)
{
  PetscScalar *d_work,sone=1.0,szero=0.0,*CC;
  PetscInt j;
  PetscHipBLASInt m=0,n=0,k=0,lda=0,ldb=0,ldc=0;
  PetscMPIInt len;
  hipblasHandle_t hipblashandle;

  PetscFunctionBegin;
  PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
  PetscCall(PetscHipBLASIntCast(m_,&m));
  PetscCall(PetscHipBLASIntCast(n_,&n));
  PetscCall(PetscHipBLASIntCast(k_,&k));
  PetscCall(PetscHipBLASIntCast(lda_,&lda));
  PetscCall(PetscHipBLASIntCast(ldb_,&ldb));
  PetscCall(PetscHipBLASIntCast(ldc_,&ldc));
  /* device workspace holding the packed m x n product */
  PetscCallHIP(hipMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  if (mpi) {
    if (ldc==m) {
      /* C is contiguous: reduce the host copy straight into C */
      PetscCall(BVAllocateWork_Private(bv,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,ldc));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));  /* empty local part contributes zeros */
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCallMPI(MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    } else {
      /* ldc != m: reduce into a packed scratch CC, then scatter its columns into C */
      PetscCall(BVAllocateWork_Private(bv,2*m*n));
      CC = bv->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(bv->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCallMPI(MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,CC+j*m,m));
    }
  } else {
    /* sequential: no reduction; download and unpack column by column */
    if (k) {
      PetscCall(BVAllocateWork_Private(bv,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallHIPBLAS(hipblasXgemm(hipblashandle,HIPBLAS_OP_C,HIPBLAS_OP_N,m,n,k,&sone,d_A,lda,d_B,ldb,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallHIP(hipMemcpy(bv->work,d_work,m*n*sizeof(PetscScalar),hipMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldc,bv->work+j*m,m));
    }
  }
  PetscCallHIP(hipFree(d_work));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   y := A'*x

   y is a CPU array; if NULL, bv->buffer is used as a workspace and the
   result is left on the GPU inside that buffer. With mpi=PETSC_TRUE the
   local products are combined with an Allreduce.
*/
PetscErrorCode BVDotVec_BLAS_HIP(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *d_A,PetscInt lda_,const PetscScalar *d_x,PetscScalar *y,PetscBool mpi)
{
  PetscScalar *d_work,szero=0.0,sone=1.0,*yy;
  PetscHipBLASInt n=0,k=0,lda=0,one=1;
  PetscMPIInt len;
  hipblasHandle_t hipblashandle;

  PetscFunctionBegin;
  PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
  PetscCall(PetscHipBLASIntCast(n_,&n));
  PetscCall(PetscHipBLASIntCast(k_,&k));
  PetscCall(PetscHipBLASIntCast(lda_,&lda));
  /* device workspace for the k local dot products: either the buffer vec or a fresh allocation */
  if (!y) PetscCall(VecHIPGetArrayWrite(bv->buffer,&d_work));
  else PetscCallHIP(hipMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  if (mpi) {
    PetscCall(BVAllocateWork_Private(bv,k));
    if (n) {
      /* local part: d_work := A'*x, then copy to host scratch bv->work */
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(bv->work,k));  /* empty local part contributes zeros */
    /* reduction */
    PetscCall(PetscMPIIntCast(k,&len));
    if (!y) {
      if (use_gpu_aware_mpi) { /* case 1: reduce on GPU using a temporary buffer */
        PetscCallHIP(hipMalloc((void**)&yy,k*sizeof(PetscScalar)));
        PetscCallMPI(MPIU_Allreduce(d_work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
        PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyDeviceToDevice));
        PetscCallHIP(hipFree(yy));
      } else { /* case 2: reduce on CPU, copy result back to GPU */
        PetscCall(BVAllocateWork_Private(bv,2*k));
        yy = bv->work+k;
        PetscCallHIP(hipMemcpy(bv->work,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
        PetscCallMPI(MPIU_Allreduce(bv->work,yy,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
        PetscCallHIP(hipMemcpy(d_work,yy,k*sizeof(PetscScalar),hipMemcpyHostToDevice));
        PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
      }
      PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work));
    } else { /* case 3: user-provided array y, reduce on CPU */
      PetscCallHIP(hipFree(d_work));  /* host copy already saved in bv->work above */
      PetscCallMPI(MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
    }
  } else {
    /* sequential: result stays in bv->buffer (y==NULL) or is downloaded to y.
       NOTE(review): when n==0 and y!=NULL the workspace is copied to y without
       being zeroed first -- confirm callers guarantee n>0 in that combination */
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallHIPBLAS(hipblasXgemv(hipblashandle,HIPBLAS_OP_C,n,k,&sone,d_A,lda,d_x,one,&szero,d_work,one));
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!y) PetscCall(VecHIPRestoreArrayWrite(bv->buffer,&d_work));
    else {
      PetscCallHIP(hipMemcpy(y,d_work,k*sizeof(PetscScalar),hipMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallHIP(hipFree(d_work));
    }
  }
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
266: /*
267: Scale n scalars
268: */
269: PetscErrorCode BVScale_BLAS_HIP(BV,PetscInt n_,PetscScalar *d_A,PetscScalar alpha)
270: {
271: PetscHipBLASInt n=0,one=1;
272: hipblasHandle_t hipblashandle;
274: PetscFunctionBegin;
275: PetscCall(PetscHipBLASIntCast(n_,&n));
276: if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscCallHIP(hipMemset(d_A,0,n*sizeof(PetscScalar)));
277: else if (alpha != (PetscScalar)1.0) {
278: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
279: PetscCall(PetscLogGpuTimeBegin());
280: PetscCallHIPBLAS(hipblasXscal(hipblashandle,n,&alpha,d_A,one));
281: PetscCall(PetscLogGpuTimeEnd());
282: PetscCall(PetscLogGpuFlops(1.0*n));
283: }
284: PetscFunctionReturn(PETSC_SUCCESS);
285: }
287: /*
288: Compute 2-norm of vector consisting of n scalars
289: */
290: PetscErrorCode BVNorm_BLAS_HIP(BV,PetscInt n_,const PetscScalar *d_A,PetscReal *nrm)
291: {
292: PetscHipBLASInt n=0,one=1;
293: hipblasHandle_t hipblashandle;
295: PetscFunctionBegin;
296: PetscCall(PetscHipBLASIntCast(n_,&n));
297: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
298: PetscCall(PetscLogGpuTimeBegin());
299: PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,n,d_A,one,nrm));
300: PetscCall(PetscLogGpuTimeEnd());
301: PetscCall(PetscLogGpuFlops(2.0*n));
302: PetscFunctionReturn(PETSC_SUCCESS);
303: }
/*
   Normalize the columns of A

   With real scalars, a nonzero eigi[j] marks a complex conjugate eigenvalue
   pair: columns j and j+1 hold the real and imaginary parts of one
   eigenvector and are normalized jointly by their combined 2-norm.
*/
PetscErrorCode BVNormalize_BLAS_HIP(BV,PetscInt m_,PetscInt n_,PetscScalar *d_A,PetscInt lda_,PetscScalar *eigi)
{
  PetscInt j,k;
  PetscReal nrm,nrm1;
  PetscScalar alpha;
  PetscHipBLASInt m=0,one=1;
  hipblasHandle_t hipblashandle;

  PetscFunctionBegin;
  PetscCall(PetscHipBLASIntCast(m_,&m));
  PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
  PetscCall(PetscLogGpuTimeBegin());
  for (j=0;j<n_;j++) {
    k = 1;  /* columns consumed in this iteration (2 for a conjugate pair) */
    if (!PetscDefined(USE_COMPLEX) && eigi && eigi[j] != 0.0) k = 2;
    PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+j*lda_,one,&nrm));
    if (k==2) {
      /* combine the norms of both columns of the pair */
      PetscCallHIPBLAS(hipblasXnrm2(hipblashandle,m,d_A+(j+1)*lda_,one,&nrm1));
      nrm = SlepcAbs(nrm,nrm1);
    }
    /* NOTE(review): a zero column gives alpha = 1/0 -- confirm callers never pass one */
    alpha = 1.0/nrm;
    PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+j*lda_,one));
    if (k==2) {
      PetscCallHIPBLAS(hipblasXscal(hipblashandle,m,&alpha,d_A+(j+1)*lda_,one));
      j++;  /* skip the second column of the pair */
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(3.0*m*n_));
  PetscFunctionReturn(PETSC_SUCCESS);
}
340: /*
341: BV_CleanCoefficients_HIP - Sets to zero all entries of column j of the bv buffer
342: */
343: PetscErrorCode BV_CleanCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h)
344: {
345: PetscScalar *d_hh,*d_a;
346: PetscInt i;
348: PetscFunctionBegin;
349: if (!h) {
350: PetscCall(VecHIPGetArray(bv->buffer,&d_a));
351: PetscCall(PetscLogGpuTimeBegin());
352: d_hh = d_a + j*(bv->nc+bv->m);
353: PetscCallHIP(hipMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
354: PetscCall(PetscLogGpuTimeEnd());
355: PetscCall(VecHIPRestoreArray(bv->buffer,&d_a));
356: } else { /* cpu memory */
357: for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
358: }
359: PetscFunctionReturn(PETSC_SUCCESS);
360: }
362: /*
363: BV_AddCoefficients_HIP - Add the contents of the scratch (0-th column) of the bv buffer
364: into column j of the bv buffer
365: */
366: PetscErrorCode BV_AddCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
367: {
368: PetscScalar *d_h,*d_c,sone=1.0;
369: PetscInt i;
370: PetscHipBLASInt idx=0,one=1;
371: hipblasHandle_t hipblashandle;
373: PetscFunctionBegin;
374: if (!h) {
375: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
376: PetscCall(VecHIPGetArray(bv->buffer,&d_c));
377: d_h = d_c + j*(bv->nc+bv->m);
378: PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx));
379: PetscCall(PetscLogGpuTimeBegin());
380: PetscCallHIPBLAS(hipblasXaxpy(hipblashandle,idx,&sone,d_c,one,d_h,one));
381: PetscCall(PetscLogGpuTimeEnd());
382: PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
383: PetscCall(VecHIPRestoreArray(bv->buffer,&d_c));
384: } else { /* cpu memory */
385: for (i=0;i<bv->nc+j;i++) h[i] += c[i];
386: PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
387: }
388: PetscFunctionReturn(PETSC_SUCCESS);
389: }
391: /*
392: BV_SetValue_HIP - Sets value in row j (counted after the constraints) of column k
393: of the coefficients array
394: */
395: PetscErrorCode BV_SetValue_HIP(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
396: {
397: PetscScalar *d_h,*a;
399: PetscFunctionBegin;
400: if (!h) {
401: PetscCall(VecHIPGetArray(bv->buffer,&a));
402: PetscCall(PetscLogGpuTimeBegin());
403: d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
404: PetscCallHIP(hipMemcpy(d_h,&value,sizeof(PetscScalar),hipMemcpyHostToDevice));
405: PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
406: PetscCall(PetscLogGpuTimeEnd());
407: PetscCall(VecHIPRestoreArray(bv->buffer,&a));
408: } else { /* cpu memory */
409: h[bv->nc+j] = value;
410: }
411: PetscFunctionReturn(PETSC_SUCCESS);
412: }
414: /*
415: BV_SquareSum_HIP - Returns the value h'*h, where h represents the contents of the
416: coefficients array (up to position j)
417: */
418: PetscErrorCode BV_SquareSum_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
419: {
420: const PetscScalar *d_h;
421: PetscScalar dot;
422: PetscInt i;
423: PetscHipBLASInt idx=0,one=1;
424: hipblasHandle_t hipblashandle;
426: PetscFunctionBegin;
427: if (!h) {
428: PetscCall(PetscHIPBLASGetHandle(&hipblashandle));
429: PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h));
430: PetscCall(PetscHipBLASIntCast(bv->nc+j,&idx));
431: PetscCall(PetscLogGpuTimeBegin());
432: PetscCallHIPBLAS(hipblasXdot(hipblashandle,idx,d_h,one,d_h,one,&dot));
433: PetscCall(PetscLogGpuTimeEnd());
434: PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
435: *sum = PetscRealPart(dot);
436: PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h));
437: } else { /* cpu memory */
438: *sum = 0.0;
439: for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
440: PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
441: }
442: PetscFunctionReturn(PETSC_SUCCESS);
443: }
445: /* pointwise multiplication */
446: static __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
447: {
448: PetscInt x;
450: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
451: if (x<n) a[x] *= PetscRealPart(b[x]);
452: }
454: /* pointwise division */
455: static __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
456: {
457: PetscInt x;
459: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
460: if (x<n) a[x] /= PetscRealPart(b[x]);
461: }
/*
   BV_ApplySignature_HIP - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_HIP(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscScalar *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt i,xcount;
  dim3 blocks3d, threads3d;

  PetscFunctionBegin;
  if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);  /* nothing to scale */
  if (!h) {  /* coefficients live in the device buffer: use the HIP kernels */
    PetscCall(VecHIPGetArray(bv->buffer,&d_h));
    PetscCall(VecHIPGetArrayRead(bv->omega,&d_omega));
    /* xcount launches cover nc+j entries; each kernel receives its launch ordinal
       as an offset (see the kernels' index computation) */
    PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
    PetscCall(PetscLogGpuTimeBegin());
    if (inverse) {
      for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j);
    } else {
      for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d,0,0>>>(i,d_h,d_omega,bv->nc+j);
    }
    PetscCallHIP(hipGetLastError());  /* surface any launch failure */
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
    PetscCall(VecHIPRestoreArrayRead(bv->omega,&d_omega));
    PetscCall(VecHIPRestoreArray(bv->buffer,&d_h));
  } else {  /* host array: apply the signature entrywise on the CPU */
    PetscCall(VecGetArrayRead(bv->omega,&omega));
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    PetscCall(VecRestoreArrayRead(bv->omega,&omega));
    PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
502: /*
503: BV_SquareRoot_HIP - Returns the square root of position j (counted after the constraints)
504: of the coefficients array
505: */
506: PetscErrorCode BV_SquareRoot_HIP(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
507: {
508: const PetscScalar *d_h;
509: PetscScalar hh;
511: PetscFunctionBegin;
512: if (!h) {
513: PetscCall(VecHIPGetArrayRead(bv->buffer,&d_h));
514: PetscCall(PetscLogGpuTimeBegin());
515: PetscCallHIP(hipMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),hipMemcpyDeviceToHost));
516: PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
517: PetscCall(PetscLogGpuTimeEnd());
518: PetscCall(BV_SafeSqrt(bv,hh,beta));
519: PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_h));
520: } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
521: PetscFunctionReturn(PETSC_SUCCESS);
522: }
524: /*
525: BV_StoreCoefficients_HIP - Copy the contents of the coefficients array to an array dest
526: provided by the caller (only values from l to j are copied)
527: */
528: PetscErrorCode BV_StoreCoefficients_HIP(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
529: {
530: const PetscScalar *d_h,*d_a;
531: PetscInt i;
533: PetscFunctionBegin;
534: if (!h) {
535: PetscCall(VecHIPGetArrayRead(bv->buffer,&d_a));
536: PetscCall(PetscLogGpuTimeBegin());
537: d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
538: PetscCallHIP(hipMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),hipMemcpyDeviceToHost));
539: PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
540: PetscCall(PetscLogGpuTimeEnd());
541: PetscCall(VecHIPRestoreArrayRead(bv->buffer,&d_a));
542: } else {
543: for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
544: }
545: PetscFunctionReturn(PETSC_SUCCESS);
546: }