TABLE OF CONTENTS
- ABINIT/m_chebfiwf
- m_chebfiwf/chebfiwf2
- m_chebfiwf/getBm1X
- m_chebfiwf/getghc_gsc1
- m_chebfiwf/precond1
ABINIT/m_chebfiwf [ Functions ]
NAME
m_chebfiwf
FUNCTION
This module contains a routine that updates the whole set of wave functions at a given k-point and spin-polarization, using the Chebyshev filtering method (2021 implementation based on the xG abstraction layer), for a fixed Hamiltonian. It may also simply compute eigenvectors and eigenvalues at this k-point, and it updates the matrix elements of the Hamiltonian.
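As a reminder of the method (textbook form only, not a statement about the internal choices made by m_chebfi), a degree-n Chebyshev filter damps the part of the spectrum of the Hamiltonian H lying in an interval [a,b] that contains the unwanted (highest) eigenvalues, while amplifying the components below a:

  p_n(H) = T_n( (2H - (a+b)I) / (b-a) ),   with  T_0(x) = 1,  T_1(x) = x,  T_{k+1}(x) = 2x T_k(x) - T_{k-1}(x).

Applying p_n(H) (or S^-1 H in the PAW generalized case) to the current block of wave functions, followed by a Rayleigh-Ritz step, yields the updated eigenvectors; the filter degree is related to the input nline.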
COPYRIGHT
Copyright (C) 2018-2024 ABINIT group (BS). This file is distributed under the terms of the GNU General Public License; see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt. For the initials of contributors, see ~abinit/doc/developers/contributors.txt.
SOURCE
#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"

! nvtx related macro definition
#include "nvtx_macros.h"

module m_chebfiwf

 use defs_abitypes
 use defs_basis
 use m_abicore
 use m_errors
 use m_fstrings
 use m_time

 use m_chebfi
 use m_chebfi2
 use m_invovl

 use m_cgtools, only : dotprod_g
 use m_dtset, only : dataset_type

 use m_hamiltonian, only : gs_hamiltonian_type
 use m_pawcprj, only : pawcprj_type
 use m_nonlop, only : nonlop
 use m_prep_kgb, only : prep_getghc, prep_nonlop
 use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free
 use m_getghc, only : multithreaded_getghc
 use m_gemm_nonlop, only : gemm_nonlop_use_gemm

 use m_xg
 use m_xgTransposer

 !FIXME Keep those in these modules or move them together?
 use m_invovl, only : invovl_ompgpu_static_mem,invovl_ompgpu_work_mem
 use m_gemm_nonlop_ompgpu, only : gemm_nonlop_ompgpu_static_mem
 use m_getghc_ompgpu, only : getghc_ompgpu_work_mem

#if defined(HAVE_GPU)
 use m_gpu_toolbox
#endif

#if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
 use m_nvtx_data
#endif

#if defined(HAVE_YAKL)
 use gator_mod
#endif

 use, intrinsic :: iso_c_binding, only: c_associated,c_loc,c_ptr,c_f_pointer,c_double,c_size_t

 use m_xmpi
 use m_xomp
#ifdef HAVE_OPENMP
 use omp_lib
#endif

 implicit none

 private

 integer, parameter :: l_tim_getghc=7
 real(dp), parameter :: inv_sqrt2 = 1/sqrt2

 ! For use in getghc_gsc1
 integer, save :: l_cpopt
 integer, save :: l_icplx
 integer, save :: l_istwf
 integer, save :: l_npw
 integer, save :: l_nband_filter
 integer, save :: l_nspinor
 logical, save :: l_paw
 integer, save :: l_prtvol
 integer, save :: l_sij_opt
 integer, save :: l_paral_kgb
 integer, save :: l_useria
 integer, save :: l_block_sliced

#if defined HAVE_GPU && defined HAVE_YAKL
 real(kind=c_double), ABI_CONTIGUOUS pointer, save :: l_pcon(:)
#else
 real(dp), allocatable, save :: l_pcon(:)
#endif

 type(mpi_type),pointer,save :: l_mpi_enreg
 type(gs_hamiltonian_type),pointer,save :: l_gs_hamk

 integer, parameter :: DEBUG_ROWS = 5
 integer, parameter :: DEBUG_COLUMNS = 5

 public :: chebfiwf2_blocksize
 public :: chebfiwf2

 CONTAINS !========================================================================================
m_chebfiwf/chebfiwf2 [ Functions ]
NAME
chebfiwf2
FUNCTION
This routine updates the whole set of wave functions at a given k-point, using the Chebyshev filtering (Chebfi) method (2021 version based on the xG abstraction layer).
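A minimal call sketch (illustrative only; the argument shapes follow the interface documented below, and all arrays are assumed to be allocated and initialized by the caller):

 ! Illustrative sketch: gs_hamk has been set up for the current k-point/spin,
 ! cg holds the current wave functions and kinpw the plane-wave kinetic energies.
 call chebfiwf2(cg, dtset, eig, enl_out, gs_hamk, kinpw, mpi_enreg, &
&               nband, npw, nspinor, prtvol, resid)
 ! On return: cg is updated in place, eig holds the eigenvalues (Hartree)
 ! and resid the residual of each band.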
INPUTS
dtset= input variables for this dataset
kinpw(npw)= kinetic energy for each plane wave (Hartree)
mpi_enreg= MPI-parallelisation information
nband= number of bands at this k-point
npw= number of plane waves at this k-point
nspinor= number of spinorial components of the wavefunctions
prtvol= control print volume and debugging
OUTPUT
eig(nband)= eigenvalues (Hartree) for all bands
enl_out(nband)= contribution of each band to the nonlocal part of the energy
resid(nband)= residuals for each band
SIDE EFFECTS
cg(2,npw*nspinor*nband)= planewave coefficients of wavefunctions
gs_hamk <type(gs_hamiltonian_type)>= all data for the Hamiltonian at k
SOURCE
subroutine chebfiwf2(cg,dtset,eig,enl_out,gs_hamk,kinpw,mpi_enreg,&
&                    nband,npw,nspinor,prtvol,resid)

 implicit none

! Arguments ------------------------------------
 integer,intent(in) :: nband,npw,prtvol,nspinor
 type(mpi_type),target,intent(in) :: mpi_enreg
 real(dp),target,intent(inout) :: cg(2,npw*nspinor*nband)
 real(dp),intent(in) :: kinpw(npw)
 real(dp),target,intent(out) :: resid(nband)
 real(dp),intent(out) :: enl_out(nband)
 real(dp),target,intent(out) :: eig(nband)
 type(dataset_type),intent(in) :: dtset
 type(gs_hamiltonian_type),target,intent(inout) :: gs_hamk

! Local variables-------------------------------
! scalars
 integer, parameter :: tim_chebfiwf2 = 1750
 integer :: ipw,space,blockdim,nline,total_spacedim,ierr
 real(dp) :: localmem
 type(c_ptr) :: cptr
 type(chebfi_t) :: chebfi
 type(xgBlock_t) :: xgx0,xgeigen,xgresidu
! arrays
 real(dp) :: tsec(2),chebfiMem(2)
 real(dp),pointer :: eig_ptr(:,:) => NULL()
 real(dp),pointer :: resid_ptr(:,:) => NULL()
 real(dp), allocatable :: l_gvnlxc(:,:)

! Stupid things for NC
 integer,parameter :: choice=1, paw_opt=0, signs=1
 real(dp) :: gsc_dummy(1,1)
 type(pawcprj_type) :: cprj_dum(gs_hamk%natom,1)

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 integer(kind=c_size_t) :: l_pcon_size_bytes
#endif

! *********************************************************************

!################ INITIALIZATION #####################################
!######################################################################

 call timab(tim_chebfiwf2,1,tsec)

!Set module variables
 l_paw = (gs_hamk%usepaw==1)
 l_cpopt=-1;l_sij_opt=0;if (l_paw) l_sij_opt=1
 l_istwf=gs_hamk%istwf_k
 l_npw = npw
 l_nspinor = nspinor
 l_prtvol = prtvol
 l_mpi_enreg => mpi_enreg
 l_gs_hamk => gs_hamk
 l_nband_filter = nband
 l_paral_kgb = dtset%paral_kgb
 l_block_sliced = dtset%invovl_blksliced

!Variables
 nline=dtset%nline
 blockdim=l_mpi_enreg%nproc_band*l_mpi_enreg%bandpp
!for debug
 l_useria=dtset%useria

!Depends on istwfk
 if ( l_istwf == 2 ) then ! Real only
   ! SPACE_CR means that we have complex numbers but no re*im terms, only re*re
   ! and im*im, so that a vector of complex is considered as a long vector of reals;
   ! therefore the number of data is (2*npw*nspinor)*nband
   ! This space is completely equivalent to SPACE_R but will correctly set and
   ! get the array data into the xgBlock
   space = SPACE_CR
   l_icplx = 2
 else ! complex
   space = SPACE_C
   l_icplx = 1
 end if

!Memory info
 if ( prtvol >= 3 ) then
   if (l_mpi_enreg%paral_kgb == 1) then
     total_spacedim = l_icplx*l_npw*l_nspinor
     call xmpi_sum(total_spacedim,l_mpi_enreg%comm_bandspinorfft,ierr)
   else
     total_spacedim = 0
   end if
   chebfiMem = chebfi_memInfo(nband,l_icplx*l_npw*l_nspinor,space,l_mpi_enreg%paral_kgb, &
&                             total_spacedim,l_mpi_enreg%bandpp) !blockdim
   localMem = (l_npw+2*l_npw*l_nspinor+2*nband)*kind(1.d0) !blockdim
   write(std_out,'(1x,A,F10.6,1x,A)') "Each MPI process calling chebfi should need around ", &
     (localMem+sum(chebfiMem))/1e9,"GB of peak memory as follows :"
   write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in chebfiwf : ",(localMem)/1e9,"GB"
   write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in m_chebfi : ",(chebfiMem(1))/1e9,"GB"
   write(std_out,'(4x,A,F10.6,1x,A)') "Temporary memory in m_chebfi : ",(chebfiMem(2))/1e9,"GB"
 end if

!For preconditioning
 if(dtset%gpu_option==ABI_GPU_KOKKOS) then
#if defined HAVE_GPU && defined HAVE_YAKL
   ABI_MALLOC_MANAGED(l_pcon, (/l_icplx*npw/))
#endif
 else
   ABI_MALLOC(l_pcon,(1:l_icplx*npw))
 end if

!$omp parallel do schedule(static), shared(l_pcon,kinpw)
 do ipw=0,l_icplx*npw-1
   if(kinpw(ipw/l_icplx+1)>huge(zero)*1.d-11) then
     l_pcon(ipw+1)=0.d0
   else
     l_pcon(ipw+1) = (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1)))) &
&      / (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1))) + 16*kinpw(ipw/l_icplx+1)**4)
   end if
 end do

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 if(l_gs_hamk%gpu_option==ABI_GPU_KOKKOS) then
   ! upload l_pcon to device / gpu
   l_pcon_size_bytes = l_icplx * npw * dp
   call gpu_data_prefetch_async(C_LOC(l_pcon), l_pcon_size_bytes)
 end if
#endif

 call xgBlock_map(xgx0,cg,space,l_icplx*l_npw*l_nspinor,nband,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)

#ifdef HAVE_OPENMP_OFFLOAD
 !$OMP TARGET ENTER DATA MAP(to:cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
#endif

 ABI_NVTX_START_RANGE(NVTX_CHEBFI2_SQRT2)
 if ( l_istwf == 2 ) then ! Real only
   ! Scale cg
   call xgBlock_scale(xgx0,sqrt2,1) !ALL MPI processes do this

   ! This is possible since the memory in cg and xgx0 is the same
   ! Don't know yet how to deal with this with xgBlock
   !MPI HANDLES THIS AUTOMATICALLY (only proc 0 is me_g0)
   if(l_mpi_enreg%me_g0 == 1) then
     if(gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       !$OMP TARGET MAP(to:cg)
       cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2
       !$OMP END TARGET
#endif
     else
       cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2
     end if
   end if
 end if
 ABI_NVTX_END_RANGE()

!Trick with C to change the rank of arrays from (:) to (:,:)
 cptr = c_loc(eig)
 call c_f_pointer(cptr,eig_ptr,(/ nband,1 /))
 call xgBlock_map(xgeigen,eig_ptr,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)
!Trick with C to change the rank of arrays from (:) to (:,:)
 cptr = c_loc(resid)
 call c_f_pointer(cptr,resid_ptr,(/ nband,1 /))
 call xgBlock_map(xgresidu,resid_ptr,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option)

! ABI_MALLOC(l_gvnlxc,(2,l_npw*l_nspinor*l_nband_filter))
 call timab(tim_chebfiwf2,2,tsec)

 ABI_NVTX_START_RANGE(NVTX_CHEBFI2_INIT)
 call chebfi_init(chebfi,nband,l_icplx*l_npw*l_nspinor,dtset%tolwfr_diago,dtset%ecut, &
&                 dtset%paral_kgb,l_mpi_enreg%bandpp, &
&                 nline, space,1,l_gs_hamk%istwf_k, &
&                 l_mpi_enreg%comm_bandspinorfft,l_mpi_enreg%me_g0,l_paw,&
&                 l_mpi_enreg%comm_spinorfft,l_mpi_enreg%comm_band,&
&                 l_gs_hamk%gpu_option,gpu_kokkos_nthrd=dtset%gpu_kokkos_nthrd)
 ABI_NVTX_END_RANGE()

!################ RUUUUUUUN #####################################
!######################################################################

 call chebfi_run(chebfi,xgx0,getghc_gsc1,getBm1X,precond1,xgeigen,xgresidu,nspinor)

!Free preconditioning data since it is not needed anymore
 if(dtset%gpu_option==ABI_GPU_KOKKOS) then
#if defined HAVE_GPU && defined HAVE_YAKL
   ABI_FREE_MANAGED(l_pcon)
#endif
 else
   ABI_FREE(l_pcon)
 end if

!Compute enlout (nonlocal energy for each band) if necessary. This is the best
! quick and dirty trick to compute this part in NC. gvnlc cannot be part of
! the chebfi algorithm.
 if ( .not. l_paw ) then
   !Check l_gvnlxc size
   !if ( size(l_gvnlxc) < 2*nband*l_npw*l_nspinor ) then
   !if ( size(l_gvnlxc) /= 0 ) then
   !  ABI_FREE(l_gvnlxc)
#ifdef FC_CRAY
   ABI_MALLOC(l_gvnlxc,(1,1))
#else
   ABI_MALLOC(l_gvnlxc,(0,0))
#endif
   !end if

   ABI_NVTX_START_RANGE(NVTX_CHEBFI2_NONLOP)
   !Call nonlop
   call nonlop(choice,l_cpopt,cprj_dum,enl_out,l_gs_hamk,0,eig,mpi_enreg,nband,1,paw_opt,&
&              signs,gsc_dummy,l_tim_getghc,cg,l_gvnlxc)
   ABI_NVTX_END_RANGE()
   ABI_FREE(l_gvnlxc)
 end if

!Free chebfi
 call chebfi_free(chebfi)

#ifdef HAVE_OPENMP_OFFLOAD
 !$OMP TARGET UPDATE FROM(cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
 !$OMP TARGET EXIT DATA MAP(delete:cg,eig,resid) IF(gs_hamk%gpu_option==ABI_GPU_OPENMP)
#endif
!################ SORRY IT'S ALREADY FINISHED : ) #################
!######################################################################

 call timab(tim_chebfiwf2,2,tsec)

 DBG_EXIT("COLL")

end subroutine chebfiwf2
m_chebfiwf/getBm1X [ Functions ]
NAME
getBm1X
FUNCTION
This routine computes S^-1|C> for a given wave function C. It acts as a driver for apply_invovl.
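In PAW the eigenproblem is generalized, H|C> = epsilon S|C>, so the Chebyshev filter has to work with S^-1 H rather than with H alone; apply_invovl provides the application of the inverse overlap S^-1. In the norm-conserving case S is the identity and Bm1X is simply a copy of X.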
SIDE EFFECTS
X    <type(xgBlock_t)>= memory block containing |C>
Bm1X <type(xgBlock_t)>= memory block containing S^-1|C>
transposer <type(xgTransposer_t)>= data used for array transpositions
SOURCE
subroutine getBm1X(X,Bm1X,transposer)

 implicit none

!Arguments ------------------------------------
 type(xgBlock_t), intent(inout) :: X
 type(xgBlock_t), intent(inout) :: Bm1X
 type(xgTransposer_t), optional, intent(inout) :: transposer

!Local variables-------------------------------
!scalars
 integer :: blockdim
 integer :: spacedim
 integer :: cpuRow
!arrays
 real(dp), pointer :: ghc_filter(:,:)
 real(dp), pointer :: gsm1hc_filter(:,:)
 type(pawcprj_type), allocatable :: cwaveprj_next(:,:) !dummy

! *********************************************************************

 call xgBlock_getSize(X,spacedim,blockdim)

 spacedim = spacedim/l_icplx

 call xgBlock_reverseMap(X,ghc_filter,l_icplx,spacedim*blockdim)

 call xgBlock_reverseMap(Bm1X,gsm1hc_filter,l_icplx,spacedim*blockdim)

 if (l_paral_kgb == 1) cpuRow = xgTransposer_getRank(transposer, 2)

!scale back cg
 if(l_istwf == 2) then
   call xgBlock_scale(X,inv_sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
     else
       if (cpuRow == 0) then
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
       end if
     end if
   end if

   if(l_paw) then
     call xgBlock_scale(Bm1X,inv_sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) &
&          gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * sqrt2
       else
         if (cpuRow == 0) then
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * sqrt2
         end if
       end if
     end if
   end if
 end if

 if(l_paw) then
   !cwaveprj_next is dummy
   if(gemm_nonlop_use_gemm) then
     ABI_MALLOC(cwaveprj_next, (1,1))
   else
     ABI_MALLOC(cwaveprj_next, (l_gs_hamk%natom,l_nspinor*blockdim))
     call pawcprj_alloc(cwaveprj_next,0,l_gs_hamk%dimcprj)
   end if

   ABI_NVTX_START_RANGE(NVTX_INVOVL)
   call apply_invovl(l_gs_hamk, ghc_filter(:,:), gsm1hc_filter(:,:), cwaveprj_next(:,:), &
     spacedim/l_nspinor, blockdim, l_mpi_enreg, l_nspinor, l_block_sliced)
   ABI_NVTX_END_RANGE()
 else
   gsm1hc_filter(:,:) = ghc_filter(:,:)
 end if

 ABI_NVTX_START_RANGE(NVTX_INVOVL_POST1)
!Scale cg, ghc, gsc
 if ( l_istwf == 2 ) then
   call xgBlock_scale(X,sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(to:ghc_filter)
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         ghc_filter(:, 1:spacedim*blockdim:l_npw) = ghc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       end if
     else
       if (cpuRow == 0) then
         ghc_filter(:, 1:spacedim*blockdim:spacedim) = ghc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if

   if(l_paw) then
     call xgBlock_scale(Bm1X,sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(to:gsm1hc_filter)
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) &
&          gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) = gsm1hc_filter(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       else
         if (cpuRow == 0) then
           gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) = gsm1hc_filter(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         end if
       end if
     end if
   end if
 end if

 if (l_paw) then
   call pawcprj_free(cwaveprj_next)
   ABI_FREE(cwaveprj_next)
 end if

 ABI_NVTX_END_RANGE()

end subroutine getBm1X
m_chebfiwf/getghc_gsc1 [ Functions ]
NAME
getghc_gsc1
FUNCTION
This routine computes H|C> and possibly S|C> for a given wave function C. It acts as a driver for getghc, taking into account parallelism, multithreading, etc.
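The repeated sqrt2 / 1/sqrt2 scalings around the getghc call are tied to the istwf_k==2 storage convention (real wave functions, only half of the G-sphere stored): with the coefficients scaled by sqrt(2) except for the G=0 component, the complex scalar product reduces to a plain real dot product of the stored arrays,

  <C1|C2> = c1*(0) c2(0) + 2 Re sum_{G>0} c1*(G) c2(G),

so the block is scaled back to the native convention before calling getghc and rescaled afterwards.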
SIDE EFFECTS
X  <type(xgBlock_t)>= memory block containing |C>
AX <type(xgBlock_t)>= memory block containing H|C>
BX <type(xgBlock_t)>= memory block containing S|C>
transposer <type(xgTransposer_t)>= data used for array transpositions
SOURCE
subroutine getghc_gsc1(X,AX,BX,transposer)

 implicit none

!Arguments ------------------------------------
 type(xgBlock_t), intent(inout) :: X
 type(xgBlock_t), intent(inout) :: AX
 type(xgBlock_t), intent(inout) :: BX
 type(xgTransposer_t), optional, intent(inout) :: transposer
 integer :: blockdim
 integer :: spacedim
 type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1)

!Local variables-------------------------------
!scalars
 integer :: cpuRow
 real(dp) :: eval
!arrays
 real(dp), pointer :: cg(:,:)
 real(dp), pointer :: ghc(:,:)
 real(dp), pointer :: gsc(:,:)
 real(dp) :: l_gvnlxc(1,1)

! *********************************************************************

 call xgBlock_getSize(X,spacedim,blockdim)

 spacedim = spacedim/l_icplx

 call xgBlock_reverseMap(X,cg,l_icplx,spacedim*blockdim)
 call xgBlock_reverseMap(AX,ghc,l_icplx,spacedim*blockdim)
 call xgBlock_reverseMap(BX,gsc,l_icplx,spacedim*blockdim)

!Scale back cg
 if (l_paral_kgb == 1) cpuRow = xgTransposer_getRank(transposer, 2)
 if(l_istwf == 2) then
   call xgBlock_scale(X,inv_sqrt2,1)
   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2
     else
       if (cpuRow == 0) cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2
     end if
   end if
 end if

 !if ( size(l_gvnlxc) < 2*blockdim*spacedim ) then
 !  ABI_FREE(l_gvnlxc)
 !  ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim))
 !end if

 call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,&
   l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0)

#if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL)
 call gpu_device_synchronize()
#endif

!Scale cg, ghc, gsc
 if ( l_istwf == 2 ) then
   call xgBlock_scale(X ,sqrt2,1)
   call xgBlock_scale(AX,sqrt2,1)

   if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
         !$OMP TARGET MAP(ghc)
         ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         !$OMP END TARGET
       end if
     else
       if (cpuRow == 0) then
         !$OMP TARGET MAP(cg)
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
         !$OMP TARGET MAP(ghc)
         ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         !$OMP END TARGET
       end if
     end if
#endif
   else
     if (l_paral_kgb == 0) then
       if(l_mpi_enreg%me_g0 == 1) then
         cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
         ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       end if
     else
       if (cpuRow == 0) then
         cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
         ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if
   if(l_paw) then
     call xgBlock_scale(BX,sqrt2,1)
     if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_OFFLOAD
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) then
           !$OMP TARGET MAP(gsc)
           gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
           !$OMP END TARGET
         end if
       else
         if (cpuRow == 0) then
           !$OMP TARGET MAP(gsc)
           gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
           !$OMP END TARGET
         end if
       end if
#endif
     else
       if (l_paral_kgb == 0) then
         if(l_mpi_enreg%me_g0 == 1) gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2
       else
         if (cpuRow == 0) gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2
       end if
     end if
   end if ! l_paw
 end if ! l_istwf==2

 if ( .not. l_paw ) call xgBlock_copy(X,BX)

end subroutine getghc_gsc1
m_chebfiwf/precond1 [ Functions ]
NAME
precond1
FUNCTION
This routine applies a preconditioner to a block of wave-function residuals.
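The weights stored in l_pcon (set up in chebfiwf2 above) follow the usual kinetic-energy polynomial preconditioner: for a plane wave with kinetic energy x = kinpw(ipw),

  p(x) = (27 + 18x + 12x^2 + 8x^3) / (27 + 18x + 12x^2 + 8x^3 + 16x^4),

which tends to 1 for small kinetic energies, decays roughly as 1/(2x) for large ones, and is set to zero for plane waves above the huge-kinetic-energy cutoff. Each spinor component of the block is then multiplied column-wise by l_pcon through xgBlock_colwiseMul.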
INPUTS
[gpu_option] = GPU implementation to use, e.g. CUDA, OpenMP, ... (0 = not using GPU)
SIDE EFFECTS
W <type(xgBlock_t)>= memory block
SOURCE
subroutine precond1(W)

 implicit none

! Arguments ------------------------------------
 type(xgBlock_t), intent(inout) :: W

! Local variables-------------------------------
! scalars
 integer :: ispinor

! *********************************************************************

! Precondition resid_vec
 do ispinor = 1,l_nspinor
   call xgBlock_colwiseMul(W, l_pcon, l_icplx*l_npw*(ispinor-1))
 end do

end subroutine precond1