# Comparing MPI, @distributed, and @threads parallelization in Julia
I have recently been experimenting with parallel programming in Julia. So far I know of three approaches: MPI, the @distributed macro, and the @threads macro. Here I use a susceptibility (polarization function) calculation to compare their speed.
{:.info}
First, let me list how the three approaches differ syntactically.
## MPI
```julia
using LinearAlgebra, DelimitedFiles, Printf, MPI, Dates
```
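In the MPI version each rank works on its own slice of the momentum loop and the partial results are merged at the end. Here is a minimal sketch of that pattern; the round-robin split, the MPI.Allreduce call, and the placeholder per-point work are illustrative simplifications rather than the literal contents of mpi-rpa.jl:

```julia
using MPI

MPI.Init()
comm   = MPI.COMM_WORLD
rank   = MPI.Comm_rank(comm)   # 0-based index of this process
nprocs = MPI.Comm_size(comm)   # total number of processes (40 in the runs below)

nk = 40
qlist   = range(-pi, pi, length = nk)
partial = zeros(Float64, nk)   # local buffer; each rank fills only its own entries

# static round-robin split of the q loop over the ranks
for iq in (rank + 1):nprocs:nk
    # stand-in for the real per-q work, i.e. chi(qlist[iq], qlist[iq], 0.0, nk)
    partial[iq] = abs2(sin(qlist[iq]))
end

# merge the pieces: every index is written by exactly one rank, so summing works
total = MPI.Allreduce(partial, +, comm)

if rank == 0
    println("Number of nk : ", nk, ", collected ", length(total), " q points")
end
MPI.Finalize()
```

This version is launched with `mpirun -np 40 julia mpi-rpa.jl`, as in the submission script at the end of this post.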
## @distributed
```julia
function main1(nk::Int64)
    # ... (driver body omitted here; see the sketch below)
end
```
When using the @distributed macro, remember to put @everywhere in front of all the function definitions so that every worker process knows about the functions it will need; a sketch of the pattern is given below.
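A minimal sketch of what main1 looks like under this scheme; the placeholder work function stands in for the actual susceptibility evaluation, so this is an illustration rather than the real distributed-rpa.jl:

```julia
using Distributed, SharedArrays

# every function the workers will call must be defined with @everywhere;
# in the real script that applies to ham, fsd, chi0 and chi as well
@everywhere function work(kx::Float64, ky::Float64)
    return abs2(sin(kx) * cos(ky))   # placeholder for the real per-k-point work
end

function main1(nk::Int64)
    klist = range(-pi, pi, length = nk)
    res = SharedArray(zeros(Float64, nk, nk))  # writable by every worker
    @sync @distributed for i in 1:nk           # outer k loop split over the workers
        for j in 1:nk
            res[i, j] = work(klist[i], klist[j])
        end
    end
    return sum(res)
end

@time main1(40)
```

It is launched with `julia -p 40 distributed-rpa.jl`, matching the line in the submission script, so the worker processes already exist when main1 runs.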
## @threads
```julia
using SharedArrays, LinearAlgebra, Distributed, DelimitedFiles, Printf, .Threads
```
Unlike the @distributed approach, @threads does not require putting @everywhere in front of every function. In my scripts both versions use SharedArray, which lets all the workers or threads assign into the same array at the same time without interfering with each other. Note that the @threads macro requires importing the .Threads module, with a leading dot; alternatively you can write Threads.@threads directly, in which case the module does not need to appear in the using statement. A sketch of the threaded pattern follows.
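A minimal sketch of the threaded pattern; main2 and its placeholder body are mine, while threads-rpa.jl computes the susceptibility instead:

```julia
using Base.Threads   # equivalent to the `using .Threads` above

function main2(nk::Int64)
    klist = range(-pi, pi, length = nk)
    res = zeros(Float64, nk, nk)   # threads share memory, so a plain Array also works
    @threads for i in 1:nk         # outer k loop split over the available threads
        for j in 1:nk
            # each (i, j) entry is written by exactly one thread, so no lock is needed
            res[i, j] = abs2(sin(klist[i]) * cos(klist[j]))
        end
    end
    return sum(res)
end

println("threads = ", nthreads())
@time main2(40)
```

The thread count is set at launch time, e.g. `julia -t 40 threads-rpa.jl` as in the submission script; without `-t` or `JULIA_NUM_THREADS`, Julia starts with a single thread.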
## Full code
The other functions needed are listed below; just plug in whichever of the parallel methods above you want to use, keeping in mind the points mentioned earlier, such as adding @everywhere.

```julia
# 4x4 tight-binding Hamiltonian; returns its eigenvalues and eigenvectors at (kx, ky)
function ham(kx::Float64,ky::Float64)
    t0::Float64 = 0.1 # scaling factor
    t1x::Float64 = -0.483 * t0
    t1z::Float64 = -0.110 * t0
    t2x::Float64 = 0.069 * t0
    t2z::Float64 = -0.017 * t0
    t3xz::Float64 = 0.239 * t0
    t4xz::Float64 = -0.034 * t0
    tvx::Float64 = 0.005 * t0
    tvz::Float64 = -0.635 * t0
    ex::Float64 = 0.776 * t0
    ez::Float64 = 0.409 * t0
    ham = zeros(ComplexF64,4,4)
    ham[1,1] = 2 * t1x * (cos(kx) + cos(ky)) + 4 * t2x*cos(kx)*cos(ky) + ex
    ham[2,2] = 2 * t1z * (cos(kx) + cos(ky)) + 4 * t2z*cos(kx)*cos(ky) + ez
    ham[1,2] = 2 * t3xz * (cos(kx) - cos(ky))
    ham[2,1] = 2 * t3xz * (cos(kx) - cos(ky))
    ham[3,3] = 2 * t1x * (cos(kx) + cos(ky)) + 4 * t2x*cos(kx)*cos(ky) + ex
    ham[4,4] = 2 * t1z * (cos(kx) + cos(ky)) + 4 * t2z*cos(kx)*cos(ky) + ez
    ham[3,4] = 2 * t3xz * (cos(kx) - cos(ky))
    ham[4,3] = 2 * t3xz * (cos(kx) - cos(ky))
    ham[1,3] = tvx
    ham[1,4] = 2 * t4xz * (cos(kx) - cos(ky))
    ham[2,3] = 2 * t4xz * (cos(kx) - cos(ky))
    ham[2,4] = tvz
    ham[3,1] = tvx
    ham[4,1] = 2 * t4xz * (cos(kx) - cos(ky))
    ham[3,2] = 2 * t4xz * (cos(kx) - cos(ky))
    ham[4,2] = tvz
    val,vec = eigen(ham)
    return val,vec
end
#-------------------------------------------------------------------------------
# Fermi-Dirac distribution
function fsd(x::Float64)
    T::Float64 = 0.001 # Temperature
    return 1/(exp(x/T) + 1)
end
#-------------------------------------------------------------------------------
# Bare spin susceptibility
function chi0(qx::Float64,qy::Float64,omega::Float64,nk::Int64)
    # nk::Int64 = 100 # the k mesh must be dense enough to resolve the Fermi surface
    klist = range(-pi,pi,length = nk)
    bearchi0 = SharedArray(zeros(ComplexF64,4,4))
    #@sync @distributed for kx in klist
    for kx in klist
        for ky in klist
            val,vec = ham(kx,ky)
            valq,vecq = ham(kx + qx,ky + qy)
            for l1 in 1:4,l2 in 1:4
                re1::ComplexF64 = 0
                for m in 1:4,n in 1:4
                    re1 += (fsd(val[n]) - fsd(valq[m]))/(im * (omega + 0.0001) + val[n] - valq[m]) * vecq[l1,m]' * vecq[l2,m] * vec[l2,n]' * vec[l1,n]
                    # re1 += (fsd(val[n]) - fsd(valq[m]))/(im * (omega + 0.0001) + val[n] - valq[m]) * vecq[l1,m] * vecq[l2,m]' * vec[l2,n] * vec[l1,n]'
                end
                bearchi0[l1,l2] += re1
            end
        end
    end
    return -1/nk^2 * bearchi0
end
#-------------------------------------------------------------------------------
# RPA susceptibility: chi = (I - chi0 * Gamma)^(-1) * chi0
function chi(qx::Float64,qy::Float64,omega::Float64,nk::Int64)
    U0::Float64 = 3.0
    J0::Float64 = 0.4
    a1 = diagm(ones(2))
    a2 = zeros(Float64,2,2)
    I0 = diagm(ones(4))
    a2[1,1] = U0
    a2[2,2] = U0
    a2[1,2] = J0/2
    a2[2,1] = J0/2
    gamma = kron(a1,a2) # interaction matrix built from U0 and J0
    bearchi0 = chi0(qx,qy,omega,nk)
    chitemp = inv(I0 - bearchi0 * gamma) * bearchi0
    #return chitemp
    return imag(sum(chitemp)),real(sum(chitemp))
end
```
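The listing above defines chi but not the loop that drives it. For reference, here is a hypothetical serial driver (the name qscan, the q path, and the output file are my own choices) that scans q along the Brillouin-zone diagonal at omega = 0 and writes the result with writedlm; the parallel versions distribute either this q loop or the k loop inside chi0.

```julia
using DelimitedFiles

# scan q along (0, 0) -> (pi, pi) and store q, Im(chi), Re(chi) in each row
function qscan(nq::Int64, nk::Int64)
    qlist = range(0, pi, length = nq)
    data = zeros(Float64, nq, 3)
    for (iq, q) in enumerate(qlist)
        imchi, rechi = chi(q, q, 0.0, nk)   # omega = 0
        data[iq, :] = [q, imchi, rechi]
    end
    writedlm("chi_qscan.dat", data)
    return data
end
```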
## Results

For the benchmark I chose nk = 40 and used 40 cores; the run times are listed below.

- MPI

  ```
  ======== Job starts at 2024-03-24 14:47:33 on n33 ========
  Susceptibility calculation started: 2024-03-24T14:47:40.915
  Number of nk : 40
  Susceptibility calculation finished: 2024-03-24T14:47:52.785
  ======== Job ends at 2024-03-24 14:47:53 on n33 ========
  ```

- @distributed

  ```
  ======== Job starts at 2024-03-24 15:35:28 on n40 ========
  65.540391 seconds (1.57 M allocations: 103.933 MiB, 0.07% gc time, 2.65% compilation time)
  ======== Job ends at 2024-03-24 15:36:46 on n40 ========
  ```

- @threads

  ```
  ======== Job starts at 2024-03-24 15:34:24 on n12 ========
  108.048878 seconds (11.88 G allocations: 328.220 GiB, 50.20% gc time, 134.02% compilation time)
  ======== Job ends at 2024-03-24 15:36:16 on n12 ========
  ```

Judging from these results, MPI is clearly the most efficient. Surprisingly, the @threads version took much longer than the others; I suspect this is because I am not using it correctly, but even so I doubt it would run faster than MPI.
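One thing worth double-checking in a case like this (my own suggestion, not something verified for the runs above) is whether the threaded job actually received the intended number of threads: a Julia process started without `-t`/`--threads` or `JULIA_NUM_THREADS` runs single-threaded, and the large GC and compilation shares in the log suggest much of the wall time went to overhead rather than the parallel loop. A quick check to drop into threads-rpa.jl:

```julia
using Base.Threads
# if this prints 1, the loop ran single-threaded regardless of how many cores the job reserved
println("nthreads = ", nthreads())
```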
The job submission script is as follows:

```bash
#!/bin/bash
#SBATCH --job-name=chi0val
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=40
#SBATCH --cpus-per-task=1
#SBATCH -t 3-00:00:00

# load the environment
module purge
module load compiler/intel/2021.3.0
module load mpi/intelmpi/2021.3.0
module load apps/vasp/6.4.0-i21
module load apps/wannier90/3.1-i21
# /soft/vasp6_d4/bin

# where is your binary file
source /soft/anaconda/anaconda3/etc/profile.d/conda.sh
julia=/soft/julia-1.9.4/bin/julia
python=/soft/anaconda/anaconda3/bin/python3.11

NUM_MPI=$SLURM_NTASKS

echo "======== Job starts at `date +'%Y-%m-%d %T'` on `hostname` ======== "

mpirun -np ${NUM_MPI} julia mpi-rpa.jl
mpirun -np ${NUM_MPI} julia threads-rpa.jl
julia -t ${NUM_MPI} threads-rpa.jl
julia -p ${NUM_MPI} distributed-rpa.jl
python 01-graphene_prim_model.py

echo "======== Job ends at `date +'%Y-%m-%d %T'` on `hostname` ======== "
```
## WeChat Official Account
All related content is also synced to my WeChat official account; if you find this blog interesting, you are welcome to follow it.
{:.info}
yxliphy@gmail.com