如何让open-mpi在不同版本的OS上运行
- 一.背景
- 二.操作步骤【如果没有指明,则二个节点上都需要执行】
- 1.创建容器
- 2.更新apt源、安装依赖
- 3.设置ssh端口和密码(为避免跟host sshd冲突,修改了容器里sshd端口)
- 4.运行`sshd`服务
- 5.安装`openmpi`及依赖
- 6.免密操作
- 7.准备测试程序
- 8.运行测试程序【选择其中任意节点】
一.背景
- 两台不同OS的服务器运行openmpi程序时遇到以下问题:
- ORTE_ERROR_LOG: Data unpack had inadequate space in file base/regx_base_default_fns.c
- MPI_Allreduce或MPI_Barrier阻塞在poll
- 解决办法:让两边openmpi及主要依赖库的版本保持一致
二.操作步骤【如果没有指明,则二个节点上都需要执行】
1.创建容器
- 节点0(10.22.4.73)
# Node 0: recreate an empty work directory and mount it at /home inside an
# Ubuntu 20.04 container that shares the host network (--net=host).
rm -rf -- mpi_demo
# Chain with && so the container is only started from inside the new directory.
mkdir mpi_demo && cd mpi_demo &&
  docker run -ti --rm --privileged --net=host -v "$PWD":/home -w /home ubuntu:20.04 /bin/bash
- 节点1(192.168.25.121)
# Node 1: recreate an empty work directory and mount it at /home inside an
# Ubuntu 22.04 container that shares the host network (--net=host).
rm -rf -- mpi_demo
# Chain with && so the container is only started from inside the new directory.
mkdir mpi_demo && cd mpi_demo &&
  docker run -ti --rm --privileged --net=host -v "$PWD":/home -w /home ubuntu:22.04 /bin/bash
2.更新apt源、安装依赖
# Redirect both the archive and the security repositories to the Huawei Cloud
# mirror in a single pass over sources.list.
sed -i \
  -e "s@http://.*archive.ubuntu.com@http://repo.huaweicloud.com@g" \
  -e "s@http://.*security.ubuntu.com@http://repo.huaweicloud.com@g" \
  /etc/apt/sources.list
apt update
# Build toolchain and general utilities.
apt install -y gcc g++ vim git wget curl unzip make
# pkg-config, the SSH server, and process/PCI inspection tools.
apt install -y pkg-config openssh-server psmisc pciutils
3.设置ssh端口和密码(为避免跟host sshd冲突,修改了容器里sshd端口)
# Enable root login and move sshd to port 4223. The patterns are anchored on
# the directive name (optionally commented out) so that unrelated lines that
# merely contain the word, such as "#GatewayPorts no", are left untouched.
sed -i -E 's/^#?[[:space:]]*PermitRootLogin[[:space:]].*$/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i -E 's/^#?[[:space:]]*Port[[:space:]].*$/Port 4223/' /etc/ssh/sshd_config
# Set the root password (demo value; change it for anything beyond a test rig).
# The value is passed as a printf argument, never as the format string.
passwd=Hello123 && printf '%s\n%s\n' "${passwd}" "${passwd}" | passwd root
4.运行`sshd`服务
# Generate a launcher that creates /run/sshd, loads ~/.bashrc and then runs
# sshd in the foreground (-D); the launcher itself is started in the background.
# The delimiter is quoted so the script text is written out literally.
cat >/usr/bin/run.sh <<'EOF'
#!/bin/bash
mkdir -p /run/sshd
source ~/.bashrc
/usr/sbin/sshd -D
EOF
# Executable by everyone, writable by root only.
chmod 755 /usr/bin/run.sh
nohup /usr/bin/run.sh &
5.安装`openmpi`及依赖
# Download every source tarball into /home.
cd /home
wget -O openssl-3.5.0.tar.gz https://github.com/openssl/openssl/releases/download/openssl-3.5.0/openssl-3.5.0.tar.gz
wget -O hwloc-2.12.1.tar.gz https://download.open-mpi.org/release/hwloc/v2.12/hwloc-2.12.1.tar.gz
wget -O openmpi-4.1.2.tar.gz https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.2.tar.gz
wget -O pmix-4.1.2.tar.gz https://github.com/openpmix/openpmix/releases/download/v4.1.2/pmix-4.1.2.tar.gz
wget -O libevent-2.1.12-stable.tar.gz https://github.com/libevent/libevent/releases/download/release-2.1.12-stable/libevent-2.1.12-stable.tar.gz

# Each package below is built from /home. Steps are chained with && so a failed
# configure never proceeds to install, and parallelism is capped at the CPU
# count because a bare "make -j" places no limit on the number of jobs.

# OpenSSL
cd /home
tar -xf openssl-3.5.0.tar.gz
cd openssl-3.5.0
./Configure && make -j4 && make install

# hwloc
cd /home
tar -xf hwloc-2.12.1.tar.gz
cd hwloc-2.12.1
./configure && make install -j"$(nproc)"

# libevent
cd /home
tar -xf libevent-2.1.12-stable.tar.gz
cd libevent-2.1.12-stable
./configure && make install -j"$(nproc)"

# PMIx
cd /home
tar -xf pmix-4.1.2.tar.gz
cd pmix-4.1.2
./configure && make install -j"$(nproc)"

# Open MPI, installed under /usr/local and configured to use its bundled
# (internal) libevent, hwloc and PMIx.
cd /home
tar -xf openmpi-4.1.2.tar.gz
cd openmpi-4.1.2
./configure --prefix=/usr/local/ --with-libevent=internal --with-hwloc=internal --with-pmix=internal &&
  make install -j"$(nproc)"

# Make /usr/local/lib visible to the dynamic loader: appended to
# /etc/environment for later sessions and exported for the current shell.
echo "export LD_LIBRARY_PATH=/usr/local/lib" >> /etc/environment
export LD_LIBRARY_PATH=/usr/local/lib
6.免密操作
# Create an RSA key pair, then install the public key for root on both nodes
# (sshd listens on port 4223 inside the containers).
ssh-keygen -t rsa
for node_ip in 10.22.4.73 192.168.25.121; do
  ssh-copy-id -i ~/.ssh/id_rsa.pub -p 4223 "root@${node_ip}"
done
7.准备测试程序
# Write the MPI_Allreduce test program to /home and compile it with the
# Open MPI C++ wrapper. The heredoc delimiter is quoted, so the C++ source is
# written out verbatim.
cd /home
cat > mpi_all_reduce.cc <<'EOF'
#include <mpi.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int size, myid;
    double start, end;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Constant element count so both buffers are ordinary fixed-size arrays.
    const int count = 32;
    int value = myid + 2;
    int data_array[count] = {0};
    for (int i = 0; i < count; i++) {
        data_array[i] = value + i;
    }
    printf("current_rank:%d ranks:%d value:%d\n", myid, size, value);
    MPI_Barrier(MPI_COMM_WORLD);

    // Element-wise sum across all ranks; every rank receives the result.
    int recv_data_array[count] = {0};
    start = MPI_Wtime();
    MPI_Allreduce(data_array, recv_data_array, count, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    MPI_Barrier(MPI_COMM_WORLD);
    end = MPI_Wtime();

    // Rank 0 prints the reduced values, 16 per line.
    if (myid == 0) {
        printf("rank0\n");
        for (int i = 1; i <= count; i++) {
            printf("%08d ", recv_data_array[i - 1]);
            if (i % 16 == 0) printf("\n");
        }
    }
    MPI_Finalize();
    return 0;
}
EOF
mpic++ mpi_all_reduce.cc -o mpi_all_reduce
8.运行测试程序【选择其中任意节点】
# List both nodes with exactly one slot each so the two ranks are placed on
# different machines; per the Open MPI 4.x mpirun documentation, a host listed
# without slots= is sized by its detected core count, which can put both ranks
# on the first host.
cd /home
cat > hostfile <<EOF
10.22.4.73 slots=1
192.168.25.121 slots=1
EOF
# plm_rsh_args forwards "-p 4223" to the ssh launcher so it reaches the
# container's sshd port.
mpirun --allow-run-as-root -mca plm_rsh_args "-p 4223" \
  -np 2 -hostfile hostfile /home/mpi_all_reduce