@zhangsiming65965
2020-06-03T03:19:18.000000Z
Linux
Gpu
Download the required installation packages into a directory on the Aliyun public-facing server ahead of time, then serve that directory over HTTP so other machines can fetch the files with wget.
#A public IP is required; otherwise the packages can only be downloaded from inside the LAN
Aliyun server
root@kbqatest:/Ubuntupackages# pwd
/Ubuntupackages
#Download the CUDA package
root@kbqatest:/Ubuntupackages# wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
#Download the TensorFlow wheel
root@kbqatest:/Ubuntupackages# wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
#Download the Miniconda installer
root@kbqatest:/Ubuntupackages# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
#List the package directory
root@kbqatest:/Ubuntupackages# ls
cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
#cuDNN archive uploaded from another server
cudnn-9.0-linux-x64-v7.4.1.5.tgz
Miniconda3-latest-Linux-x86_64.sh
tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
root@kbqatest:/Ubuntupackages# python -m http.server 8080
Serving HTTP on 0.0.0.0 port 8080 (http://0.0.0.0:8080/) ...
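Later, on the GPU server, each package can be pulled from this HTTP service; a minimal sketch, assuming the Aliyun server's public IP 47.95.111.195 (see the prerequisites below) and the port 8080 used above:
#Example: fetch the cuDNN archive from the Aliyun HTTP service
root@gpuserver009:~# wget http://47.95.111.195:8080/cudnn-9.0-linux-x64-v7.4.1.5.tgz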
root@gpuserver009:~# cat /etc/network/interfaces
# interfaces(5) file used by ifup(8) and ifdown(8)
auto lo
iface lo inet loopback
auto enp14s0
iface enp14s0 inet static
#The static IP must be on the same subnet as the LAN addresses handed out by the router
address 10.10.3.17
netmask 255.255.0.0
gateway 10.10.0.1
#How to restart the network interface
root@gpuserver009:~# ip addr flush dev enp14s0
root@gpuserver009:~# ifdown enp14s0;ifup enp14s0
#Or
root@gpuserver009:~# ip addr flush dev enp14s0
root@gpuserver009:~# systemctl restart networking
#To keep a fixed static IP and stop it from changing or disappearing unexpectedly, disable NetworkManager
root@gpuserver009:~# systemctl stop network-manager
root@gpuserver009:~# systemctl disable network-manager
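A quick check that the static address and gateway from the configuration above are in effect (ip and ping are standard tools):
root@gpuserver009:~# ip addr show enp14s0      #should show 10.10.3.17/16
root@gpuserver009:~# ping -c 3 10.10.0.1       #the gateway should answer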
#Switch the apt sources to the Aliyun mirror
root@gpuserver009:~# cat /etc/apt/sources.list
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
root@gpuserver009:~# apt-get update
root@gpuserver009:~# apt-get install -y openssh-server vim
#Permanent change; takes effect after a reboot
root@gpuserver009:~# cat /etc/hostname
gpuserver009
#Takes effect immediately in a new bash session, but does not survive a reboot
root@gpuserver009:~# hostname gpuserver009
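On systemd-based Ubuntu releases, hostnamectl can apply both the immediate and the persistent change in one step (a minimal sketch):
root@gpuserver009:~# hostnamectl set-hostname gpuserver009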
#Prevent automatic sleep/hibernate from dropping the network connection
root@gpuserver009:~# systemctl mask sleep.target suspend.target hibernate.target hybrid-sleep.target
root@gpuserver009:~# cat /etc/resolv.conf
nameserver 1.1.1.1
nameserver 114.114.114.114
nameserver 202.106.46.151
#An alternative set of DNS servers
root@gpuserver009:~# cat /etc/resolv.conf
nameserver 219.141.136.10
nameserver 202.106.0.20
Prerequisites:
1. Switch to the root user, because none of the scripts below use sudo;
2. When installing cuDNN, the HTTP service on the Aliyun server (47.95.111.195) must be running so the package can be downloaded.
CUDA is a parallel computing platform released by the GPU vendor NVIDIA.
As graphics hardware has evolved, GPUs have become increasingly powerful, and although they are optimized for rendering, their raw compute throughput now exceeds that of general-purpose CPUs for many workloads. Using such powerful chips only for display would be a waste, so NVIDIA introduced CUDA to let its GPUs be used for computation beyond graphics.
#If no driver is installed, the CUDA installation will install one automatically, but it may not be the latest version; if you need a specific driver version, install the driver before installing CUDA
#Driver installation script
root@gpuserver009:~# cat driver-install.sh
#!/bin/bash
add-apt-repository ppa:graphics-drivers/ppa
add-apt-repository ppa:xorg-edgers/ppa
apt-get update
apt-get install -y nvidia-418
#Reboot after installing the driver (init 6)
#Check the installed driver version with nvidia-smi
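After the reboot, a quick sanity check that the kernel module is loaded and the driver answers (nvidia-smi -L simply lists the detected GPUs):
root@gpuserver009:~# lsmod | grep nvidia        #the nvidia kernel modules should be listed
root@gpuserver009:~# nvidia-smi -L              #one line per detected GPU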
#One-click CUDA installation script
root@gpuserver009:~# cat cuda-install.sh
#!/bin/bash
mkdir /cuda
cd /cuda
apt-get install -y wget
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
mv cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
apt update
apt install -y cuda
rm -rf cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
echo "please reboot"
#How to check that CUDA installed successfully (see the sketch below)
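A minimal verification sketch (nvcc lives under /usr/local/cuda/bin, so it works even before PATH is updated):
root@gpuserver009:~# /usr/local/cuda/bin/nvcc -V   #should report release 9.0
root@gpuserver009:~# nvidia-smi                    #driver version and GPUs should be listed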
#To upgrade the driver version, first uninstall the existing CUDA, then install the new one
#One-click CUDA removal script
root@gpuserver009:~# cat cuda-remove.sh
#!/bin/bash
apt-get autoremove -y --purge nvidia*
apt-get autoclean
conda is used to create and switch between multiple Python environments, so that different projects can run on different Python versions.
#Miniconda installation script
root@gpuserver009:~# cat miniconda-install.sh
#!/bin/bash
mkdir /miniconda
cd /miniconda
apt-get install -y wget
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod a+x Miniconda3-latest-Linux-x86_64.sh
bash ./Miniconda3-latest-Linux-x86_64.sh
#Choose "/opt/miniconda3" as the installation directory when prompted
ln -s /opt/miniconda3/bin/* /usr/local/bin
rm -rf Miniconda3-latest-Linux-x86_64.sh
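Once Miniconda is on the PATH, a dedicated environment for the TensorFlow wheel below can be created; a sketch, assuming an environment named "tf" (the wheel is built for Python 3.6, hence python=3.6):
root@gpuserver009:~# conda create -y -n tf python=3.6
root@gpuserver009:~# source activate tf        #newer conda versions use "conda activate tf" after "conda init"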
NVIDIA cuDNN is a GPU-accelerated library of primitives for deep neural networks. It emphasizes performance, ease of use, and low memory overhead. Its simple, drop-in design lets developers focus on designing and implementing neural network models rather than tuning performance, while still getting high-performance modern parallel computing on the GPU.
#cuDNN installation script
root@gpuserver009:~# cat cudnn.sh
#!/bin/bash
apt-get install -y python-pip
pip install --upgrade pip
pip install numpy
mkdir /cudnn
cd /cudnn
#the signed NVIDIA URL below expires; alternatively fetch the archive from the Aliyun HTTP service set up earlier
wget -O cudnn-9.0-linux-x64-v7.4.1.5.tgz "https://developer.download.nvidia.com/compute/machine-learning/cudnn/secure/v7.4.1.5/prod/9.0_20181108/cudnn-9.0-linux-x64-v7.4.1.5.tgz?KZAFNBHsM0A3Rbtgtu_pHef5uygqb8bJfkXxsgq6BVeJYHVf1l5rYZjtDuHPQ6IInV84m8XaAq8IvL5GyOZ2S7jMzchBeI6o62e-yYwLGWPZYwT48s-3aCRHwY24MT_DZC_3XcgR5HRdBQsduwKIEyLVZZFusaYnWYZ2DJpXkoAiEsXevNRrRW5sCmS_NnVdYwQAdgVsAm2Qaom2dEwObw"
tar xf cudnn-9.0-linux-x64-v7.4.1.5.tgz
cd cuda/
cp include/cudnn.h /usr/local/cuda/include
cp lib64/libcudnn* /usr/local/cuda/lib64
cd /usr/local/cuda/lib64
rm -rf libcudnn.so
rm -rf libcudnn.so.7
ln -s libcudnn.so.7.4.1 libcudnn.so.7
ln -s libcudnn.so.7 libcudnn.so
echo "export PATH=/usr/local/cuda/bin:$PATH" >> /etc/profile
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> /etc/profile
echo "please source /etc/profile"
TensorFlow is a key software tool in the AI field: a machine learning framework in the technology layer. You can use it to process large amounts of data and quickly build mathematical models that implement intelligent functionality.
#TensorFlow installation script
root@gpuserver009:~# cat tensorflow.sh
#!/bin/bash
mkdir /tensorflow
cd /tensorflow
wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
cd /data/
mkdir -p sharedata/installation_packages
chmod -R 775 sharedata
groupadd shannon
chown -R :shannon sharedata
cd sharedata/installation_packages/
cp /tensorflow/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl .
chmod 775 tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
chown :shannon tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
Note: if the TensorFlow download is very slow, try switching the DNS server to 1.1.1.1 or another resolver.
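The script above only downloads and stages the wheel; installing it into the conda environment is then a single pip command (a sketch, assuming the "tf" environment created earlier):
(tf) root@gpuserver009:~# pip install /data/sharedata/installation_packages/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
(tf) root@gpuserver009:~# python -c "import tensorflow as tf; print(tf.test.is_gpu_available())"   #prints True when CUDA and cuDNN are visible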
root@gpuserver009:~# cat docker.sh
#!/bin/bash
#One-click docker-ce installation script
apt-get install -y expect opencc ack-grep
apt-get install -y apt-transport-https ca-certificates curl software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
apt-get update
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
#The nvidia-docker.list written by the tee above contains entries such as:
#deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/$(ARCH) /
#deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/$(ARCH) /
#deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/$(ARCH) /
apt-get update
apt-get install -y docker-ce
#Install NVIDIA Docker (the apt repository was already added during the docker installation above)
apt-get install -y nvidia-docker2
pkill -SIGHUP dockerd
docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi
#If you see the error "docker: Error response from daemon: OCI runtime create failed: unable to retrieve OCI runtime error (open /run/containerd/io.containerd.runtime.v1.linux/moby/a07397325389a72f8bcb423d959dffb10118f5a6694e10e6df5a20dbebf98f3c/log.json: no such file or directory): exec: "nvidia-container-runtime": executable file not found in $PATH: unknown."
#run the following:
$ sudo apt-get install nvidia-container-runtime
#Install the latest docker-compose
curl -sSL -o /usr/local/bin/docker-compose https://code.aliyun.com/k9kdqvbb/files/raw/master/docker-compose-Linux-x86_64
chmod +x /usr/local/bin/docker-compose
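To exercise the NVIDIA runtime through docker-compose end to end, a minimal, hypothetical compose file can be used (the service name and the /tmp path are illustrative; the image tag is the same one tested above, and compose file format 2.3 supports the runtime key):
cat > /tmp/docker-compose.yml << 'EOF'
version: "2.3"
services:
  gpu-test:
    image: nvidia/cuda:9.0-base
    runtime: nvidia
    command: nvidia-smi
EOF
docker-compose -f /tmp/docker-compose.yml up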
#One-click NFS server installation script
root@gpuserver001:/# cat NFS-server.sh
#!/bin/bash
#designed by ZhangSiming
apt-get update
apt install -y nfs-kernel-server
mkdir -p /data/nfsdata2
chown -R nobody:nogroup /data/nfsdata2
chmod -R 777 /data/nfsdata2
echo "/data/nfsdata2 172.22.0.0/16(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
#One of no_subtree_check (do not check the parent directory) or subtree_check (check the parent directory) must be present in the export options
exportfs -a
systemctl restart nfs-kernel-server
systemctl enable nfs-kernel-server
echo "NFS-server configured successfully!!"
echo "command:mount -t nfs `hostname -I | awk '{print $1}'`:/data/nfsdata /data/nfsdata"
#Check the exported directories and the networks allowed to mount them
root@gpuserver001:/# showmount -e localhost
Export list for localhost:
/data/nfsdata2 11.11.22.0/16
#One-click NFS client installation and mount script
root@gpuserver001:/# cat NFS-client.sh
#!/bin/bash
#designed by ZhangSiming
apt-get install -y nfs-common
mkdir -p /data/nfsdata2
mkdir -p /data/nfsdata
#mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
zhangsiming@gpuserver003:~$ tail -4 /etc/rc.local
mount -t nfs 11.11.22.2:/data/nfsdata /data/nfsdata
mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
exit 0
#Do not put these mounts in /etc/fstab (an unreachable NFS server there can hang the boot)
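After a reboot (or after running the mount commands by hand), the active NFS mounts can be checked like this:
zhangsiming@gpuserver003:~$ df -hT | grep nfs
zhangsiming@gpuserver003:~$ mount | grep nfs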