[关闭]
@zhangsiming65965 2020-06-03T03:19:18.000000Z 字数 8902 阅读 164

企业gpu服务器应用安装

Linux Gpu

---Author:张思明 ZhangSiming

---Mail:m18600117869@163.com

---QQ:1030728296


一、自建网站服务供给下载安装包

事先在阿里云的公网服务器的目录中下载好相应的安装包,之后开放http服务,供外界访问wget下载。

  1. #必须有公网ip,否则只能局域网内访问下载
  2. 阿里云服务器
  3. root@kbqatest:/Ubuntupackages# pwd
  4. /Ubuntupackages
  5. #下载cuda的包
  6. root@kbqatest:/Ubuntupackages# wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
  7. #下载tensorflow的包
  8. root@kbqatest:/Ubuntupackages# wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
  9. #下载miniconda的包
  10. root@kbqatest:/Ubuntupackages# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
  11. #查看安装包目录
  12. root@kbqatest:/Ubuntupackages# ls
  13. cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
  14. #从别的服务器上传cudnn过来
  15. cudnn-9.0-linux-x64-v7.4.1.5.tgz
  16. Miniconda3-latest-Linux-x86_64.sh
  17. tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
  1. root@kbqatest:/Ubuntupackages# python -m http.server 8080
  2. Serving HTTP on 0.0.0.0 port 8080 (http://0.0.0.0:8080/) ...

image_1d7u1sh2t14vg1agp5bhiqqki9.png-29.7kB

二、开始安装

2.1机房服务器初始化配置

2.1.1配置合理的ip

  1. root@gpuserver009:~# cat /etc/network/interfaces
  2. # interfaces(5) file used by ifup(8) and ifdown(8)
  3. auto lo
  4. iface lo inet loopback
  5. auto enp14s0
  6. iface enp14s0 inet static
  7. #静态IP的网段需要和路由分配的局域网子网ip网段相同
  8. address 10.10.3.17
  9. netmask 255.255.0.0
  10. gateway 10.10.0.1
  11. #重启网卡方法
  12. root@gpuserver009:~# ip addr flush dev enp14s0
  13. root@gpuserver009:~# ifdown enp14s0;ifup enp14s0
  14. #或者
  15. root@gpuserver009:~# ip addr flush dev enp14s0
  16. root@gpuserver009:~# systemctl restart networking
  17. #如果要配置固定的静态IP防止IP莫名其妙的变动或者消失
  18. root@gpuserver009:~# systemctl stop network-manager
  19. root@gpuserver009:~# systemctl disable network-manager

image_1d7u2t73csuog3l1k1417t01ljg1m.png-212.8kB

2.1.2更换apt源(可选),安装ssh和vim

  1. #更换为阿里云的apt源
  2. root@gpuserver009:~# cat /etc/apt/sources.list
  3. deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
  4. deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
  5. deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
  6. deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
  7. deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
  8. deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
  9. deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
  10. deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
  11. deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
  12. deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
  13. root@gpuserver009:~# apt-get update
  1. root@gpuserver009:~# apt-get install -y openssh-server vim

2.1.3更换主机名

  1. #永久修改,需要重启生效
  2. root@gpuserver009:~# cat /etc/hostname
  3. gpuserver009
  4. #更改完开启新的bash直接生效,重启后失效
  5. root@gpuserver009:~# hostname gpuserver009

2.1.4关闭自动休眠

  1. #防止因自动休眠导致断网连接不上
  2. root@gpuserver009:~# systemctl mask sleep.target suspend.target hibernate.target hybrid-sleep.target

2.1.5更换DNS

  1. root@gpuserver009:~# cat /etc/resolv.conf
  2. nameserver 1.1.1.1
  3. nameserver 114.114.114.114
  4. nameserver 202.106.46.151
  1. root@gpuserver009:~# cat /etc/resolv.conf
  2. nameserver 219.141.136.10
  3. nameserver 202.106.0.20

2.2安装开发用的环境包

前提:
1.切换到root用户,因为下面的脚本都没有加sudo;
2.安装cudnn的时候,需要开启阿里云(47.95.111.195)上面的http服务下载安装包。

2.2.1安装cuda

CUDA是显卡厂商NVidia推出的运算平台。
随着显卡的发展,GPU越来越强大,而且GPU为显示图像做了优化。在计算上GPU已经超越了通用的CPU。如此强大的芯片如果只是作为显卡就太浪费了,因此N卡厂商推出CUDA,让显卡可以用于图像计算以外的目的。

cuda-nvidia.driver兼容对照表

image_1dc9oh6q91ped59217uk1salonc9.png-109.8kB

  1. #如果没有安装驱动,安装cuda的时候会自动安装驱动,但驱动可能不是最新版本,如果对于驱动有版本需求,可以在安装cuda之前预先安装
  2. #驱动安装脚本
  3. root@gpuserver009:~# cat driver-install.sh
  4. #!/bin/bash
  5. add-apt-repository ppa:graphics-drivers/ppa
  6. add-apt-repository ppa:xorg-edgers/ppa
  7. apt-get update
  8. apt-get install -y nvidia-418
  9. #装完驱动记得重启(init 6)
  10. #查看安装的驱动版本可以使用nvidia-smi
  11. #cuda一键安装脚本
  12. root@gpuserver009:~# cat cuda-install.sh
  13. #!/bin/bash
  14. mkdir /cuda
  15. cd /cuda
  16. apt-get install -y wget
  17. wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
  18. mv cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
  19. dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
  20. apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
  21. apt update
  22. apt install -y cuda
  23. rm -rf cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
  24. echo "please reboot"
  25. #如何检查cuda安装成功
  26. #如果需要更新驱动版本,需要先卸载原有cuda,再安装新的cuda
  27. #cuda一键卸载脚本
  28. root@gpuserver009:~# cat cuda-remove.sh
  29. #!/bin/bash
  30. apt-get autoremove -y --purge nvidia*
  31. apt-get autoclean

2.2.2安装Miniconda

conda用于为python项目创建多版本环境,可以在不同版本的python虚拟环境之间自由切换。

  1. #miniconda安装脚本
  2. root@gpuserver009:~# cat miniconda-install.sh
  3. #!/bin/bash
  4. mkdir /miniconda
  5. cd /miniconda
  6. apt-get install -y wget
  7. wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
  8. chmod a+x Miniconda3-latest-Linux-x86_64.sh
  9. bash ./Miniconda3-latest-Linux-x86_64.sh
  10. #安装目录采用"/opt/miniconda3"
  11. ln -s /opt/miniconda3/bin/* /usr/local/bin
  12. rm -rf Miniconda3-latest-Linux-x86_64.sh

2.2.3安装cudnn

NVIDIA cudnn是用于深度神经网络的GPU加速库。它强调性能、易用性和低内存开销。简单的,插入式设计可以让开发人员专注于设计和实现神经网络模型,而不是调整性能,同时还可以在GPU上实现高性能现代并行计算。

cudnn、cuda、tensorflow兼容对照表

image_1dc9pd5qh19vq1ioltd9gnb8qpm.png-593.4kB

  1. #cudnn安装脚本
  2. root@gpuserver009:~# cat cudnn.sh
  3. #!/bin/bash
  4. apt-get install python-pip
  5. pip install --upgrade pip
  6. pip install numpy
  7. mkdir /cudnn
  8. cd /cudnn
  9. wget https://developer.download.nvidia.com/compute/machine-learning/cudnn/secure/v7.4.1.5/prod/9.0_20181108/cudnn-9.0-linux-x64-v7.4.1.5.tgz?KZAFNBHsM0A3Rbtgtu_pHef5uygqb8bJfkXxsgq6BVeJYHVf1l5rYZjtDuHPQ6IInV84m8XaAq8IvL5GyOZ2S7jMzchBeI6o62e-yYwLGWPZYwT48s-3aCRHwY24MT_DZC_3XcgR5HRdBQsduwKIEyLVZZFusaYnWYZ2DJpXkoAiEsXevNRrRW5sCmS_NnVdYwQAdgVsAm2Qaom2dEwObw
  10. tar xf cudnn-9.0-linux-x64-v7.4.1.5.tgz
  11. cd cuda/
  12. cp include/cudnn.h /usr/local/cuda/include
  13. cp lib64/libcudnn* /usr/local/cuda/lib64
  14. cd /usr/local/cuda/lib64
  15. rm -rf libcudnn.so
  16. rm -rf libcudnn.so.7
  17. ln -s libcudnn.so.7.4.1 libcudnn.so.7
  18. ln -s libcudnn.so.7 libcudnn.so
  19. echo "export PATH=/usr/local/cuda/bin:$PATH" >> /etc/profile
  20. echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> /etc/profile
  21. echo "please source /etc/profile"

2.2.4安装tensorflow

TensorFlow是人工智能AI领域的一个重要软件工具,是技术层中的学习框架。你可以用它来处理大量数据,快速建立数学模型,这些模型可以完成智能功能等。

  1. #tensorflow安装脚本
  2. root@gpuserver009:~# cat temsorflow.sh
  3. #!/bin/bash
  4. mkdir /tensorflow
  5. cd /tensorflow
  6. wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
  7. cd /data/
  8. mkdir -p sharedata/installation_packages
  9. chmod -R 775 sharedata
  10. groupadd shannon
  11. chown -R :shannon sharedata
  12. cd sharedata/installation_packages/
  13. cp /tensorflow/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl .
  14. chmod 775 tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
  15. chown :shannon tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl

注意:如果下载tensorflow非常慢,可以更换DNS为1.1.1.1试试,或者更换其他DNS试试。

2.2.5安装docker,NVIDIA docker

  1. root@gpuserver009:~# cat docker.sh
  2. #docker-ce一键安装脚本
  3. apt-get install expect opencc ack-grep
  4. apt-get install -y apt-transport-https ca-certificates curl software-properties-common
  5. curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
  6. add-apt-repository \
  7. "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
  8. $(lsb_release -cs) \
  9. stable"
  10. apt-get update
  11. curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
  12. curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
  13. deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/$(ARCH) /
  14. deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/$(ARCH) /
  15. deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/$(ARCH) /
  16. apt-get update
  17. apt-get install -y docker-ce
  18. #NVIDIA docker安装脚本(库在docker安装的时候就安装好了)
  19. apt-get install -y nvidia-docker
  20. pkill -SIGHUP dockerd
  21. docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi
  22. #如果报错"docker: Error response from daemon: OCI runtime create failed: unable to retrieve OCI runtime error (open /run/containerd/io.containerd.runtime.v1.linux/moby/a07397325389a72f8bcb423d959dffb10118f5a6694e10e6df5a20dbebf98f3c/log.json: no such file or directory): exec: "nvidia-container-runtime": executable file not found in $PATH: unknown."
  23. #请执行以下内容
  24. $ sudo apt-get install nvidia-container-runtime
  25. #安装最新版docker-compose
  26. curl -sSL -o /usr/local/bin/docker-compose https://code.aliyun.com/k9kdqvbb/files/raw/master/docker-compose-Linux-x86_64
  27. chmod +x /usr/local/bin/docker-compose

2.2.6安装NFS

  1. #一键安装脚本
  2. root@gpuserver001:/# cat NFS-server.sh
  3. #!/bin/bash
  4. #designed by ZhangSiming
  5. apt-get update
  6. apt install -y nfs-kernel-server
  7. mkdir -p /data/nfsdata2
  8. chown -R nobody:nogroup /data/nfsdata2
  9. chmod -R 777 /data/nfsdata2
  10. echo "/data/nfsdata2 172.22.0.0/16(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
  11. #一定要有no_subtree_check(不强制检查父目录)或者subtree_check(强制检查父目录)
  12. exportfs -a
  13. systemctl restart nfs-kernel-server
  14. systemctl enable nfs-kernel-server
  15. echo "NFS-server configured successfully!!"
  16. echo "command:mount -t nfs `hostname -I | awk '{print $1}'`:/data/nfsdata2 /data/nfsdata2"
  17. #查看开放挂载网段
  18. root@gpuserver001:/# showmount -e localhost
  19. Export list for localhost:
  20. /data/nfsdata2 11.11.22.0/16
  1. #一键安装挂载脚本
  2. root@gpuserver001:/# cat NFS-client.sh
  3. #!/bin/bash
  4. #designed by ZhangSiming
  5. apt-get install -y nfs-common
  6. mkdir -p /data/nfsdata2
  7. mkdir -p /data/nfsdata
  8. #mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
  1. zhangsiming@gpuserver003:~$ tail -4 /etc/rc.local
  2. mount -t nfs 11.11.22.2:/data/nfsdata /data/nfsdata
  3. mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
  4. exit 0
  5. #不要写在/etc/fstab中
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注