@zhangsiming65965
2020-06-03T03:19:18.000000Z
字数 8902
阅读 199
Linux Gpu
事先在阿里云的公网服务器的目录中下载好相应的安装包,之后开放http服务,供外界访问wget下载。
#必须有公网ip,否则只能局域网内访问下载阿里云服务器
root@kbqatest:/Ubuntupackages# pwd
/Ubuntupackages
#下载cuda的包
root@kbqatest:/Ubuntupackages# wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
#下载tensorflow的包
root@kbqatest:/Ubuntupackages# wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
#下载miniconda的包
root@kbqatest:/Ubuntupackages# wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
#查看安装包目录
root@kbqatest:/Ubuntupackages# ls
cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
#从别的服务器上传cudnn过来
cudnn-9.0-linux-x64-v7.4.1.5.tgz
Miniconda3-latest-Linux-x86_64.sh
tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
root@kbqatest:/Ubuntupackages# python -m http.server 8080
Serving HTTP on 0.0.0.0 port 8080 (http://0.0.0.0:8080/) ...

root@gpuserver009:~# cat /etc/network/interfaces
# interfaces(5) file used by ifup(8) and ifdown(8)
auto lo
iface lo inet loopback
auto enp14s0
iface enp14s0 inet static
#静态IP的网段需要和路由分配的局域网子网ip网段相同
address 10.10.3.17
netmask 255.255.0.0
gateway 10.10.0.1
#重启网卡方法
root@gpuserver009:~# ip addr flush dev enp14s0
root@gpuserver009:~# ifdown enp14s0;ifup enp14s0
#或者
root@gpuserver009:~# ip addr flush dev enp14s0
root@gpuserver009:~# systemctl restart networking
#如果要配置固定的静态IP防止IP莫名其妙的变动或者消失
root@gpuserver009:~# systemctl stop network-manager
root@gpuserver009:~# systemctl disable network-manager

#更换为阿里云的apt源
root@gpuserver009:~# cat /etc/apt/sources.list
deb http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-backports main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ bionic-proposed main restricted universe multiverse
root@gpuserver009:~# apt-get update
root@gpuserver009:~# apt-get install -y openssh-server vim
#永久修改,需要重启生效
root@gpuserver009:~# cat /etc/hostname
gpuserver009
#更改完开启新的bash直接生效,重启后失效
root@gpuserver009:~# hostname gpuserver009
#防止因自动休眠导致断网连接不上
root@gpuserver009:~# systemctl mask sleep.target suspend.target hibernate.target hybrid-sleep.target
root@gpuserver009:~# cat /etc/resolv.conf
nameserver 1.1.1.1
nameserver 114.114.114.114
nameserver 202.106.46.151
root@gpuserver009:~# cat /etc/resolv.conf
nameserver 219.141.136.10
nameserver 202.106.0.20
前提:
1.切换到root用户,因为下面的脚本都没有加sudo;
2.安装cudnn的时候,需要开启阿里云(47.95.111.195)上面的http服务下载安装包。
CUDA是显卡厂商NVidia推出的运算平台。
随着显卡的发展,GPU越来越强大,而且GPU为显示图像做了优化。在计算上GPU已经超越了通用的CPU。如此强大的芯片如果只是作为显卡就太浪费了,因此N卡厂商推出CUDA,让显卡可以用于图像计算以外的目的。

#如果没有安装驱动,安装cuda的时候会自动安装驱动,但驱动可能不是最新版本,如果对于驱动有版本需求,可以在安装cuda之前预先安装
#驱动安装脚本
root@gpuserver009:~# cat driver-install.sh
#!/bin/bash
add-apt-repository ppa:graphics-drivers/ppa
add-apt-repository ppa:xorg-edgers/ppa
apt-get update
apt-get install -y nvidia-418
#装完驱动记得重启(init 6)
#查看安装的驱动版本可以使用nvidia-smi
#cuda一键安装脚本
root@gpuserver009:~# cat cuda-install.sh
#!/bin/bash
mkdir /cuda
cd /cuda
apt-get install -y wget
wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb
mv cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64-deb cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
dpkg -i cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
apt update
apt install -y cuda
rm -rf cuda-repo-ubuntu1604-9-0-local_9.0.176-1_amd64.deb
echo "please reboot"
#如何检查cuda安装成功
#如果需要更新驱动版本,需要先卸载原有cuda,再安装新的cuda
#cuda一键卸载脚本
root@gpuserver009:~# cat cuda-remove.sh
#!/bin/bash
apt-get autoremove -y --purge nvidia*
apt-get autoclean
conda是用于python项目做多版本环境创建与切换的,用于切换不同版本的python虚拟环境。
#miniconda安装脚本
root@gpuserver009:~# cat miniconda-install.sh
#!/bin/bash
mkdir /miniconda
cd /miniconda
apt-get install -y wget
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
chmod a+x Miniconda3-latest-Linux-x86_64.sh
bash ./Miniconda3-latest-Linux-x86_64.sh
#安装目录采用"/opt/miniconda3"
ln -s /opt/miniconda3/bin/* /usr/local/bin
rm -rf Miniconda3-latest-Linux-x86_64.sh
NVIDIA cudnn是用于深度神经网络的GPU加速库。它强调性能、易用性和低内存开销。简单的,插入式设计可以让开发人员专注于设计和实现神经网络模型,而不是调整性能,同时还可以在GPU上实现高性能现代并行计算。

#cudnn安装脚本
root@gpuserver009:~# cat cudnn.sh
#!/bin/bash
apt-get install python-pip
pip install --upgrade pip
pip install numpy
mkdir /cudnn
cd /cudnn
wget https://developer.download.nvidia.com/compute/machine-learning/cudnn/secure/v7.4.1.5/prod/9.0_20181108/cudnn-9.0-linux-x64-v7.4.1.5.tgz?KZAFNBHsM0A3Rbtgtu_pHef5uygqb8bJfkXxsgq6BVeJYHVf1l5rYZjtDuHPQ6IInV84m8XaAq8IvL5GyOZ2S7jMzchBeI6o62e-yYwLGWPZYwT48s-3aCRHwY24MT_DZC_3XcgR5HRdBQsduwKIEyLVZZFusaYnWYZ2DJpXkoAiEsXevNRrRW5sCmS_NnVdYwQAdgVsAm2Qaom2dEwObw
tar xf cudnn-9.0-linux-x64-v7.4.1.5.tgz
cd cuda/
cp include/cudnn.h /usr/local/cuda/include
cp lib64/libcudnn* /usr/local/cuda/lib64
cd /usr/local/cuda/lib64
rm -rf libcudnn.so
rm -rf libcudnn.so.7
ln -s libcudnn.so.7.4.1 libcudnn.so.7
ln -s libcudnn.so.7 libcudnn.so
echo "export PATH=/usr/local/cuda/bin:$PATH" >> /etc/profile
echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> /etc/profile
echo "please source /etc/profile"
TensorFlow是人工智能AI领域的一个重要软件工具,是技术层中的学习框架。你可以用它来处理大量数据,快速建立数学模型,这些模型可以完成智能功能等。
#tensorflow安装脚本
root@gpuserver009:~# cat tensorflow.sh
#!/bin/bash
mkdir /tensorflow
cd /tensorflow
wget https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
cd /data/
mkdir -p sharedata/installation_packages
chmod -R 775 sharedata
groupadd shannon
chown -R :shannon sharedata
cd sharedata/installation_packages/
cp /tensorflow/tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl .
chmod 775 tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
chown :shannon tensorflow_gpu-1.12.0-cp36-cp36m-linux_x86_64.whl
注意:如果下载tensorflow非常慢,可以把DNS更换为1.1.1.1试试,或者换成其他DNS试试。
root@gpuserver009:~# cat docker.sh
#docker-ce一键安装脚本
apt-get install expect opencc ack-grep
apt-get install -y apt-transport-https ca-certificates curl software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
add-apt-repository \
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
$(lsb_release -cs) \
stable"
apt-get update
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/ubuntu16.04/amd64/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
#以下3行是上一条tee命令的输出(即写入nvidia-docker.list的内容),不是需要执行的命令
deb https://nvidia.github.io/libnvidia-container/ubuntu16.04/$(ARCH) /
deb https://nvidia.github.io/nvidia-container-runtime/ubuntu16.04/$(ARCH) /
deb https://nvidia.github.io/nvidia-docker/ubuntu16.04/$(ARCH) /
apt-get update
apt-get install -y docker-ce
#NVIDIA docker安装脚本(库在docker安装的时候就安装好了)
apt-get install -y nvidia-docker
pkill -SIGHUP dockerd
docker run --runtime=nvidia --rm nvidia/cuda:9.0-base nvidia-smi
#如果报错"docker: Error response from daemon: OCI runtime create failed: unable to retrieve OCI runtime error (open /run/containerd/io.containerd.runtime.v1.linux/moby/a07397325389a72f8bcb423d959dffb10118f5a6694e10e6df5a20dbebf98f3c/log.json: no such file or directory): exec: "nvidia-container-runtime": executable file not found in $PATH: unknown."
#请执行以下内容
$ sudo apt-get install nvidia-container-runtime
#安装最新版docker-compose
curl -sSL -o /usr/local/bin/docker-compose https://code.aliyun.com/k9kdqvbb/files/raw/master/docker-compose-Linux-x86_64
chmod +x /usr/local/bin/docker-compose
#一键安装脚本
root@gpuserver001:/# cat NFS-server.sh
#!/bin/bash
#designed by ZhangSiming
apt-get update
apt install -y nfs-kernel-server
mkdir -p /data/nfsdata2
chown -R nobody:nogroup /data/nfsdata2
chmod -R 777 /data/nfsdata2
echo "/data/nfsdata2 172.22.0.0/16(rw,sync,no_root_squash,no_subtree_check)" >> /etc/exports
#一定要有no_subtree_check(不强制检查父目录)或者subtree_check(强制检查父目录)
exportfs -a
systemctl restart nfs-kernel-server
systemctl enable nfs-kernel-server
echo "NFS-server configured successfully!!"
echo "command:mount -t nfs `hostname -I | awk '{print $1}'`:/data/nfsdata2 /data/nfsdata2"
#查看开放挂载网段
root@gpuserver001:/# showmount -e localhost
Export list for localhost:
/data/nfsdata2 11.11.22.0/16
#一键安装挂载脚本
root@gpuserver001:/# cat NFS-client.sh
#!/bin/bash
#designed by ZhangSiming
apt-get install -y nfs-common
mkdir -p /data/nfsdata2
mkdir -p /data/nfsdata
#mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
zhangsiming@gpuserver003:~$ tail -4 /etc/rc.local
mount -t nfs 11.11.22.2:/data/nfsdata /data/nfsdata
mount -t nfs 11.11.22.1:/data/nfsdata2 /data/nfsdata2
exit 0
#不要写在/etc/fstab中