k8s install

1. OS: CentOS

Installing K8s

    0. Install the NVIDIA driver
# yum -y install gcc gcc-c++ make binutils libtool autoconf automake patch pkgconfig redhat-rpm-config gettext
# yum -y install epel-release

# Blacklist the nouveau driver: both files below should contain the two lines that follow
# cat /etc/modprobe.d/nvidia-installer-disable-nouveau.conf
# cat /etc/modprobe.d/blacklist.conf
blacklist nouveau
options nouveau modeset=0
# mv /boot/initramfs-$(uname -r).img /boot/initramfs-$(uname -r).img.bak
-> If the new initramfs file is not created (next command), an error occurs at boot.
# dracut -v /boot/initramfs-$(uname -r).img $(uname -r)
# reboot
# systemctl isolate multi-user.target
# sudo service lightdm stop
# sudo init 3

- Check the graphics driver
# lspci -k | grep -EA3 'VGA|3D|Display'
- Download the driver from https://www.nvidia.co.kr/Download/index.aspx?lang=kr
# chmod +x NVIDIA-Linux-x86_64-xxx.xx.run -> xxx.xx is the version of the downloaded NVIDIA graphics driver.
# ./NVIDIA-Linux-x86_64-xxx.xx.run
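Once the installer finishes, the driver can be verified with nvidia-smi (installed along with the driver):
# nvidia-smi
-> should list the GPU(s) and the driver version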



    1. Install nvidia-docker
sudo yum install -y docker docker-registry

distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo

sudo yum install -y nvidia-container-toolkit
sudo systemctl restart docker
or
yum install nvidia-docker2


# Test that nvidia docker runs
sudo systemctl daemon-reload
sudo systemctl restart docker
docker pull nvidia/cuda
sudo docker run --runtime=nvidia --rm nvidia/cuda nvidia-smi

    2. Docker configuration - this caused errors when applied; revisit later
$ sudo su
$ cat > /etc/docker/daemon.json <<EOF
{
  "exec-opts": ["native.cgroupdriver=systemd"],
  "log-driver": "json-file",
  "log-opts": {
    "max-size": "100m"
  },
  "storage-driver": "overlay2",
  "default-runtime": "nvidia",
    "runtimes": {
        "nvidia": {
            "path": "/usr/bin/nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}
EOF

$ mkdir -p /etc/systemd/system/docker.service.d
$ systemctl daemon-reload
$ systemctl restart docker
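To confirm the daemon picked up the settings, the active cgroup driver and runtimes can be checked (standard docker info fields):
$ docker info --format '{{.CgroupDriver}}'    # expect: systemd
$ docker info | grep -i nvidia                # nvidia should appear under Runtimes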

    3. Install k8s
# cat <<EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=1
repo_gpgcheck=1
gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
EOF

# sudo yum install -y kubelet kubeadm kubectl
-> If they are already installed, remove them first to change versions: sudo yum remove -y kubelet kubeadm kubectl
sudo yum install -y kubelet-1.15.5 kubeadm-1.15.5 kubectl-1.15.5
sudo yum install -y kubelet-1.14.5 kubeadm-1.14.5 kubectl-1.14.5
# On Ubuntu, with root privileges
apt-get update && apt-get install -y apt-transport-https curl
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
echo "deb http://apt.kubernetes.io/ kubernetes-xenial main" > /etc/apt/sources.list.d/kubernetes.list
apt-get update
apt-get install -y kubelet=1.15.5-00 kubeadm=1.15.5-00 kubectl=1.15.5-00
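To keep apt from silently upgrading the pinned versions later, the packages can be held (standard apt-mark usage):
apt-mark hold kubelet kubeadm kubectl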


sudo systemctl enable kubelet
sudo systemctl start kubelet

- set hostname on nodes
sudo hostnamectl set-hostname master-node
or
sudo hostnamectl set-hostname worker-node1

- Add a host entry or DNS record on every node so the hostnames resolve
192.168.1.10 master.phoenixnap.com master-node
192.168.1.20 node1.phoenixnap.com node1 worker-node
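Resolution can then be verified from each node:
getent hosts master-node
getent hosts node1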

-  Configure Firewall
sudo firewall-cmd --permanent --add-port=6443/tcp
sudo firewall-cmd --permanent --add-port=2379-2380/tcp
sudo firewall-cmd --permanent --add-port=10250/tcp
sudo firewall-cmd --permanent --add-port=10251/tcp
sudo firewall-cmd --permanent --add-port=10252/tcp
sudo firewall-cmd --permanent --add-port=10255/tcp
sudo firewall-cmd --reload
-> On Ubuntu
sudo ufw allow 6443/tcp
sudo ufw allow 2379-2380/tcp
sudo ufw allow 10250/tcp
sudo ufw allow 10251/tcp
sudo ufw allow 10252/tcp
sudo ufw allow 10255/tcp
sudo ufw allow from "IP to allow"
# Open HTTP/HTTPS
sudo ufw allow http
sudo ufw allow https
or
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp

sudo ufw status verbose

- Disable SELinux
This lets containers access the host filesystem.
sudo setenforce 0
sudo sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config

- Disable SWAP
sudo sed -i '/swap/d' /etc/fstab
sudo swapoff -a
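That swap is really off can be verified (swapon -s prints nothing when no swap is active):
sudo swapon -s
free -h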

Setup K8s

# alternative: minikube
# curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
# sudo install minikube-linux-amd64 /usr/local/bin/minikube
yum install -y kernel-devel kernel-headers
sudo yum install -y yum-plugin-copr
sudo yum -y copr enable ngompa/snapcore-el7
sudo yum -y install snapd    # snapd/copr are needed for snap-based installs such as microk8s
sudo ln -s /var/lib/snapd/snap /snap

    - Initialize Kubernetes

# Make sure the pod network CIDR does not overlap the existing network
sudo sysctl net.bridge.bridge-nf-call-iptables=1
sudo kubeadm init --pod-network-cidr=172.16.0.0/16  --apiserver-advertise-address="my server IP" --kubernetes-version=1.14.5

    -> error 
[ERROR FileContent--proc-sys-net-bridge-bridge-nf-call-iptables]: /proc/sys/net/bridge/bridge-nf-call-iptables contents are not set to 1
Fix
echo '1' > /proc/sys/net/bridge/bridge-nf-call-iptables
vi /etc/sysctl.conf
net.bridge.bridge-nf-call-iptables = 1
sysctl -p
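If the bridge-nf-call-iptables key is missing entirely, the br_netfilter module is probably not loaded; loading it (and persisting the load via systemd's modules-load.d) makes the sysctl available:
modprobe br_netfilter
echo br_netfilter > /etc/modules-load.d/br_netfilter.conf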
    -> error
[ERROR DirAvailable--etc-kubernetes-manifests]: /etc/kubernetes/manifests is not empty
[ERROR FileAvailable--etc-kubernetes-kubelet.conf]: /etc/kubernetes/kubelet.conf already exists
[ERROR Port-10250]: Port 10250 is in use
[ERROR FileAvailable--etc-kubernetes-pki-ca.crt]: /etc/kubernetes/pki/ca.crt already exists

* Fix
sudo systemctl stop kubelet
or 
sudo kubeadm reset
or
microk8s.reset
-> For the remaining three errors, delete the files/directories named in each message


mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
# alternatively, point kubectl at the kubelet kubeconfig directly
export KUBECONFIG=/etc/kubernetes/kubelet.conf
kubectl get nodes

# With Calico applied, the node moves to Ready; removing the master taint allows scheduling on the master node
$ kubectl taint nodes --all node-role.kubernetes.io/master-
$ kubectl apply -f https://docs.projectcalico.org/v3.11/manifests/calico.yaml
-> If that errors, download and patch calico.yaml manually:
curl -O https://docs.projectcalico.org/v3.9/manifests/calico.yaml
sed s/192.168.0.0\\/16/20.96.0.0\\/12/g -i calico.yaml
kubectl apply -f calico.yaml
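The rollout can be watched until the calico pods are Running and the node reports Ready:
kubectl get pods -n kube-system -w
kubectl get nodes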

-> kubeadm init prints a 'kubeadm join ...' command; run it on each worker (slave) node to join the cluster
sudo kubeadm join "server IP":6443 --token 1kjhqe.02wl7j80euplruj6 --discovery-token-ca-cert-hash sha256:8e95e3278fc0af5283760d65ae66adcff733fe25dfee1e651d64ae2d9217bf

    -> error
error execution phase preflight: couldn't validate the identity of the API Server: could not find a JWS signature in the cluster-info ConfigMap for token ID 
Fix 1
vi /etc/sysctl.conf
net.ipv6.conf.all.disable_ipv6 = 1
net.ipv6.conf.default.disable_ipv6 = 1
sysctl -p
Fix 2
# On CentOS/RedHat, disable the firewall
sudo systemctl stop firewalld
sudo systemctl stop iptables
sudo systemctl stop ip6tables
# 'disable' instead of 'stop' makes it permanent -> systemctl disable firewalld
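Fix 3 (another likely cause: bootstrap tokens expire after 24 hours by default)
# generate a fresh join command on the master and rerun it on the worker
kubeadm token create --print-join-command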

    -> error
[kubelet-check] The HTTP call equal to 'curl -sSL http://localhost:10248/~~' failed with error: Get http://localhost:10248/~~: dial tcp 127.0.0.1:10248: connect: connection refused.
error execution phase kubelet-start: error uploading crisocket: timed out waiting for the condition

# Add the line below to /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
Environment="KUBELET_EXTRA_ARGS=--fail-swap-on=false"

sudo systemctl daemon-reload
sudo systemctl restart kubelet
kubeadm init --ignore-preflight-errors=all    # --skip-preflight-checks was removed from newer kubeadm releases

    -> error
# sudo kubectl get nodes
error: no configuration has been provided, try setting KUBERNETES_MASTER environment variable
Fix
# add the line below to /etc/profile, then reload the profile
export KUBECONFIG=/etc/kubernetes/admin.conf
source /etc/profile

    -> error
error execution phase kubelet-start: error uploading crisocket: timed out waiting for the condition
Fix -> check SELinux
# How to check
getenforce
# Turn SELinux off
vi /etc/sysconfig/selinux
Change SELINUX=enforcing to SELINUX=disabled, then save.
reboot
# Or reset the node
kubeadm reset
ifconfig cni0 down && ip link delete cni0
ifconfig flannel.1 down && ip link delete flannel.1
rm -rf /var/lib/cni/

# istio-galley webhook readiness check (see the istio reference at the end of this page)
kubectl get --raw /api/v1/namespaces/istio-system/services/https:istio-galley:https-validation/proxy/ready -v9 --request-timeout=2s


StorageClass

    - Install Local Path Provisioner
    
# https://github.com/rancher/local-path-provisioner
kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/master/deploy/local-path-storage.yaml
# Verify the install
kubectl -n local-path-storage get pod
# Check the logs
kubectl -n local-path-storage logs -f "pod name, shown by the get pod command above"
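A minimal PVC can smoke-test the provisioner (the local-path class name comes from the rancher manifest; it uses WaitForFirstConsumer binding, so the claim stays Pending until a pod mounts it):
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: local-path-test
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: local-path
  resources:
    requests:
      storage: 1Gi
EOF
kubectl get pvc local-path-test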

    - Install the NFS Client provisioner
# Install Linuxbrew (prerequisites first)
yum install curl git irb m4 ruby texinfo bzip2-devel curl-devel expat-devel ncurses-devel zlib-devel
export PATH="$HOME/.linuxbrew/bin:$PATH"
export MANPATH="$HOME/.linuxbrew/share/man:$MANPATH"
export INFOPATH="$HOME/.linuxbrew/share/info:$INFOPATH"
source /etc/profile

# https://github.com/helm/charts/tree/master/stable/nfs-client-provisioner
# Install the nfs-client-provisioner package using helm
# Install the NFS server
sudo yum install -y nfs-utils
sudo systemctl enable rpcbind
sudo systemctl enable nfs-server
sudo systemctl start rpcbind
sudo systemctl start nfs-server
# Change the NFS permissions
# On the server, configure the directory to export
vi /etc/exports
/nfs/ *.*.*.*(rw,all_squash,sync,no_root_squash)
# Apply the changes
exportfs -r
-> alternative option set: (rw,insecure,sync,no_subtree_check,no_root_squash)
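The active exports can be double-checked on the server:
exportfs -v
showmount -e localhost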

- Format: <directory to export> <allowed IP(s)>(<options>)
# Options
ro               -> grants read-only access
rw               -> grants read and write access
root_squash      -> maps the client's root user to nobody on the server
no_root_squash   -> the client's root user is also root on the server
sync             -> writes changes synchronously
all_squash       -> maps every client user, not just root, to the anonymous account
# Apply
sudo systemctl restart nfs
chmod o+w "target directory"
# Register the services - with root privileges
systemctl restart rpcbind
systemctl start nfs-server
systemctl start nfs-lock
systemctl start nfs-idmap

systemctl enable rpcbind
systemctl enable nfs-server
systemctl enable nfs-lock
systemctl enable nfs-idmap

# Open the firewall
firewall-cmd --permanent --zone public --add-service mountd
firewall-cmd --permanent --zone public --add-service rpc-bind
firewall-cmd --permanent --zone public --add-service nfs
firewall-cmd --reload

nfsstat -s -> check mount activity on the server

# Client setup
sudo mount -t nfs "server IP":/nfs /nfs
touch /nfs/client.txt
yum install nfs-utils    # provides showmount; on Ubuntu: sudo apt-get install nfs-common
mount -t nfs <server>:<exported directory> <mount point>

showmount -e "NFS server IP"
# Restart NFS
sudo systemctl stop nfs-server
sudo systemctl start nfs-server
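To keep the client mount across reboots, an /etc/fstab entry can be added ("server IP" is a placeholder; _netdev delays mounting until the network is up):
"server IP":/nfs  /nfs  nfs  defaults,_netdev  0 0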

    -> To delete tiller
kubectl delete all -l app=helm -n kube-system

kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/master/deploy/local-path-storage.yaml
curl https://raw.githubusercontent.com/helm/helm/master/scripts/get > get_helm.sh
./get_helm.sh
kubectl -n kube-system create sa tiller
kubectl create clusterrolebinding tiller --clusterrole cluster-admin --serviceaccount=kube-system:tiller
helm init --service-account tiller
    -> error 
helm init: the command/flag no longer exists
Fix
# helm init was removed in Helm 3.x; downgrade to version 2
brew uninstall helm
brew install helm@2  
brew link --force helm@2
helm repo update

# From here on: install nfs-client-provisioner
helm repo update
helm install --name my-release --set nfs.server="server IP" --set nfs.path="NFS export path" stable/nfs-client-provisioner
kubectl patch storageclass nfs-client -p '{"metadata": { "annotations" : { "storageclass.kubernetes.io/is-default-class":"true"}}}'
# Verify the install
kubectl get storageclass

     - Install nvidia-gpu-plugin
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/1.0.0-beta6/nvidia-device-plugin.yml
# Verify the install
kubectl get pod -n kube-system
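With the device plugin pod running, GPU scheduling can be smoke-tested with a pod that requests one GPU (the nvidia/cuda:10.0-base image tag is an assumption; any CUDA base image of that era works):
cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-test
spec:
  restartPolicy: Never
  containers:
  - name: cuda
    image: nvidia/cuda:10.0-base
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1
EOF
kubectl logs gpu-test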


    - Create a Docker registry
We need a private Docker registry to store our Docker images.
docker pull registry:latest

wget https://raw.githubusercontent.com/mojokb/handson-kubeflow/master/registry/kubeflow-registry-deploy.yaml
wget https://raw.githubusercontent.com/mojokb/handson-kubeflow/master/registry/kubeflow-registry-svc.yaml

kubectl apply -f kubeflow-registry-deploy.yaml
kubectl apply -f kubeflow-registry-svc.yaml

 
Contents: see the kubeflow-registry-deploy.yaml file
Contents: see the kubeflow-registry-svc.yaml file

** If you download the deploy manifest, it only runs correctly after changing it to -> apiVersion: apps/v1

# Add the line below to /etc/hosts; use your own IP
10.X.X.X     kubeflow-registry.defalut.svc.cluster.local

# Check with this command that the registry is running
curl kubeflow-registry.defalut.svc.cluster.local:30000/v2/_catalog
{"repositories":[]} -> returned like this because no images are registered yet

# Allow the insecure private registry
# vi /etc/docker/daemon.json and add the following
"insecure-registries" : [
    "kubeflow-registry.defalut.svc.cluster.local:30000"
 ]

-> full file
{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    },
    "insecure-registries": ["kubeflow-registry.defalut.svc.cluster.local:30000"]
}

sudo systemctl restart docker

# Push an image
sudo docker login
sudo docker pull busybox
sudo docker tag busybox:latest kubeflow-registry.defalut.svc.cluster.local:30000/busybox:latest
sudo docker push kubeflow-registry.defalut.svc.cluster.local:30000/busybox:latest

# Check again
curl kubeflow-registry.defalut.svc.cluster.local:30000/v2/_catalog
{"repositories":["busybox"]}

k9s

k9s: a Kubernetes cluster management tool

wget https://github.com/derailed/k9s/releases/download/v0.19.6/k9s_Linux_x86_64.tar.gz
tar xvzf k9s_Linux_x86_64.tar.gz
sudo mv k9s /usr/bin
k9s 
-> launches a terminal-based UI for managing the cluster


Next: continue with kubeflow_install

References


  • Book: 쿠버네티스에서 머신러닝이 처음이라면! 쿠브플로우 (intro to Kubeflow, Korean)
  • NVIDIA driver install
    https://coding-chobo.tistory.com/20
  • nvidia-docker install
    https://github.com/NVIDIA/nvidia-docker
  • k8s install
    https://phoenixnap.com/kb/how-to-install-kubernetes-on-centos
  • NFS setup
    https://epdl-studio.tistory.com/43
  • Creating a Docker registry
    https://www.44bits.io/ko/post/running-docker-registry-and-using-s3-storage
  • How to remove Docker if the install goes wrong
    https://docs.docker.com/engine/install/centos/
  • istio install issues
    https://github.com/kubeflow/kfctl/issues/237