# worker-pod.yaml

# Pod manifest: one container limited to 8 GPUs, with two PVC-backed mounts
# and a memory-backed /dev/shm.
apiVersion: v1
kind: Pod
metadata:
  namespace: sfr-ns-yihaofeng
  name: work-pod-1
spec:
  restartPolicy: OnFailure

  # Scheduling: only nodes labelled with the V100 accelerator are eligible,
  # and the pod tolerates the NoSchedule taint gpu_num=gvnic-8.
  nodeSelector:
    cloud.google.com/gke-accelerator: nvidia-tesla-v100
  tolerations:
    - effect: NoSchedule
      key: gpu_num
      operator: Equal
      value: gvnic-8

  containers:
    - name: research-container
      # NOTE(review): ":latest" is a floating tag, so the image actually
      # pulled can change between runs -- consider pinning a version.
      image: gcr.io/deeplearning-platform-release/pytorch-gpu:latest
      # Runs `sleep 40d` through /bin/sh -c; presumably a placeholder that
      # keeps the container alive for interactive use -- confirm.
      command:
        - /bin/sh
        - '-c'
      args:
        - sleep 40d
      resources:
        # Only limits are declared; no explicit requests block is present.
        limits:
          cpu: '63'
          # NOTE(review): "G" is the decimal suffix (10^9 bytes); use "Gi"
          # if binary gibibytes were intended -- confirm.
          memory: 400G
          nvidia.com/gpu: 8
      volumeMounts:
        - name: sfr-home-pv-yihaofeng
          mountPath: /export/home
        - name: sfr-share-pv-yihaofeng
          mountPath: /export/share
        - name: dshm
          mountPath: /dev/shm

  volumes:
    # Persistent storage, each volume bound to an existing claim by name.
    - name: sfr-home-pv-yihaofeng
      persistentVolumeClaim:
        claimName: sfr-home-pvc-yihaofeng
    - name: sfr-share-pv-yihaofeng
      persistentVolumeClaim:
        claimName: sfr-share-pvc-yihaofeng
    # Memory-backed scratch volume, mounted at /dev/shm by the container.
    # NOTE(review): no sizeLimit is set on this volume -- confirm that an
    # unbounded memory-backed emptyDir is intended.
    - name: dshm
      emptyDir:
        medium: Memory