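Hello, I'm trying to use the gpushare device plugin only to expose the gpu_mem resource (in MiB) from a k8s GPU node. I have all the NVIDIA components (drivers, nvidia-container-runtime, etc.) installed, and everything works fine except for one thing. For example, for a test pod, the OCI runtime config generated for its container looks like this (see the NVIDIA_VISIBLE_DEVICES entries marked below):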
{
"ociVersion": "1.0.1-dev",
"process": {
"user": {
"uid": 0,
"gid": 0
},
"args": [
"/bin/sh",
"-c",
"./vectorAdd"
],
"env": [
"PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"HOSTNAME=gpu-test-bald",
"NVIDIA_VISIBLE_DEVICES=no-gpu-has-151MiB-to-run", < ------ Here it is
"ALIYUN_COM_GPU_MEM_IDX=-1",
"ALIYUN_COM_GPU_MEM_POD=151",
"ALIYUN_COM_GPU_MEM_CONTAINER=151",
"ALIYUN_COM_GPU_MEM_DEV=32768",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_PORT=8890",
"TEXT_DETECTOR_STAGING_SERVICE_HOST=10.62.55.112",
"TEXT_DETECTOR_STAGING_SERVICE_PORT=8890",
"TEXT_DETECTOR_STAGING_PORT=tcp://10.62.55.112:8890",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP=tcp://10.62.55.112:8890",
"KUBERNETES_SERVICE_HOST=10.62.0.1",
"KUBERNETES_PORT_443_TCP=tcp://10.62.0.1:443",
"KUBERNETES_PORT_443_TCP_PORT=443",
"TEXT_DETECTOR_STAGING_SERVICE_PORT_HTTP=8890",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_PROTO=tcp",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_ADDR=10.62.55.112",
"KUBERNETES_PORT_443_TCP_ADDR=10.62.0.1",
"KUBERNETES_SERVICE_PORT=443",
"KUBERNETES_SERVICE_PORT_HTTPS=443",
"KUBERNETES_PORT=tcp://10.62.0.1:443",
"KUBERNETES_PORT_443_TCP_PROTO=tcp",
"CUDA_VERSION=8.0.61",
"CUDA_PKG_VERSION=8-0=8.0.61-1",
"LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
"LIBRARY_PATH=/usr/local/cuda/lib64/stubs:"
],
"cwd": "/usr/local/cuda/samples/0_Simple/vectorAdd",
"capabilities": {
"bounding": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"effective": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"inheritable": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
],
"permitted": [
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE"
]
},
"oomScoreAdj": 1000
},
"root": {
"path": "/var/lib/docker/overlay2/5b9782752b5d79f2d3646b92e41511a3b959f3d2e7ed1c57c4e299dfb8cd6965/merged"
},
"hostname": "gpu-test-bald",
"mounts": [
{
"destination": "/proc",
"type": "proc",
"source": "proc",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/dev",
"type": "tmpfs",
"source": "tmpfs",
"options": [
"nosuid",
"strictatime",
"mode=755",
"size=65536k"
]
},
{
"destination": "/dev/pts",
"type": "devpts",
"source": "devpts",
"options": [
"nosuid",
"noexec",
"newinstance",
"ptmxmode=0666",
"mode=0620",
"gid=5"
]
},
{
"destination": "/sys",
"type": "sysfs",
"source": "sysfs",
"options": [
"nosuid",
"noexec",
"nodev",
"ro"
]
},
{
"destination": "/sys/fs/cgroup",
"type": "cgroup",
"source": "cgroup",
"options": [
"ro",
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/dev/mqueue",
"type": "mqueue",
"source": "mqueue",
"options": [
"nosuid",
"noexec",
"nodev"
]
},
{
"destination": "/dev/termination-log",
"type": "bind",
"source": "/var/lib/kubelet/pods/685974b9-5eb0-11ed-bada-001eb9697543/containers/cuda-vector-add/8473aa30",
"options": [
"rbind",
"rprivate"
]
},
{
"destination": "/etc/resolv.conf",
"type": "bind",
"source": "/var/lib/docker/containers/a9b9ee7c563781578218738165e6089442e0d24bdb28ed8c320c40817680f9f7/resolv.conf",
"options": [
"rbind",
"rprivate"
]
},
{
"destination": "/etc/hostname",
"type": "bind",
"source": "/var/lib/docker/containers/a9b9ee7c563781578218738165e6089442e0d24bdb28ed8c320c40817680f9f7/hostname",
"options": [
"rbind",
"rprivate"
]
},
{
"destination": "/etc/hosts",
"type": "bind",
"source": "/var/lib/kubelet/pods/685974b9-5eb0-11ed-bada-001eb9697543/etc-hosts",
"options": [
"rbind",
"rprivate"
]
},
{
"destination": "/dev/shm",
"type": "bind",
"source": "/var/lib/docker/containers/a9b9ee7c563781578218738165e6089442e0d24bdb28ed8c320c40817680f9f7/mounts/shm",
"options": [
"rbind",
"rprivate"
]
},
{
"destination": "/var/run/secrets/kubernetes.io/serviceaccount",
"type": "bind",
"source": "/var/lib/kubelet/pods/685974b9-5eb0-11ed-bada-001eb9697543/volumes/kubernetes.io~secret/default-token-thv9d",
"options": [
"rbind",
"ro",
"rprivate"
]
}
],
"hooks": {
"prestart": [
{
"path": "/usr/bin/nvidia-container-runtime-hook",
"args": [
"/usr/bin/nvidia-container-runtime-hook",
"prestart"
]
}
]
},
"linux": {
"resources": {
"devices": [
{
"allow": false,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 1,
"minor": 5,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 1,
"minor": 3,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 1,
"minor": 9,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 1,
"minor": 8,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 5,
"minor": 0,
"access": "rwm"
},
{
"allow": true,
"type": "c",
"major": 5,
"minor": 1,
"access": "rwm"
},
{
"allow": false,
"type": "c",
"major": 10,
"minor": 229,
"access": "rwm"
}
],
"memory": {
"disableOOMKiller": false
},
"cpu": {
"shares": 2,
"period": 100000
},
"blockIO": {
"weight": 0
}
},
"cgroupsPath": "kubepods-besteffort-pod685974b9_5eb0_11ed_bada_001eb9697543.slice:docker:664e21c310b62b2e1c3537388127812c7e2f482cb5cf40fa52280e3b62cf2646",
"namespaces": [
{
"type": "mount"
},
{
"type": "network",
"path": "/proc/27057/ns/net"
},
{
"type": "uts"
},
{
"type": "pid"
},
{
"type": "ipc",
"path": "/proc/27057/ns/ipc"
}
],
"maskedPaths": [
"/proc/acpi",
"/proc/kcore",
"/proc/keys",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
"/proc/scsi",
"/sys/firmware"
],
"readonlyPaths": [
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger"
]
}
}
{
"ociVersion": "1.0.1-dev",
"process": {
"user": {
"uid": 0,
"gid": 0
},
"args": [
"/bin/sh",
"-c",
"./vectorAdd"
],
"env": [
"PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
"HOSTNAME=gpu-test-bald",
"ALIYUN_COM_GPU_MEM_DEV=32768",
"NVIDIA_VISIBLE_DEVICES=no-gpu-has-153MiB-to-run", <----------Here it is
"ALIYUN_COM_GPU_MEM_IDX=-1",
"ALIYUN_COM_GPU_MEM_POD=153",
"ALIYUN_COM_GPU_MEM_CONTAINER=153",
"NVIDIA_VISIBLE_DEVICES=all", <-------------------Here it is
"TEXT_DETECTOR_STAGING_PORT_8890_TCP=tcp://10.62.55.112:8890",
"KUBERNETES_SERVICE_PORT_HTTPS=443",
"KUBERNETES_PORT=tcp://10.62.0.1:443",
"TEXT_DETECTOR_STAGING_SERVICE_HOST=10.62.55.112",
"TEXT_DETECTOR_STAGING_SERVICE_PORT=8890",
"KUBERNETES_SERVICE_PORT=443",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_ADDR=10.62.55.112",
"KUBERNETES_SERVICE_HOST=10.62.0.1",
"KUBERNETES_PORT_443_TCP=tcp://10.62.0.1:443",
"TEXT_DETECTOR_STAGING_PORT=tcp://10.62.55.112:8890",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_PORT=8890",
"KUBERNETES_PORT_443_TCP_PROTO=tcp",
"KUBERNETES_PORT_443_TCP_PORT=443",
"KUBERNETES_PORT_443_TCP_ADDR=10.62.0.1",
"TEXT_DETECTOR_STAGING_SERVICE_PORT_HTTP=8890",
"TEXT_DETECTOR_STAGING_PORT_8890_TCP_PROTO=tcp",
"CUDA_VERSION=8.0.61",
"CUDA_PKG_VERSION=8-0=8.0.61-1",
"LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64",
"LIBRARY_PATH=/usr/local/cuda/lib64/stubs:"
],
...