From 52deaaac46c1949872ee0cf904e307384f8ee391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=81=B0=E3=81=8B=E9=9B=AA?= <165359296+bakayukii@users.noreply.github.com> Date: Tue, 9 Apr 2024 16:51:26 +0900 Subject: [PATCH] feat(nvidia-device-plugin): add nvidia-device-plugin (#20132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description** Hello, This adds the nvidia-device-plugin preconfigured for 5 vcpu per pgpu. ⚒️ Fixes # **⚙️ Type of change** - [X] ⚙️ Feature/App addition - [ ] 🪛 Bugfix - [ ] ⚠️ Breaking change (fix or feature that would cause existing functionality to not work as expected) - [ ] 🔃 Refactor of current code **🧪 How Has This Been Tested?** **📃 Notes:** **✔️ Checklist:** - [X] ⚖️ My code follows the style guidelines of this project - [X] 👀 I have performed a self-review of my own code - [X] #️⃣ I have commented my code, particularly in hard-to-understand areas - [X] 📄 I have made corresponding changes to the documentation - [ ] ⚠️ My changes generate no new warnings - [ ] 🧪 I have added tests to this description that prove my fix is effective or that my feature works - [X] ⬆️ I increased versions for any altered app according to semantic versioning - [X] I made sure the title starts with `feat(chart-name):`, `fix(chart-name):` or `chore(chart-name):` **➕ App addition** If this PR is an app addition please make sure you have done the following. - [ ] 🖼️ I have added an icon in the Chart's root directory called `icon.png` --- _Please don't blindly check all the boxes. Read them and only check those that apply. 
Those checkboxes are there for the reviewer to see what is this all about and the status of this PR with a quick glance._ --------- Signed-off-by: bitpushr <91350598+bitpushr@users.noreply.github.com> Signed-off-by: Kjeld Schouten Co-authored-by: bitpushr <91350598+bitpushr@users.noreply.github.com> Co-authored-by: Kjeld Schouten --- .../system/nvidia-device-plugin/.helmignore | 30 ++++++ charts/system/nvidia-device-plugin/Chart.yaml | 44 +++++++++ charts/system/nvidia-device-plugin/README.md | 28 ++++++ .../nvidia-device-plugin/docs/installation.md | 93 ++++++++++++++++++ charts/system/nvidia-device-plugin/icon.png | Bin 0 -> 5100 bytes .../nvidia-device-plugin/templates/NOTES.txt | 1 + .../templates/common.yaml | 5 + .../system/nvidia-device-plugin/values.yaml | 38 +++++++ 8 files changed, 239 insertions(+) create mode 100644 charts/system/nvidia-device-plugin/.helmignore create mode 100644 charts/system/nvidia-device-plugin/Chart.yaml create mode 100644 charts/system/nvidia-device-plugin/README.md create mode 100644 charts/system/nvidia-device-plugin/docs/installation.md create mode 100644 charts/system/nvidia-device-plugin/icon.png create mode 100644 charts/system/nvidia-device-plugin/templates/NOTES.txt create mode 100644 charts/system/nvidia-device-plugin/templates/common.yaml create mode 100644 charts/system/nvidia-device-plugin/values.yaml diff --git a/charts/system/nvidia-device-plugin/.helmignore b/charts/system/nvidia-device-plugin/.helmignore new file mode 100644 index 00000000000..77ca5567b26 --- /dev/null +++ b/charts/system/nvidia-device-plugin/.helmignore @@ -0,0 +1,30 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +# OWNERS file for Kubernetes +OWNERS +# helm-docs templates +*.gotmpl +# docs folder +/docs +# icon +icon.png diff --git a/charts/system/nvidia-device-plugin/Chart.yaml b/charts/system/nvidia-device-plugin/Chart.yaml new file mode 100644 index 00000000000..e0f9ee7acfb --- /dev/null +++ b/charts/system/nvidia-device-plugin/Chart.yaml @@ -0,0 +1,44 @@ +annotations: + max_scale_version: 24.04.0 + min_scale_version: 23.10.0 + truecharts.org/SCALE-support: "false" + truecharts.org/category: operators + truecharts.org/max_helm_version: "3.14" + truecharts.org/min_helm_version: "3.11" + truecharts.org/train: system +apiVersion: v2 +appVersion: 0.0.1 +dependencies: + - name: common + version: 20.2.10 + repository: oci://tccr.io/truecharts + condition: "" + alias: "" + tags: [] + import-values: [] + - name: nvidia-device-plugin + version: 0.14.5 + repository: https://nvidia.github.io/k8s-device-plugin + condition: "" + alias: nvdp + tags: [] + import-values: [] +deprecated: false +description: NVIDIA device plugin for Kubernetes +home: https://truecharts.org/charts/system/nvidia-device-plugin +icon: https://truecharts.org/img/hotlink-ok/chart-icons/nvidia-device-plugin.png +keywords: + - nvidia + - plugins +kubeVersion: ">=1.24.0-0" +maintainers: + - name: TrueCharts + email: info@truecharts.org + url: https://truecharts.org +name: nvidia-device-plugin +sources: + - https://cert-manager.io/ + - https://github.com/NVIDIA/k8s-device-plugin?tab=readme-ov-file#deployment-via-helm + - https://github.com/truecharts/charts/tree/master/charts/system/nvidia-device-plugin +type: application +version: 0.14.5 diff --git a/charts/system/nvidia-device-plugin/README.md b/charts/system/nvidia-device-plugin/README.md new file mode 100644 index 00000000000..e0ed454cbce --- /dev/null +++ 
b/charts/system/nvidia-device-plugin/README.md @@ -0,0 +1,28 @@ +--- +title: README +--- + +## General Info + +TrueCharts can be installed either as _normal_ Helm Charts or as Apps on TrueNAS SCALE. +However, only installations using the TrueNAS SCALE Apps system are supported. + +For more information about this App, please check the docs on the TrueCharts [website](https://truecharts.org/charts/system/nvidia-device-plugin) + +**This chart is not maintained by the upstream project and any issues with the chart should be raised [here](https://github.com/truecharts/charts/issues/new/choose)** + +## Support + +- Please check our [quick-start guides for TrueNAS SCALE](https://truecharts.org/manual/SCALE/guides/scale-intro). +- See the [Website](https://truecharts.org) +- Check our [Discord](https://discord.gg/tVsPTHWTtr) +- Open an [issue](https://github.com/truecharts/charts/issues/new/choose) + +--- + +## Sponsor TrueCharts + +TrueCharts can only exist due to the incredible effort of our staff. +Please consider making a [donation](https://truecharts.org/sponsor) or contributing back to the project in any way you can! + +_All Rights Reserved - The TrueCharts Project_ diff --git a/charts/system/nvidia-device-plugin/docs/installation.md b/charts/system/nvidia-device-plugin/docs/installation.md new file mode 100644 index 00000000000..efc50d51a8f --- /dev/null +++ b/charts/system/nvidia-device-plugin/docs/installation.md @@ -0,0 +1,93 @@ +# Talos Linux Setup + +## Enable NVIDIA kernel modules +Before installing the device plugin, some initial steps need to be taken per +[Talos Documentation][1]. Please make sure you have installed the correct system +extensions through a combination of patches + the correct [factory image][2] for your +use case. 
+ +example gpu-worker-patch.yaml +```yaml +machine: + kernel: + modules: + - name: nvidia + - name: nvidia_uvm + - name: nvidia_drm + - name: nvidia_modeset + sysctls: + net.core.bpf_jit_harden: 1 +``` + +### Quick Sanity Check +If running these commands does not produce similar output, you haven't set up base +system completely: +``` +❯ talosctl read /proc/modules +nvidia_uvm 1482752 - - Live 0xffffffffc3b4e000 (PO) +nvidia_drm 73728 - - Live 0xffffffffc3b3b000 (PO) +nvidia_modeset 1290240 - - Live 0xffffffffc39dc000 (PO) +nvidia 56602624 - - Live 0xffffffffc03e0000 (PO) + +❯ talosctl get extensions +NODE NAMESPACE TYPE ID VERSION NAME VERSION +192.168.2.104 runtime ExtensionStatus 0 1 nonfree-kmod-nvidia 535.129.03-v1.6.7 +192.168.2.104 runtime ExtensionStatus 2 1 nvidia-container-toolkit 535.129.03-v1.13.5 +192.168.2.104 runtime ExtensionStatus 4 1 schematic a22f54cdf137d9d058e9a399adecf4bab2f3cc74b15b5bee00005811433e06b0 +192.168.2.104 runtime ExtensionStatus modules.dep 1 modules.dep 6.1.82-talos +``` + +## Create NVIDIA runtime class: +You will need to add this runtime class to pods you wish to add GPU resources to. +``` +❯ cat <uqB}yOoOV&7e02^%B07gtLWmHFl0)1@PvSxxqMRBf z>gm0=(?yH!$?s|N{GR#D&S&PeyZ`L$?7Vkh85=&Nr{$m}At9mHee}STgoG6SFQXv; zOX9o3R{wrt#s-hIcXxL1Cx6$Tcu;G)b4v{n#tIAtmN)?o%Uh>a;CJOg;^ z|8dIFD3X6g{2%-ncB@9@KhtngeToLa&mvIByQ2-gYn&(%>7-;LyX3pAwPpq*cy4#m znh~n$YY)NIn(%Ei7(}BWy&;;=HTdtZdT`Jz#yY&*<{V2w+T6IAl@b1A7dGU#ONMJe zOb6lxu5#6OlnW8v808MnhUXv#5$>T599_URVTfOHc}T^QGo1N6T{BRp@K(745xrqq z;mr$n0A%Y&0!uG{quQ~Is9p_*PR;YUUC?;|WD@|~n#?DBRW9>p1h0qvyzdJiJ$0Vp zfZydu$pBF6+@;oJy#e5XCkJTubNd+}af@6Bl&rtl5)|yRVTER(pdABGZ9aUMk-_$` zP9StUtz-9Wk&1>;5bdfRua5V@1Zu~^1U+_#Ytf|HL}H70S;xzlwXP>?mtUJ zb!&xf!!`>gk}`8! 
zTU)rf?=5+VzY{Zsa!whXTS3(`D3uk|W-oEYpDU#q?Y4MsIgj*7Iy) zBsai)Vk$Lq@WH|Jd0v$PIpNKzJh+;wyd*7G6~IvFPieaZG*ZqZ+22Ng{1EAi`t96$ z-4CQ~BQmLc$eFlh6|&WH>-7y>#@jZXH!iKu#BZwbw0s7lHfo`iq)jS)#>GjLoVWRz zKy6Mh0oLK33gq;6OhRKk>v9b~NVEr#$Rn`P>B6^rzeG~uZR_GZ-C(u`@$C~5+18Y; z!KV5<>`)D@wfov0Gr(Pu2Q7vVF>w10T|2%u#DwnSg6-@0sfvC?@|;(c3F zigPHss&m){<4=INC;!;fb_M5$XBHZCPgTgNo1VnpFtBAkVYt9cS9;ooQZzHM38Z`am&}tlPtw|zEz2^5AtPj^W6c>O*G zHPZ?_=oP1-Q;X&(0vtWPQ2tDU(v>hy>liwUkpT+m*;gW5xWY^ zni&a<$hiTZK`XVs0m0O_H!t2W%8#t5ckk!#n2719klTf4c9C3&`n_bDY31oG>d#ld zv1#S6y|mkup$_$MJB6K?W@vaC%SE2$Y$7=?9mibOgS8g9cAp|OA7`e|-#{U6`UzSU zT4(K$=Kjo0ojj#LHKcSq0@;)_)`-{UiaG8hwScrkE~BLPnn2K{$igpUY0IdF{E=cA z=keOQx1p|nP~_Dd;GcjE9rrkjOSzmRVV`_Anzo~aOgF;eEH@|1=k z9wwFAYk8k-^ZD&PQ97!0Di_&=N1rCYJyj{~*PObtq~@=*PaQ;)&}4@wB3BKezUWHW zof1_5*|K$B0DO-CZxzVsibYh@p&E$uBxJ+`T(}9IJY>V7r2S*qK5RDRr^^Y8P6dQ7 zYj)zO)6tDk@`_M&=*woJ$r0FT-F^Ts^%*9+Orz(L=A(OW0yi$(m-|SsGqy891f^bp zyxoaKRAKdtlN2deDx`jgU4kC$jo*)=T|@*dM2_|W!e&)Lk1A~FwgE}?F5t+a48Qj+0q`x7HV-Ir6iIpP9Cw!`3vFd93M@5m&Ix+}{?^2EZLVCe=6U z3Hl*G+`;Ym!1GSUuM0n>yTl}*O+BwYd)5sSuw9tSTj&zUH9|w5Yk8ExzMceq6!wuw zo2FgeT#-xH=Yi|CUPW8vT}}_*MP;dj$B_Xc#&S(SP|{o~XL)$# zgyZgBUgmmmDKj;+SC|4U=v-p0b(++9?jf1P)i#7n2Sg2rsFrO}Ox%wfqmA~V>(m#+Yi{3Y)(vxu8a8L0>w{IRAtVaoUGQLF_rPd zyZDu<(FI~vEv1`%r0#5pj;YQA$!=C=3xri`wj@oAAaDet1ye3USAAh0bZ+TcG(He0 zT467>Wd?VNznWbZ)+BNmYQHg^V)z-G-nV(Dqx3vd_%xvwtFJUbJ~WT}WgJE#@c6 z>)Cer**r0g>Hn&4oqpGXF3B`3G)CqgQVk^A{bC+)R8P|sCz5m^n?LRZ8O2dG`uglayE%5zhRqgyD;RBydZMXVNLV=_xLC&{!A_1f7g@a(1|yI6;-~T z*WfgpO-b3%y`c^!sp7Y zGuYyq?-p(6b(`3USMz#0MdCbqSxYy5j#=W!olupaP4p_*y|R z=ILCa_kSd&kM?ZnIT2v6A#0OY>iDY#usbbEHV0$6QKm>NWcV-1eTE{qtPLpy75|$A zM>k5qZ&8R4bdUzzCeH?{SLPP4BUo26u12F)lEXc@ zUK$$O-DHAYcYF&mr%)ZEz0&P!na=KS*bDWovg6LMGtpeue9Rg_jb4hTR>Jd$Aw{_^ z*dW4Vi=ecD*xW=3NzoUH42}o+a}hhad+l@50=b4s)qa+&Y}e$|f{OT9jPi~7tHyuG zCmZ|aq@xUdev`*s-9uif(BS5#$xSIPRb#HA!$HK+X}eDq*V(wEydLSb`d?`4#0W?B z+;+ko8FAjE2I%Lx!#z&Y&vWP#`GSu)dOf9(#oFN|^TtgVxGl-6gu7(nnAW;$Z&|J; 
zKo#+M8D|XF1?I#eZH0=$QJP^yiNiW@_`m_5ewDx<5jnQdr>AR;NA*^s4P7P$B_I!d zy13b5=3EaJD?9(gm*I2@pTPjeqDS!GFt_3PgPT?x!ipzafvt*Zg#pEqvScB7cWHU0 zsB@Ut7H-VwZa)s`$Bf-&szum5<919W%>R}mB}m4@++E}Ff=YFT`~-$LNc~jnS~G1Y z11e9~YQDVdxN(Oxd8XyDG0LS!UcLH`Bw0H;%KG_)O%{Zh)VcDq}_H-o$Y?(ZW#D|%)OEC^*znY*E6VypKjonK0c z%tDPI<0R*sam@20IkDk(PDkb-qAC|*?p#~*ew1vka=CyF7Zb5$tGB9Fb>>*A<0&as zAdV>dYJM^PEw}hN5~Gp(ArDKRTkX`>s^xA_Yoli;3;H#{)QWCa#aXGb ztNHY2$Hfh1e#zd_b4$oyi~51ONmM=X?UzWOZ|G7<)8P)T7EcO@|6Gj~qOfB4e%g5q z4BNjWXH@Vz&2CFC50rXs(T816)qapu^S#TNmr`7zIob@N-}ceSB7dRfYg4LGZpi>~ z%(UwlUnInhla_wXZ$)Sei^s@$jHh?4=s7+~fAQ{jT!SnT1wXj?G4OFcI+hY8B1ybR z?dtSjL|bs;`0qk?56~v-HnMUAWEhAVQ9yuSu(8giUlFQJ3XV=xUA*$?pA){Gykx?? zZzh!90Er=14;FNP<9LB6MP@(U1%jie*;yAqma(1>^Fa=<)|h@hmd!fd!S1dp;Rh|_ zSwL7zxYno`E5-5j<-Ecw@nI~M)s$OaY*%H};{=JQm z&$o=``7!Q0dr$8(a1-&8ucHU;<8H|PQt3rf6;$Par;ZHg>Q$xK;yktTdG(ecps8ta z%|e3z_kv0OxiwnBcaC@ow&k|t@!DSK8QaA+xym5~2bhbe7z{HYmzI)|}LpVf0x zjKGOX3jG~rgxgOr5D`S$OT*U#U)R(6MjllJOS)-D^9EaDxPL!J;U?6r-iLEVV_hMe ztq?9 zo%mEB7!S9t8z^2+V6K{2X}rQ#J?Exk1|9e0^J(nK|5h*7I9BYzH9+<`5a+)1P_Y2l z%VznWO8FwuE5Xtcj05jPWIhhrf0yl8UbWbm#f;mEkG;25AZi_zCRKqMl-apgfiT~) zN*^E`HEC$3d5cFP2Nh7Ue1{XhdC!OpU4Ggay)OmY?pqhn;1^2rVmQ9*Cz70dXz zP10&sqS_=Ym0!er33=;V&Q9X8{g;pt|M|q`<8JXwmmg|)cn>EP7?XP#A$9|Y6|`eN z42?K6ikWRpNy@p%Q=8OmY~0&E;&v((&7RE%ADUKO2|p}bFm+~C>7{J_uw1t!oRt{W zVYIWAB&}AcZxlCHn6^Qq1Q8}u8_V;{8qN{EA0IO*q^&ewH^s4{> literal 0 HcmV?d00001 diff --git a/charts/system/nvidia-device-plugin/templates/NOTES.txt b/charts/system/nvidia-device-plugin/templates/NOTES.txt new file mode 100644 index 00000000000..efcb74cb772 --- /dev/null +++ b/charts/system/nvidia-device-plugin/templates/NOTES.txt @@ -0,0 +1 @@ +{{- include "tc.v1.common.lib.chart.notes" $ -}} diff --git a/charts/system/nvidia-device-plugin/templates/common.yaml b/charts/system/nvidia-device-plugin/templates/common.yaml new file mode 100644 index 00000000000..acef5795b3c --- /dev/null +++ b/charts/system/nvidia-device-plugin/templates/common.yaml @@ -0,0 +1,5 @@ +{{/* 
Make sure all variables are set properly */}} +{{- include "tc.v1.common.loader.init" . }} + +{{/* Render the templates */}} +{{ include "tc.v1.common.loader.apply" . }} \ No newline at end of file diff --git a/charts/system/nvidia-device-plugin/values.yaml b/charts/system/nvidia-device-plugin/values.yaml new file mode 100644 index 00000000000..2ce65510a3e --- /dev/null +++ b/charts/system/nvidia-device-plugin/values.yaml @@ -0,0 +1,38 @@ +image: + repository: tccr.io/tccr/scratch + pullPolicy: IfNotPresent + tag: latest + +# Do not install this chart until the node has been prepared as described in our docs or the Talos docs! +# ref - https://www.talos.dev/v1.6/talos-guides/configuration/nvidia-gpu-proprietary/ + +# need to override naming: expandObjectName is false below, presumably so the ConfigMap keeps this literal name - TODO confirm against the common library +configmap: + nvidia-device-plugin-configs: + enabled: true + expandObjectName: false + data: + # "replicas" is the number of time-sliced shares advertised per original (physical) resource + # example: 2 physical GPUs * 5 replicas = 10 vGPUs + config: | + version: v1 + sharing: + timeSlicing: + resources: + - name: nvidia.com/gpu + replicas: 5 + +nvdp: + runtimeClassName: nvidia + gfd: + enabled: true +# Do not edit below this line: the main service and main workload are switched off here +service: + main: + enabled: false + ports: + main: + enabled: false +workload: + main: + enabled: false