标题:Ansible 在 HPC 集群的快速落地(账户/密钥/目录/Slurm/OOD/监控)
先用 raw 模块安装 python;之后使用 user、authorized_key、file、template、service、acl、mysql_* 等模块
hpc-ansible/
├── inventories/hosts.ini
├── group_vars/
│   ├── all.yml
│   ├── controller.yml
│   └── computes.yml
├── roles/
│   ├── base/        # 通用:时区、ntp、包、sudo、selinux
│   ├── users/       # 组/用户/authorized_keys/共享ACL
│   ├── ssh/         # sshd 硬化与重载
│   ├── munge/       # munge.key 分发与服务
│   ├── slurm/       # slurm.conf/StateSave/服务
│   ├── ood/         # OOD_DATAROOT & PUN 重启
│   ├── exporters/   # node/slurm/dcgm 等
│   └── monitoring/  # prometheus/grafana(如独立主机)
└── site.yml
inventories/hosts.ini
# Ansible INI inventory: four host groups plus one variable applied to every host.
# Hosts with ansible_host are reached at that IP; the inventory name is only an alias.
[controller]
epic-control-node ansible_host=10.0.0.10
# Kept in its own group, so it does not inherit vars scoped to [controller].
# NOTE(review): presumably the standby Slurm controller - confirm.
[controllers_backup]
slurm-ctrl-new ansible_host=10.0.0.11
# Range pattern: expands to compute01 ... compute32 under lab.example.
# No ansible_host is set, so these names must resolve; connections are made as root.
[computes]
compute[01:32].lab.example ansible_user=root
[ood]
ood-gateway ansible_host=10.0.0.20
# Variables in [all:vars] apply to every host in the inventory.
[all:vars]
# Pin the remote interpreter path instead of relying on interpreter discovery.
ansible_python_interpreter=/usr/bin/python3
group_vars/all.yml
# Variables shared by every host (group_vars/all.yml).

# Base numbers for UID/GID allocation; the explicit IDs below sit just above them.
uid_base: 10000
gid_base: 20000

# Shared paths
shared_ood_root: /shared/ood
slurm_state_dir: /shared/slurm/state
slurm_conf_src: files/slurm/slurm.conf

# Project group
project_group: projectA
project_gid: 20001

# User list: one mapping per account.
hpc_users:
  - name: alice
    uid: 10001
    groups_extra:
      - "{{ project_group }}"
    shell: /bin/bash
    lock_password: true
  - name: bob
    uid: 10002
    groups_extra:
      - "{{ project_group }}"
    shell: /bin/bash
    lock_password: true

# OOD
# Only {{ shared_ood_root }} is rendered by Jinja; "$USER" stays literal in the value.
# NOTE(review): presumably expanded later by the consumer of this template - confirm.
ood_dataroot_tpl: "{{ shared_ood_root }}/$USER/ondemand/data"