I'm setting up a cluster with personal machines. I installed CentOS 7 on the server, and I'm trying to start the Slurm clients, but when I ran this command:
pdsh -w n[00-09] systemctl start slurmd
I got this error:
n07: Job for slurmd.service failed because the control process exited with error code. See "systemctl status slurmd.service" and "journalctl -xe" for details.
pdsh@localhost: n07: ssh exited with exit code 1
I got that message for all the nodes. Here is the detailed status of the slurmd service:
[root@localhost ~]# systemctl status slurmd.service -l
● slurmd.service - Slurm node daemon
Loaded: loaded (/usr/lib/systemd/system/slurmd.service; enabled; vendor preset: disabled)
Active: failed (Result: exit-code) since Tue 2020-12-22 18:27:30 CST; 27min ago
Process: 1589 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=203/EXEC)
Dec 22 18:27:30 localhost.localdomain systemd[1]: Starting Slurm node daemon...
Dec 22 18:27:30 localhost.localdomain systemd[1]: slurmd.service: control process exited, code=exited status=203
Dec 22 18:27:30 localhost.localdomain systemd[1]: Failed to start Slurm node daemon.
Dec 22 18:27:30 localhost.localdomain systemd[1]: Unit slurmd.service entered failed state.
Dec 22 18:27:30 localhost.localdomain systemd[1]: slurmd.service failed.
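As far as I understand, status=203/EXEC from systemd means the ExecStart binary could not be executed at all (for example /usr/sbin/slurmd missing or not executable on that node). A quick sanity check I could run (just a sketch, assuming passwordless SSH as root to the compute nodes, as with the pdsh command above):
# check that the slurmd binary exists and is executable on every node
pdsh -w n[00-09] 'ls -l /usr/sbin/slurmd'
# print the slurmd version on each node (this fails if the binary cannot run)
pdsh -w n[00-09] '/usr/sbin/slurmd -V'
# pull the last slurmd journal entries from each node
pdsh -w n[00-09] 'journalctl -u slurmd --no-pager -n 20'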
This is the slurm.conf file:
ClusterName=linux
ControlMachine=localhost
#ControlAddr=
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/var/spool/slurm/ctld
SlurmdSpoolDir=/var/spool/slurm/d
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
#TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
#SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
#LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
#ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
#COMPUTE NODES
# OpenHPC default configuration
TaskPlugin=task/affinity
PropagateResourceLimitsExcept=MEMLOCK
AccountingStorageType=accounting_storage/filetxt
Epilog=/etc/slurm/slurm.epilog.clean
NodeName=n[00-09] Sockets=1 CoresPerSocket=6 ThreadsPerCore=2 State=UNKNOWN
PartitionName=normal Nodes=n[00-09] Default=YES MaxTime=24:00:00 State=UP
ReturnToService=1
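For completeness, two other checks I can run to make sure the node definition and the munge authentication are consistent (a sketch; n00 is just one compute node used as an example):
# on a compute node: print the hardware layout slurmd detects,
# to compare with the NodeName=n[00-09] line above
slurmd -C
# from the control machine: verify that a munge credential created here
# can be decoded on a compute node (requires the same munge key everywhere)
munge -n | ssh n00 unmunge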
ControlMachine was set to the output of hostname -s on the control machine.
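In other words (a quick check, assuming the shell prompt above reflects the real short hostname):
# on the control machine
hostname -s   # should print "localhost", matching ControlMachine=localhost above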