slurm

Slurm not setting --ntasks correctly


I set --ntasks=8 and --cpus-per-task=4 in my Slurm job script, but $SLURM_NTASKS is not set and $SLURM_TASKS_PER_NODE is 1, which is unexpected. Below is my test.sh script (the partition info is also printed in the job output):

#!/bin/bash
# SBATCH --partition=xeon-p8
# SBATCH --ntasks=8
# SBATCH --cpus-per-task=4
# SBATCH --ntasks-per-node=4
# SBATCH --nodes=2
# SBATCH --mem=8G
# SBATCH --time=00:02:00

# Print job and partition info
scontrol show job $SLURM_JOB_ID
scontrol show partition xeon-p8

echo "SLURM_TASKS_PER_NODE: $SLURM_TASKS_PER_NODE"
echo "SLURM_JOB_NODELIST: $SLURM_JOB_NODELIST"
echo "SLURM_JOB_ID: $SLURM_JOB_ID"
echo "SLURM_NODELIST: $SLURM_NODELIST"
echo "SLURM_NTASKS: $SLURM_NTASKS"

After running sbatch test.sh, the output file slurm-27688685.out contains:

JobId=27688685 JobName=test.sh
   UserId=rsun(63073) GroupId=rsun(63073) MCS_label=N/A
   Priority=10862 Nice=0 Account=default_group QOS=normal
   JobState=RUNNING Reason=None Dependency=(null)
   Requeue=1 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
   RunTime=00:00:00 TimeLimit=4-04:00:00 TimeMin=N/A
   SubmitTime=2024-12-31T00:48:48 EligibleTime=2024-12-31T00:48:48
   AccrueTime=2024-12-31T00:48:48
   StartTime=2024-12-31T00:48:53 EndTime=2025-01-04T04:48:53 Deadline=N/A
   PreemptEligibleTime=2024-12-31T00:48:53 PreemptTime=None
   SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-12-31T00:48:53 Scheduler=Main
   Partition=xeon-p8 AllocNode:Sid=login-3:1719444
   ReqNodeList=(null) ExcNodeList=(null)
   NodeList=d-19-13-4
   BatchHost=d-19-13-4
   NumNodes=1 NumCPUs=1 NumTasks=1 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
   ReqTRES=cpu=1,mem=4000M,node=1,billing=1
   AllocTRES=cpu=1,mem=4000M,node=1,billing=1
   Socks/Node=* NtasksPerN:B:S:C=0:0:*:* CoreSpec=*
   MinCPUsNode=1 MinMemoryCPU=4000M MinTmpDiskNode=0
   Features=(null) DelayBoot=00:00:00
   OverSubscribe=USER Contiguous=0 Licenses=(null) Network=(null)
   Command=/home/gridsan/rsun/test.sh
   WorkDir=/home/gridsan/rsun
   StdErr=/home/gridsan/rsun/slurm-27688685.out
   StdIn=/dev/null
   StdOut=/home/gridsan/rsun/slurm-27688685.out
   Power=
   

PartitionName=xeon-p8
   AllowGroups=ALL AllowAccounts=ALL AllowQos=ALL
   AllocNodes=ALL Default=NO QoS=N/A
   DefaultTime=NONE DisableRootJobs=NO ExclusiveUser=YES GraceTime=0 Hidden=NO
   MaxNodes=UNLIMITED MaxTime=4-04:00:00 MinNodes=0 LLN=NO MaxCPUsPerNode=UNLIMITED MaxCPUsPerSocket=UNLIMITED
   Nodes=d-3-1-[3-4],d-3-2-[1-4],d-3-3-[1-4],d-3-4-[1-4],d-3-5-[1-4],d-3-6-[1-4],d-3-7-[1-4],d-3-8-[1-4],d-3-9-[1-4],d-3-10-[1-4],d-3-11-[1-4],d-3-12-[1-4],d-3-13-[1-4],d-3-14-[1-4],d-4-1-[1-4],d-4-2-[1-4],d-4-3-[1-4],d-4-4-[1-4],d-4-5-[1-4],d-4-6-[1-4],d-4-7-[1-4],d-4-8-[1-4],d-4-9-[1-4],d-4-10-[1-4],d-4-11-[1-4],d-4-12-[1-4],d-4-13-[1-4],d-4-14-[1-4],d-4-15-[1-4],d-5-4-[1-4],d-5-5-[1-4],d-5-6-[1-4],d-5-7-[1-4],d-5-8-[1-4],d-5-9-[1-4],d-5-10-[1-4],d-5-11-[1-4],d-5-12-[1-4],d-5-13-[1-4],d-5-14-[1-4],d-5-15-[1-4],d-6-1-[1-4],d-6-2-[1-4],d-6-3-[1-4],d-6-4-[1-4],d-6-5-[1-4],d-6-6-[1-4],d-6-7-[1-4],d-6-8-[1-4],d-6-9-[1-4],d-6-10-[1-4],d-6-11-[1-4],d-6-12-[1-4],d-6-13-[1-4],d-6-14-[1-4],d-6-15-[1-4],d-16-1-[1-4],d-16-2-[1-4],d-16-3-[1-4],d-16-4-[1-4],d-16-5-[1-4],d-16-6-[1-4],d-16-7-[1-4],d-16-8-[1-4],d-16-9-[1-4],d-16-10-[1-4],d-16-11-[1-4],d-16-12-[1-4],d-16-13-[1-4],d-16-14-[1-4],d-16-15-[1-4],d-17-1-[1-4],d-17-2-[1-4],d-17-3-[1-4],d-17-4-[1-4],d-17-5-[1-4],d-17-6-[1-4],d-17-7-[1-4],d-17-8-[1-4],d-17-9-[1-4],d-17-10-[1-4],d-17-11-[1-4],d-17-12-[1-4],d-17-13-[1-4],d-17-14-[1-4],d-17-15-[1-4],d-18-1-[1-4],d-18-2-[1-4],d-18-3-[1-4],d-18-4-[1-4],d-18-5-[1-4],d-18-6-[1-4],d-18-7-[1-4],d-18-8-[1-4],d-18-9-[1-4],d-18-10-[1-4],d-18-11-[1-4],d-18-12-[1-4],d-18-13-[1-4],d-18-14-[1-4],d-18-15-[1-4],d-19-1-[1-4],d-19-2-[1-4],d-19-3-[1-4],d-19-4-[1-4],d-19-5-[1-4],d-19-6-[1-4],d-19-7-[1-4],d-19-8-[1-4],d-19-9-[1-4],d-19-10-[1-4],d-19-11-[1-4],d-19-12-[1-4],d-19-13-[1-4],d-19-14-[1-4],d-19-15-[1-4]
   PriorityJobFactor=1 PriorityTier=1 RootOnly=NO ReqResv=NO OverSubscribe=NO
   OverTimeLimit=NONE PreemptMode=REQUEUE
   State=UP TotalCPUs=22176 TotalNodes=462 SelectTypeParameters=NONE
   JobDefaults=(null)
   DefMemPerCPU=4000 MaxMemPerNode=UNLIMITED
   TRES=cpu=22176,mem=86625G,node=462,billing=22176,gres/ijob=462

SLURM_TASKS_PER_NODE: 1
SLURM_JOB_NODELIST: d-19-13-4
SLURM_JOB_ID: 27688685
SLURM_NODELIST: d-19-13-4
SLURM_NTASKS: 

What is wrong?


Solution

  • Slurm is ignoring all of the directives because of the space between the # and SBATCH, so the job fell back to the partition defaults (1 node, 1 task, 1 CPU, 4000M of memory, and the 4-04:00:00 maximum time limit), which is exactly what the scontrol output above shows:

    # SBATCH --partition=xeon-p8
     ^
     there should be no space here
    

    Try changing all of the SBATCH lines to:

    #SBATCH --partition=xeon-p8
    #SBATCH --ntasks=8
    #SBATCH --cpus-per-task=4
    #SBATCH --ntasks-per-node=4
    #SBATCH --nodes=2
    #SBATCH --mem=8G
    #SBATCH --time=00:02:00
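
  • With the space removed, resubmitting should give an allocation that matches the request. A quick sanity check (a sketch, assuming the corrected script is still named test.sh):

    sbatch test.sh       # resubmit the corrected script
    squeue -u $USER      # note the new job ID while it is pending/running

    # The new slurm-<jobid>.out should then report the requested resources,
    # for example:
    #   NumNodes=2 NumCPUs=32 NumTasks=8 CPUs/Task=4
    #   SLURM_NTASKS: 8
    #   SLURM_TASKS_PER_NODE: 4(x2)

    Note that sbatch stops parsing #SBATCH directives at the first non-comment, non-blank line, so they must stay at the top of the script, before any commands.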