#!/bin/bash
# Copyright(C) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
#  gpurun: Process launch utility for GPU applications. This is a wrapper
#          to execute a GPU application including OpenMPI GPU applications.
#          See help message below (gpurun -h) for more information.
#
#  Usage Examples:
#    _appbin=true
#    _appargs=""
#    gpurun $_appbin $_appargs
#    mpirun -np  4 gpurun $_appbin $_appargs
#    mpirun -np  8 gpurun $_appbin $_appargs
#    mpirun -np  9 gpurun $_appbin $_appargs
#    mpirun -np 23 gpurun $_appbin $_appargs
#    # For large numbers of ranks, increase slots with a hostfile.
#    _host_file="/tmp/host_file$$"
#    echo "`hostname` slots=64" >$_host_file
#    mpirun -np 64 -hostfile $_host_file gpurun $_appbin $_appargs
#    mpirun -np 60 -hostfile $_host_file gpurun $_appbin $_appargs
#    mpirun -np 55 -hostfile $_host_file gpurun $_appbin $_appargs
#
#  TODO:
#  - Add support for cuda.
#  - If gpurun becomes popular, convert this script to a program
#

# PROGVERSION string is updated by cmake when component is installed
PROGVERSION=6.1.0
# Print "<script path> version <PROGVERSION>" on stdout and exit with status 0.
# Globals: PROGVERSION (read).
function version(){
   # Expansions are quoted so a script path containing blanks or glob
   # characters is printed verbatim instead of being re-split by the shell.
   printf '%s version %s\n' "$0" "$PROGVERSION"
   exit 0
}
# Handler for the -s option: export GPURUN_VERBOSE=0 so that gpurun's trace
# and warning output is suppressed.
function verbosity() {
   GPURUN_VERBOSE=0
   export GPURUN_VERBOSE
}
# Handler for the -v option: export GPURUN_VERBOSE=2, the level at which the
# trace line also includes the taskset command.
function verbose2() {
   GPURUN_VERBOSE=2
   export GPURUN_VERBOSE
}
# Print the gpurun help text on stdout and exit with status 0.
# The here-document delimiter is quoted, so the text is emitted literally
# (no parameter or command expansion takes place inside it).
function usage(){
/bin/cat 2>&1 <<"EOF"

   gpurun: Application process launch utility for GPUs.
           This utility ensures the process will run on a single GPU.
           It launches the application binary with the 'taskset' utility
	   so the process only runs on CPU cores in the same NUMA domain
	   as the selected GPU.
           This utility sets environment variable ROCR_VISIBLE_DEVICES
	   to the selected GPU ONLY if it was not already set by the
	   caller's environment AND the number of GPUs is not 1.
           This utility also sets environment variable HSA_CU_MASK
           to control which CUs are available to the process.
	   HSA_CU_MASK is set only when more than one OpenMPI process
	   (rank) will utilize the same GPU.
           Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the
           number of CUs available to the process after masking.

   Usage:
      gpurun <executable> [ <executable args> ]
      mpirun -np <num ranks>  gpurun <executable> [ <executable args> ]

   Options:
      -h        Print this help message and exit (also -help, --help)
      -s        suppress output, often useful in benchmarking
                (same as GPURUN_VERBOSE=0)
      -v        verbose output (same as GPURUN_VERBOSE=2)
      --version Print version of gpurun and exit (also -version)

   Optional Input environment variables:
      GPURUN_VERBOSE
        "" (or unset) print 1 line trace to stderr, format:
           RANK:<rank> D:<dev-id> PCI:<pci-id> NN:<numanode> <CU mask if set>
        0:  for silent operation, no trace or warnings are printed
        1:  prints trace and other diagnostics to stderr
        2:  prints trace, other diagnostics, and taskset command
      ROCMINFO_BINARY  Set location of rocminfo binary
      AOMP: location of AOMP or ROCM
      GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0
      ROCR_VISIBLE_DEVICES: See description above
      OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi
      OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi
      This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID
      and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK

   Generated (output) Environment Variables:
      OMPX_TARGET_TEAM_PROCS - Number of CUs available to process
      ROCR_VISIBLE_DEVICES - The logical device number for selected device
                             Not changed if it was preset.
      HSA_CU_MASK - The CU mask for the device.

   Limitations:
   - This utility assigns no more than one GPU to the application process.
     Therefore, the OpenMP API omp_get_num_devices() will always return 1.
   - Currently, gpurun creates masks that are mutually exclusive of each other.
     That is, the MPI processes will not share CUs. If number of ranks is not
     perfectly divisible by number of CUs or number of GPUs, some resources
     would be unused.
     Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization.
   - Works with AOMP 15.0-0 or ROCM 5.0 or greater

   Notes:
     With MPI, this utility distributes GPUs and their CUs across
     multiple ranks of an MPI job into mutually exclusive sets of CUs.
     It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE
     and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a
     mutually exclusive CU mask.

     An rplace (rank place) is a subset of CUs for a rank.
     This utility calculates the number of rplaces needed to contain all
     the specified number of ranks for this node. If number of ranks not
     divisible by number of GPUs, then there will be more rplaces than ranks.
     The number of CUs in an rplace is calculated by dividing the number of
     CUs per GPU by the number of rplaces per GPU. This is also the number of
     bits set in the CU mask. This is also the number of physical locations
     available for an OpenMP team to execute. This utility exports that number
     to the environment variable OMPX_TARGET_TEAM_PROCS. This value
     could be used by the application or runtime to adjust the number
     of desired teams in a target region. If no masking occurs, the entire
     GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to
     the total number of CUs on the GPU.

   Copyright (c) 2024  ADVANCED MICRO DEVICES, INC.

EOF
  exit 0
}


# Process the leading gpurun options. Options may appear in any order and may
# be combined (e.g. "gpurun -v -s app"); parsing stops at the first argument
# that is not a gpurun option, which is taken as the executable to launch.
# version and usage exit the script; -s and -v only adjust GPURUN_VERBOSE
# (the last one given wins) and are shifted away.
while [ $# -gt 0 ] ; do
   case "$1" in
      --version|-version) version ;;
      -h|-help|--help)    usage ;;
      -s)                 verbosity ; shift ;;
      -v)                 verbose2 ; shift ;;
      *)                  break ;;
   esac
done

# Work out how many ranks run on this node (_num_local_ranks) and which one
# this process is (_local_rank_num). The first launcher whose local-size
# variable is non-empty supplies both values.
if [[ -n "$OMPI_COMM_WORLD_LOCAL_SIZE" ]] ; then
  # OpenMPI
  _num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE
  _local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK
elif [[ -n "$MPI_LOCALNRANKS" ]] ; then
  # Not OpenMPI: Platform MPI, MVAPICH
  _num_local_ranks=$MPI_LOCALNRANKS
  _local_rank_num=$MPI_LOCALRANKID
elif [[ -n "$MPI_COMM_WORLD_LOCAL_SIZE" ]] ; then
  # MPI_COMM_WORLD style variables
  _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE
  _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK
else
  # No MPI detected: gpurun wraps a single process on a single GPU
  _num_local_ranks=1
  _local_rank_num=0
fi

# Device shift requested by the caller; defaults to no shift.
: "${GPURUN_DEVICE_BIAS:=0}"

# Find location of the rocminfo binary.
# AOMP is taken from the environment, then /opt/rocm/llvm, then the parent of
# the directory that holds this script. All paths are quoted so directories
# containing blanks are tested correctly.
AOMP=${AOMP:-/opt/rocm/llvm}
if [ ! -d "$AOMP" ] ; then
   AOMP="/opt/rocm/llvm"
fi
if [ ! -d "$AOMP" ] ; then
   _gpurun_self=$(realpath "$0")
   AOMP=$(dirname "$_gpurun_self")/..
fi
if [ ! -d "$AOMP" ] ; then
   >&2 echo "ERROR: AOMP not found at $AOMP"
   >&2 echo "       Please install AOMP or correctly set env-var AOMP"
   exit 1
fi

# Candidate locations, in order of preference. The loop leaves
# ROCMINFO_BINARY at the first existing file, or at the last candidate when
# none exists (that path is then reported in the error below).
_rocminfo_candidates=(
   "${ROCMINFO_BINARY:-$AOMP/bin/rocminfo}"
   "$AOMP/../bin/rocminfo"
   "$AOMP/../../bin/rocminfo"
)
for ROCMINFO_BINARY in "${_rocminfo_candidates[@]}" ; do
   [ -f "$ROCMINFO_BINARY" ] && break
done
if [ ! -f "$ROCMINFO_BINARY" ] ; then
   >&2 echo "ERROR: Could not find binary $ROCMINFO_BINARY"
   >&2 echo "       Please correct the installation of ROCM or AOMP"
   exit 1
fi

# Scan amdgpu devs and store info (bdfid, cpus, and numanode) in 3 arrays
# indexed by _device_num. This is cleaner than parsing rocminfo bdfid.
# Eventually we want to get rid of all rocminfo parsing.
#
# _gpurun_scan_amdgpu_devices <sysfs pci devices directory>
#   Resets and fills these globals from the entries of the given directory:
#     _long_bdfid           entry name of each device bound to amdgpu
#     _numanode             contents of its numa_node file (-1 or unreadable -> 0)
#     _cpulist              contents of its local_cpulist file
#     _scanned_num_devices  number of amdgpu devices found
#   Every value is appended quoted, so an empty file still occupies a slot
#   and the three arrays stay aligned.
_gpurun_scan_amdgpu_devices() {
  local _dir=$1 _devpath _node _cpus
  _scanned_num_devices=0
  _cpulist=()
  _long_bdfid=()
  _numanode=()
  # The glob yields the entries in sorted order without parsing ls output.
  for _devpath in "$_dir"/* ; do
    [ -f "$_devpath/device" ] || continue
    # Only devices whose uevent file names amdgpu as the bound driver.
    grep -qx 'DRIVER=amdgpu' "$_devpath/uevent" 2>/dev/null || continue
    _node=$(cat "$_devpath/numa_node" 2>/dev/null)
    if [[ -z "$_node" || "$_node" == "-1" ]] ; then
      _node=0
    fi
    _cpus=$(cat "$_devpath/local_cpulist" 2>/dev/null)
    _long_bdfid+=( "${_devpath##*/}" )
    _numanode+=( "$_node" )
    _cpulist+=( "$_cpus" )
    _scanned_num_devices=$(( _scanned_num_devices + 1 ))
  done
}

_sysdevdir="/sys/bus/pci/devices"
_gpurun_scan_amdgpu_devices "$_sysdevdir"

# Give up when the sysfs scan found no device bound to the amdgpu driver.
if (( _scanned_num_devices < 1 )) ; then
   {
      echo "ERROR: No amdgpu devices found in $_sysdevdir"
      echo "       num_devices=$_scanned_num_devices"
   } >&2
   exit 1
fi

# Use rocminfo to find the number of CUs per GPU.
#
# _gpurun_parse_cu_count
#   Reads the "Device Type:" and "Compute Unit:" lines of rocminfo output on
#   stdin and prints the CU count of the GPU agents (0 when none was seen).
#   A "Compute Unit:" line is attributed to the most recent "Device Type:"
#   line before it. Only agents of type GPU are counted; CPU and any other
#   agent type are skipped rather than being mistaken for a CU count.
#   Returns 1, after printing an error to stderr, when two GPU agents report
#   different CU counts.
#   Globals: _scanned_num_devices (read, for the error message only).
_gpurun_parse_cu_count() {
  local _key _value _agent_is_gpu=0 _cu_count=0
  while IFS=: read -r _key _value ; do
    # rocminfo pads the value column with blanks
    _value=${_value//[[:space:]]/}
    case "$_key" in
      *"Device Type")
        if [[ "$_value" == "GPU" ]] ; then
          _agent_is_gpu=1
        else
          _agent_is_gpu=0
        fi
        ;;
      *"Compute Unit")
        [[ $_agent_is_gpu == 1 && "$_value" =~ ^[0-9]+$ ]] || continue
        if [[ $_cu_count == 0 ]] ; then
          # set number of CUs from the first GPU agent
          _cu_count=$_value
        elif [[ "$_cu_count" != "$_value" ]] ; then
          # ensure all subsequent GPU agents have same number of CUs
          >&2 echo "ERROR: Defective node! The cu_count for each GPU must be identical"
          >&2 echo "       Last CU count : $_value"
          >&2 echo "       First CU count : $_cu_count"
          >&2 echo "       Scanned Number of GPUs : $_scanned_num_devices"
          return 1
        fi
        ;;
    esac
  done
  echo "$_cu_count"
}

# The rocminfo output is piped straight into the parser, so no temporary file
# is created. The status of the command substitution is the parser's status.
_available_CUs_per_device=$(
  "$ROCMINFO_BINARY" | grep -E "Compute Unit:| Device Type:" | _gpurun_parse_cu_count
) || exit 1

# _gpurun_check_preset_device <value> <number of scanned GPUs>
#   Validates a caller-preset ROCR_VISIBLE_DEVICES value. gpurun needs a
#   single non-negative device index below the number of scanned GPUs.
#   Lists, negative or dashed values, and any other non-numeric text are
#   rejected. Prints the reason to stderr and returns 1 when the value
#   cannot be used, returns 0 otherwise.
_gpurun_check_preset_device() {
  local _value=$1 _count=$2
  local _hint="       use a single index between 0 and $(( _count - 1 ))"
  if [[ "$_value" == *,* ]] ; then
    >&2 echo "ERROR: preset ROCR_VISIBLE_DEVICES '$_value' exposes more than one GPU"
    >&2 echo "       gpurun assumes each process will get a single GPU"
    >&2 echo "$_hint"
    return 1
  fi
  # 10# forces base 10 so that a value such as 08 is not read as bad octal.
  if [[ ! "$_value" =~ ^[0-9]+$ ]] || (( 10#$_value >= _count )) ; then
    >&2 echo "ERROR: preset ROCR_VISIBLE_DEVICES '$_value' is invalid"
    >&2 echo "$_hint"
    return 1
  fi
  return 0
}

# Number of GPUs the ranks are spread over: every scanned GPU, or exactly one
# when the caller pinned the device through ROCR_VISIBLE_DEVICES.
if [[ -z "$ROCR_VISIBLE_DEVICES" ]] ; then
  _num_devices=$_scanned_num_devices
else
  _gpurun_check_preset_device "$ROCR_VISIBLE_DEVICES" "$_scanned_num_devices" || exit 1
  _num_devices=1
  if [[ $_scanned_num_devices != 1 ]] && [[ "$GPURUN_VERBOSE" != "0"  ]]  ; then
    >&2 echo "WARNING: preset ROCR_VISIBLE_DEVICES, process $_local_rank_num of $_num_local_ranks to use dev $ROCR_VISIBLE_DEVICES"
  fi
fi

# Every rank needs at least one CU, so the node must offer at least as many
# CUs as there are local ranks.
_node_cus=$(( _available_CUs_per_device * _num_devices ))
if [ "$_node_cus" -lt "$_num_local_ranks" ] ; then
   >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks "
   exit 1
fi

# Never use more GPUs than there are ranks.
_utilized_devices=$_num_devices
if [ "$_utilized_devices" -gt "$_num_local_ranks" ] ; then
   _utilized_devices=$_num_local_ranks
fi

# Calculate number of GPUs to use to evenly spread ranks across GPUs.
# An rplace is a set of CUs that will be used for a rank.
# The number of rplaces must be at least the number of ranks, so the division
# is rounded up: when the ranks do not divide evenly over the GPUs, every GPU
# gets one extra rplace to make room for the remainder.
_uncovered_ranks=$(( _num_local_ranks % _utilized_devices ))
_number_of_rplaces_per_GPU=$(( _num_local_ranks / _utilized_devices + ( _uncovered_ranks != 0 ) ))

# Pick the GPU for this rank.
#   _relative_device_num  index of this rank's group of rplaces
#                         (rank / rplaces per GPU), taken BEFORE the bias is
#                         applied. The CU-mask shift is computed from it as
#                         rank * CUs-per-rplace - relative * utilized-CUs,
#                         so it must not include the bias or the shift comes
#                         out wrong (even negative).
#   _device_num           the device actually used, after applying the bias.
if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then
  _relative_device_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU ))
  # Some users want to shift selected device to avoid dev 0.
  # The extra "+ n ) % n" keeps the result in range for a negative bias.
  _device_num=$(( ( ( $_relative_device_num + $GPURUN_DEVICE_BIAS ) % $_num_devices + $_num_devices ) % $_num_devices ))
else
  # if caller set ROCR_VISIBLE_DEVICES, that becomes the device_num
  _device_num=$ROCR_VISIBLE_DEVICES
  _relative_device_num=0
fi

# Use the largest CU count per GPU that is divisible by the number of rplaces
# per GPU. The leftover CUs (_rem2) stay unused.
_utilized_CUs_per_device=$_available_CUs_per_device
_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
if [ $_rem2 != 0 ] ; then
   # Drop the remainder in one step instead of counting down one CU at a time.
   _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - $_rem2 ))
   _rem2=0
fi
_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU ))

# Diagnostics: local rank 0 reports how GPUs, rplaces and CUs are used when
# GPURUN_VERBOSE is 1 or 2. Lines starting with "--" are only shown when some
# CUs or rplaces are left unused.
if [ "$_local_rank_num" == 0 ] && [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "2" ]]; then
   _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device ))
   _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_num_devices ))
   _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks ))
   _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))
   _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace ))
   _utilization=$(( ( $_used_cus * 100 ) / $_node_cus ))
   _extra_diags=""
   if ! [ $_num_devices -gt $_num_local_ranks ] ; then
      if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then
         _extra_diags=true
      fi
   fi
   # The whole report, including the blank separator lines, goes to stderr so
   # that nothing from gpurun is mixed into the application's stdout.
   {
      echo "-  ROCMINFO LOCATION: $ROCMINFO_BINARY"
      echo "-  PROCESSES:         $_num_local_ranks (OMPI_COMM_WORLD_LOCAL_SIZE)"
      [[ -n "$_extra_diags" ]] && echo
      echo "-  SCANNED GPUS:      $_scanned_num_devices ($_sysdevdir)"
      echo "-  AVAILABLE GPUS:    $_num_devices"
      [[ -n "$_extra_diags" ]] && \
      echo "-- USED GPUS:         $(( $_num_devices - $_wasted_GPUs ))"
      [[ -n "$_extra_diags" ]] && \
      echo "-- UNUSED GPUS:       $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) "
      [[ -n "$_extra_diags" ]] && echo
      echo "-  RPLACEs PER NODE:  $_total_GPU_rplaces"
      echo "-  RPLACEs PER GPU:   $_number_of_rplaces_per_GPU"
      [[ -n "$_extra_diags" ]] && \
      echo "-- USED RPLACEs:      $_num_local_ranks (RANKS)"
      [[ -n "$_extra_diags" ]] && \
      echo "-- UNUSED RPLACEs:    $_total_wasted_rplaces"
      # disabled: echo "-  LAST GPU UNUSED RPLACES:  $(( $_total_wasted_rplaces % $_number_of_rplaces_per_GPU )) "
      [[ -n "$_extra_diags" ]] && echo
      echo "-  CUs PER GPU:      $_available_CUs_per_device"
      [[ -n "$_extra_diags" ]] && \
      echo "-- USED CUs PER GPU:  $_utilized_CUs_per_device"
      [[ -n "$_extra_diags" ]] && \
      echo "-- UNUSED CUs PER GPU:$_wasted_CUs_on_each_GPU"
      echo "-  CUs PER RPLACE:    $_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
      echo "-  FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
      if [[ -z "$HSA_CU_MASK" ]] ; then
         # node utilization could be incorrect with preset mask.
         echo "-  NODE UTILIZATION:  $_utilization %"
      fi
   } >&2
fi

# _gpurun_cu_mask <bits to set> <bits to shift> <utilized CUs per device>
#   Prints a hexadecimal CU mask ("0x" followed by upper-case hex digits) in
#   which <bits to set> consecutive bits are set, the lowest one at position
#   <bits to shift>. The digits are padded with leading zeros to at least
#   (<utilized CUs per device> / 4) + 1 digits.
#   The mask is assembled digit by digit in the shell, because it can be far
#   wider than 64 bits. This keeps the value on a single line of any length
#   and needs neither bc nor seq.
_gpurun_cu_mask() {
  local _nbits=$1 _shift=$2 _width_cus=$3
  local _hexdigits="0123456789ABCDEF"
  local _hex="" _top _digit _b _bit _nibble _lz
  _top=$(( _shift + _nbits ))          # one past the highest set bit
  # Walk the hex digits from most to least significant.
  for (( _digit = ( _top + 3 ) / 4 - 1 ; _digit >= 0 ; _digit-- )) ; do
    _nibble=0
    for (( _b = 0 ; _b < 4 ; _b++ )) ; do
      _bit=$(( _digit * 4 + _b ))
      if (( _bit >= _shift && _bit < _top )) ; then
        _nibble=$(( _nibble | ( 1 << _b ) ))
      fi
    done
    _hex+=${_hexdigits:_nibble:1}
  done
  [[ -n "$_hex" ]] || _hex=0
  # Calculate the number of leading zeros needed for this mask
  _lz=$(( ( _width_cus / 4 ) - ${#_hex} + 1 ))
  for (( ; _lz > 0 ; _lz-- )) ; do
    _hex="0$_hex"
  done
  printf '0x%s\n' "$_hex"
}

# A mask is only needed when this rank does not get every CU of its GPU.
# The comparison is numeric, so stray blanks in either value do not matter.
if (( _CUs_per_rplace != _available_CUs_per_device )) ; then
   #  Build the CU mask for this rank, bits_to_set = _CUs_per_rplace
   _bits_to_set=$_CUs_per_rplace
   #  This formula keeps adjacent ranks on same GPU which should be preferred
   _bits_to_shift=$(( ( _local_rank_num * _bits_to_set ) - ( _relative_device_num * _utilized_CUs_per_device ) ))
   _mask=$(_gpurun_cu_mask "$_bits_to_set" "$_bits_to_shift" "$_utilized_CUs_per_device")
fi

# Look up what the sysfs scan recorded for the selected device.
_bdfidstrc="${_long_bdfid[$_device_num]}"
NUMANODE="${_numanode[$_device_num]}"

# The taskset prefix pins the command to the CPUs local to that device.
# If gpurun was not given a command to execute, there is nothing to pin.
if [[ -z "$*" ]] ; then
   _taskset_cmd=""
else
   _taskset_cmd="taskset -c ${_cpulist[$_device_num]}"
fi

# only set ROCR_VISIBLE_DEVICES if not already set and multiple devices available
if [[ -z $ROCR_VISIBLE_DEVICES ]] && [[ $_num_devices != 1 ]] ; then
   export ROCR_VISIBLE_DEVICES=$_device_num
fi

# Number of CUs this process may use.
OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace
export OMPX_TARGET_TEAM_PROCS

#  - Limit HSA queues when multiple ranks per GPU
if [ $_number_of_rplaces_per_GPU != 1 ] ; then
   # Only set these env controls if not set by caller: ":=" assigns the
   # default only when the variable is unset or empty.
   : "${GPU_MAX_HW_QUEUES:=1}"
   : "${LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES:=1}"
   export GPU_MAX_HW_QUEUES LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES
fi

# Final step: report the placement and run the command.
if [[ -n "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0"  ]] ; then
   >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK"
fi

# HSA_CU_MASK is left alone when the rank uses all CUs of its GPU. Otherwise
# the generated mask is exported, unless the caller preset HSA_CU_MASK: a
# preset value is passed on to the command exactly as given.
_trace_mask=""
if (( _CUs_per_rplace != _available_CUs_per_device )) ; then
   if [[ -z "$HSA_CU_MASK" ]] ; then
      # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
      export HSA_CU_MASK=0:$_mask
   else
      # use preset mask
      _mask=$HSA_CU_MASK
   fi
   _trace_mask="CUMASK:$_mask "
fi

# One-line trace on stderr: unset/empty or 1 prints the placement, 2 also
# prints the command line, any other value prints nothing.
_trace_enabled=1
case "$GPURUN_VERBOSE" in
   ""|1) _trace_tail=$_trace_mask ;;
   2)    _trace_tail="${_trace_mask}CMD:$_taskset_cmd $* " ;;
   *)    _trace_enabled=0 ;;
esac
if (( _trace_enabled )) ; then
   # The mask and command line are passed as data, never as part of the
   # format string, so a '%' or '\' in them cannot garble the trace.
   printf 'RANK:%02d D:%d PCI:%5s NN:%d %s\n' \
      "$_local_rank_num" "$_device_num" "$_bdfidstrc" "${NUMANODE:-0}" "$_trace_tail" >&2
fi

# _taskset_cmd is a blank-separated command prefix and is split on purpose.
# "$@" hands every application argument through unchanged, including
# arguments that contain blanks or glob characters.
# shellcheck disable=SC2086
$_taskset_cmd "$@"
exit $?
