#!/bin/bash
# Copyright(C) 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
#  gpurun: Process launch utility for GPU applications. This is a wrapper
#          to execute a GPU application including OpenMPI GPU applications.
#          See help message below (gpurun -h) for more information.
#
#  Usage Examples:
#    _appbin=true
#    _appargs=""
#    gpurun $_appbin $_appargs
#    mpirun -np  4 gpurun $_appbin $_appargs
#    mpirun -np  8 gpurun $_appbin $_appargs
#    mpirun -np  9 gpurun $_appbin $_appargs
#    mpirun -np 23 gpurun $_appbin $_appargs
#    # For large numbers of ranks, increase slots with a hostfile.
#    _host_file="/tmp/host_file$$"
#    echo "`hostname` slots=64" >$_host_file
#    mpirun -np 64 -hostfile $_host_file gpurun $_appbin $_appargs
#    mpirun -np 60 -hostfile $_host_file gpurun $_appbin $_appargs
#    mpirun -np 55 -hostfile $_host_file gpurun $_appbin $_appargs
#
#  TODO:
#  - Limit HSA queues with GPU_MAX_HW_QUEUES when multiple ranks per GPU
#  - Warn if HSA_CU_MASK is preset and use that instead of calculated MASK.
#    Note that rocminfo does not recognize ROCR_VISIBLE_DEVICES or HSA_CU_MASK.
#  - Add support for cuda.
#  - Add support for other mpi launchers besides openmpi's mpirun.
#  - If gpurun becomes popular, convert this script to a program
#

# Release version reported by the --version / -version options.
PROGVERSION=5.7.1

# version: print "<script path> version <PROGVERSION>" to stdout and exit 0.
# The expansions are quoted so that a script path containing whitespace or
# glob characters is reported verbatim instead of being split or expanded.
function version(){
   printf '%s version %s\n' "$0" "$PROGVERSION"
   exit 0
}
# verbosity: handler for the -s option. Sets GPURUN_VERBOSE to 0 (the value
# the help text describes as silent operation) and exports it.
function verbosity() {
   GPURUN_VERBOSE=0
   export GPURUN_VERBOSE
}
# usage: print the help text to stdout and exit 0.
# The here-document delimiter is quoted, so the text is emitted literally
# (no variable or command expansion takes place inside it).
function usage(){
/bin/cat 2>&1 <<"EOF"

   gpurun: Application process launch utility for GPUs.
           This utility launches an application binary with the
           Linux 'taskset' utility to limit the application process to
           execute only on CPU cores in the same NUMA domain as
           the selected GPU to improve efficiency of memory transfers.
           This utility sets environment variable ROCR_VISIBLE_DEVICES
           if the number of visible GPUs is greater than 1. It sets
           it to limit the application binary to a single GPU.
           This utility sets environment variable HSA_CU_MASK
           to control which CUs are available to the process only
           when more than one OpenMPI rank will utilize the same GPU
           Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the
           number of CUs available to the process after masking.

   Usage:
      gpurun <executable> [ <executable args> ]
      mpirun -np <num ranks>  gpurun <executable> [ <executable args> ]

   Options:
      -h        Print this help message and exit
      -s        suppress output, often useful in benchmarking
      --version Print version of gpurun and exit

   Generated (output) Environment Variables:
      OMPX_TARGET_TEAM_PROCS - Number of CUs available to process
      ROCR_VISIBLE_DEVICES - The logical device number for selected device
      HSA_CU_MASK - The CU mask for the device.

   Optional input environment variables:
      GPURUN_VERBOSE=
        "" (or unset) print 1 line trace to stderr, format:
           RANK:<rank> D:<dev-id> PCI:<pci-id> NN:<numanode> <CU mask if set>
        0:  for silent operation, nothing is added to stderr
        1:  prints trace and other diagnostics to stderr
        2:  prints trace, other diagnostics, and taskset command
      ROCMINFO_BINARY  Set location of rocminfo binary
      AOMP location of AOMP or ROCM
      GPURUN_DEVICE_BIAS  Offset added to the selected device number (default 0)

   Input environment variables set by OpenMPI mpirun
      OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node
      OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1)
      This script also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID
      and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK

   Limitations:
   - This utility assigns no more than one GPU to the application process.
     That is, the OpenMP API omp_get_num_devices() will always return 1.
   - Currently, gpurun creates masks that are mutually exclusive of each other.
     That is, the MPI processes will not share CUs. If number of ranks is not
     perfectly divisible by number of CUs or number of GPUs, some resources
     would be unused.
     Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization.
   - Works with AOMP 15.0-0 or ROCM 5.0 or greater

   Notes:
     With MPI, this utility distributes GPUs and their CUs across
     multiple ranks of an MPI job into mutually exclusive sets of CUs.
     It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE
     and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a
     the mutually exclusive CU mask.

     An rplace (rank place) is a subset of CUs for a rank.
     This utility calculates the number of rplaces needed to contain all
     the specified number of ranks for this node. If number of ranks not
     divisible by number of GPUs, then there will be more rplaces than ranks.
     The number of CUs in an rplace is calculated by dividing the number of
     CUs per GPU by the number of rplaces per GPU. This is also the number of
     bits set in the CU mask. This is also the number of physical locations
     available for an OpenMP team to execute. This utility exports that number
     to the environment variable OMPX_TARGET_TEAM_PROCS. This value
     could be used by the application or runtume to adjust the number
     of desired teams in a target region. If no masking occurs, the entire
     GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to
     the total number of CUs on the GPU.

   Copyright (c) 2022  ADVANCED MICRO DEVICES, INC.

EOF
  exit 0
}


# Handle the single leading option, if any. version and usage terminate the
# script. -s only lowers verbosity and is then shifted away so that $1 becomes
# the executable. -help/--help are examined after that shift, so they are also
# honored directly after -s (whereas -h, --version and -version are not).
case "$1" in
   --version|-version) version ;;
   -h) usage ;;
esac
if [ "$1" == "-s" ] ; then
   verbosity && shift
fi
case "$1" in
   -help|--help) usage ;;
esac

# Determine how many ranks run on this node (_num_local_ranks) and which one
# this process is (_local_rank_num). The (size, rank) variable pairs below are
# tried in order: OpenMPI first, then the MPI_LOCALNRANKS/MPI_LOCALRANKID pair
# (Platform MPI, MVAPICH), then the MPI_COMM_WORLD_* pair. The first pair
# whose size variable is non-empty wins.
_num_local_ranks=""
_local_rank_num=""
for _mpi_vars in \
   "OMPI_COMM_WORLD_LOCAL_SIZE OMPI_COMM_WORLD_LOCAL_RANK" \
   "MPI_LOCALNRANKS MPI_LOCALRANKID" \
   "MPI_COMM_WORLD_LOCAL_SIZE MPI_COMM_WORLD_LOCAL_RANK" ; do
   _size_var=${_mpi_vars% *}
   _rank_var=${_mpi_vars#* }
   # Indirect expansion: read the variables whose names were just selected.
   _num_local_ranks=${!_size_var}
   _local_rank_num=${!_rank_var}
   [ -n "$_num_local_ranks" ] && break
done
unset _mpi_vars _size_var _rank_var

# No launcher variables found: gpurun wraps a single process on a single GPU.
if [ -z "$_num_local_ranks" ] ; then
   _num_local_ranks=1
   _local_rank_num=0
fi

# Optional offset added to the device number chosen for this rank (default 0).
GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0}

# Find location of the rocminfo binary.
# AOMP is taken from the environment when it names an existing directory;
# otherwise fall back to /opt/rocm/llvm and then to the parent of the
# directory that contains this script. All path tests are quoted so that
# locations containing whitespace are handled correctly.
AOMP=${AOMP:-/opt/rocm/llvm}
[ -d "$AOMP" ] || AOMP="/opt/rocm/llvm"
if [ ! -d "$AOMP" ] ; then
   thisdir=$(dirname "$(realpath "$0")")
   AOMP=$thisdir/..
fi
if [ ! -d "$AOMP" ] ; then
   >&2 echo "ERROR: AOMP not found at $AOMP"
   >&2 echo "       Please install AOMP or correctly set env-var AOMP"
   exit 1
fi

# Prefer an explicit ROCMINFO_BINARY, then $AOMP/bin, then $AOMP/../bin.
ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo}
[ -f "$ROCMINFO_BINARY" ] || ROCMINFO_BINARY=$AOMP/../bin/rocminfo
if [ ! -f "$ROCMINFO_BINARY" ] ; then
   >&2 echo "ERROR: Could not find binary $ROCMINFO_BINARY"
   >&2 echo "       Please correct the installation of ROCM or AOMP"
   exit 1
fi

# Find number of GPUs and number of CUs per GPU

# _gpurun_scan_rocminfo: read filtered rocminfo records from stdin, i.e. the
# "Device Type:", "Compute Unit:" and "BDFID:" lines, and update the globals
#   _available_devices         number of GPU agents seen
#   _available_CUs_per_device  CU count shared by all GPUs
#   bdfids                     array of GPU BDFID values, in rocminfo order
# Records are dispatched on their field name and only counted while the most
# recent "Device Type" was GPU, so agents of any other type are ignored.
# Exits the script with status 1 when two GPUs report different CU counts.
_gpurun_scan_rocminfo() {
   local _key _val _rest _agent_is_gpu=0
   while IFS=: read -r _key _val _rest ; do
      # Values are single tokens; drop the padding rocminfo adds around them.
      _key=${_key//[[:space:]]/}
      _val=${_val//[[:space:]]/}
      case "$_key" in
         DeviceType)
            if [ "$_val" == "GPU" ] ; then
               _agent_is_gpu=1
            else
               _agent_is_gpu=0
            fi
            ;;
         BDFID)
            [ "$_agent_is_gpu" == 1 ] && bdfids+=( "$_val" )
            ;;
         ComputeUnit)
            [ "$_agent_is_gpu" == 1 ] || continue
            _available_devices=$(( _available_devices + 1 ))
            if [[ $_available_CUs_per_device == 0 ]] ; then
               _available_CUs_per_device=$_val
            elif [[ "$_available_CUs_per_device" != "$_val" ]] ; then
               >&2 echo "ERROR: Defective node! The cu_count for each GPU must be identical"
               >&2 echo "       Last CU count : $_val"
               >&2 echo "       Previous CU count : $_available_CUs_per_device"
               >&2 echo "       Number of GPUs : $_available_devices"
               exit 1
            fi
            ;;
      esac
   done
}

_available_CUs_per_device=0
_available_devices=0
bdfids=()
# mktemp gives an unpredictable name; a fixed /tmp/name$$ can be pre-created
# or symlinked by another user.
_tfile=$(mktemp /tmp/rinfo_out.XXXXXX) || {
   >&2 echo "ERROR: Could not create a temporary file in /tmp"
   exit 1
}
"$ROCMINFO_BINARY" | grep -E "Compute Unit:| Device Type:|BDFID:" >"$_tfile"
_gpurun_scan_rocminfo <"$_tfile"
rm -f "$_tfile"

# Stop early when the rocminfo scan found no GPU.
[[ $_available_devices -lt 1  ]] && {
   printf '%s\n' \
      "ERROR: Local rank $_local_rank_num found no GPUS available" \
      "       available_devices=$_available_devices" >&2
   exit 1
}

# Each rank gets its own set of CUs, so the node must have at least as many
# CUs as there are local ranks.
_node_cus=$(( _available_devices * _available_CUs_per_device ))
[ $_num_local_ranks -gt $_node_cus ] && {
   printf '%s\n' "ERROR: Not enough node CUs ($_node_cus) for $_num_local_ranks ranks " >&2
   exit 1
}

# Never spread over more GPUs than there are ranks:
# _utilized_devices = min(_available_devices, _num_local_ranks).
_utilized_devices=$_available_devices
[ $_available_devices -gt  $_num_local_ranks ] && _utilized_devices=$_num_local_ranks

# Spread the local ranks evenly over the utilized GPUs.
# An rplace is the set of CUs reserved for one rank; there must be at least
# as many rplaces as ranks, so the per-GPU count is ranks/GPUs rounded up.
_uncovered_ranks=$(( _num_local_ranks % _utilized_devices ))
_number_of_rplaces_per_GPU=$(( _num_local_ranks / _utilized_devices ))
(( _uncovered_ranks != 0 )) && (( _number_of_rplaces_per_GPU += 1 ))

# Consecutive ranks share a GPU; GPURUN_DEVICE_BIAS offsets the device number.
_device_num=$(( _local_rank_num / _number_of_rplaces_per_GPU + GPURUN_DEVICE_BIAS ))

# Use the largest CU count not exceeding the available CUs that divides
# evenly into the rplaces of a GPU; the remainder stays unused.
_rem2=$(( _available_CUs_per_device % _number_of_rplaces_per_GPU ))
_utilized_CUs_per_device=$(( _available_CUs_per_device - _rem2 ))
_rem2=0
_CUs_per_rplace=$(( _utilized_CUs_per_device / _number_of_rplaces_per_GPU ))

# Diagnostics:

# _gpurun_print_diagnostics: write the node-level placement summary to stderr.
# Reads the globals ROCMINFO_BINARY, _num_local_ranks, _available_devices,
# _available_CUs_per_device, _utilized_CUs_per_device,
# _number_of_rplaces_per_GPU, _CUs_per_rplace and _node_cus.
# The "--" detail lines and the blank separator lines are printed only when
# there are at least as many ranks as GPUs and the placement leaves CUs or
# rplaces unused. Everything, including the blank separators, goes to stderr
# so that the application's stdout is never polluted.
_gpurun_print_diagnostics() {
   local _wasted_CUs_on_each_GPU=$(( _available_CUs_per_device - _utilized_CUs_per_device ))
   local _total_GPU_rplaces=$(( _number_of_rplaces_per_GPU * _available_devices ))
   local _total_wasted_rplaces=$(( _total_GPU_rplaces - _num_local_ranks ))
   local _wasted_GPUs=$(( _total_wasted_rplaces / _number_of_rplaces_per_GPU ))
   local _used_cus=$(( _num_local_ranks * _CUs_per_rplace ))
   local _utilization=$(( ( _used_cus * 100 ) / _node_cus ))
   local _extra_diags=""
   if ! [ "$_available_devices" -gt "$_num_local_ranks" ] ; then
      if [ "$_wasted_CUs_on_each_GPU" != 0 ] || [ "$_total_wasted_rplaces" != 0 ] ; then
         _extra_diags=true
      fi
   fi
   {
      echo "-  ROCMINFO LOCATION: $ROCMINFO_BINARY"
      echo "-  OPENMPI RANKS:     $_num_local_ranks (OMPI_COMM_WORLD_LOCAL_SIZE)"
      if [ -n "$_extra_diags" ] ; then echo ; fi
      echo "-  AVAILALBLE GPUS:   $_available_devices"
      if [ -n "$_extra_diags" ] ; then
         echo "-- USED GPUS:         $(( _available_devices - _wasted_GPUs ))"
         echo "-- UNUSED GPUS:       $(( _total_wasted_rplaces / _number_of_rplaces_per_GPU )) "
         echo
      fi
      echo "-  RPLACEs PER NODE:  $_total_GPU_rplaces"
      echo "-  RPLACEs PER GPU:   $_number_of_rplaces_per_GPU"
      if [ -n "$_extra_diags" ] ; then
         echo "-- USED RPLACEs:      $_num_local_ranks (RANKS)"
         echo "-- UNUSED RPLACEs:    $_total_wasted_rplaces"
         echo
      fi
      echo "-  CUs PER GPU:       $_available_CUs_per_device"
      if [ -n "$_extra_diags" ] ; then
         echo "-- USED CUs PER GPU:  $_utilized_CUs_per_device"
         echo "-- UNUSED CUs PER GPU:$_wasted_CUs_on_each_GPU"
      fi
      echo "-  CUs PER RPLACE:    $_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
      echo "-  FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
      echo "-  NODE UTILIZATION:  $_utilization %"
   } >&2
}

# Only local rank 0 reports, and only when GPURUN_VERBOSE is 1 or 2.
if [ "$_local_rank_num" == 0 ] && [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "2" ]]; then
   _gpurun_print_diagnostics
fi

# _gpurun_cu_mask <local rank> <rplaces per GPU> <CUs per rplace> <utilized CUs per GPU>
# Print the hexadecimal CU mask ("0x" followed by upper-case digits) for one
# rank. The rank's slot on its GPU is <local rank> modulo <rplaces per GPU>,
# and the mask has <CUs per rplace> consecutive bits set starting at bit
# slot * <CUs per rplace>. Deriving the slot from the rank alone keeps the
# mask correct when GPURUN_DEVICE_BIAS shifts the device number.
# The mask is always (<utilized CUs per GPU> / 4) + 1 digits wide. It is
# assembled one hex digit at a time because it can exceed 64 bits, which also
# removes the need for the external bc and seq commands.
_gpurun_cu_mask() {
   local _rank=$1 _rplaces=$2 _bits=$3 _cus=$4
   local _lo=$(( ( _rank % _rplaces ) * _bits ))
   local _hi=$(( _lo + _bits ))
   local _ndigits=$(( _cus / 4 + 1 ))
   local _hex="" _digit _bit _pos _nibble
   for (( _digit = _ndigits - 1 ; _digit >= 0 ; _digit-- )) ; do
      _nibble=0
      for (( _bit = 3 ; _bit >= 0 ; _bit-- )) ; do
         _pos=$(( _digit * 4 + _bit ))
         _nibble=$(( _nibble * 2 ))
         (( _pos >= _lo && _pos < _hi )) && _nibble=$(( _nibble + 1 ))
      done
      printf -v _hex '%s%X' "$_hex" "$_nibble"
   done
   printf '0x%s\n' "$_hex"
}

# A mask is only needed when the rank does not own the whole GPU.
if [ "$_CUs_per_rplace" != "$_available_CUs_per_device" ] ; then
   _mask=$(_gpurun_cu_mask "$_local_rank_num" "$_number_of_rplaces_per_GPU" \
                           "$_CUs_per_rplace" "$_utilized_CUs_per_device")
fi

# Get NUMANODE and cpuset for this GPU identified by BDFID

# _gpurun_bdfid_to_pci <bdfid>: print the PCI address "BB:DD.F" (lower-case
# hex) for a decimal BDFID, decoded as bus = bits 15:8, device = bits 7:3,
# function = bits 2:0. A missing argument is treated as 0.
_gpurun_bdfid_to_pci() {
   local _id=${1:-0}
   printf '%02x:%02x.%x\n' \
      $(( ( _id >> 8 ) & 0xff )) $(( ( _id >> 3 ) & 0x1f )) $(( _id & 0x7 ))
}

# _gpurun_cpus_on_node <node>: read "lscpu --extended=cpu,node" output from
# stdin and print the comma-separated list of CPUs whose NODE column equals
# <node>. The header line never matches; an empty line is printed when no CPU
# matches.
_gpurun_cpus_on_node() {
   local _node=$1 _cpu _nn _rest _list=""
   while read -r _cpu _nn _rest ; do
      [ "$_nn" == "$_node" ] || continue
      _list+="${_list:+,}$_cpu"
   done
   printf '%s\n' "$_list"
}

_bdfid=${bdfids[${_device_num:-0}]:-0}
_pciaddr=$(_gpurun_bdfid_to_pci "$_bdfid")
# _bdfidstrc keeps the "BB:DD" form that the trace output prints.
_bdfidstrc=${_pciaddr%.*}
NUMANODE=""
_numa_file=/sys/bus/pci/devices/0000:$_pciaddr/numa_node
[ -r "$_numa_file" ] && read -r NUMANODE <"$_numa_file"
# Sometimes lscpu shows 0 for the node, when /sys/bus/pci/.. shows -1
[[ $NUMANODE == -1 || -z $NUMANODE ]] && NUMANODE=0

_cpulist=$(lscpu --extended=cpu,node | _gpurun_cpus_on_node "$NUMANODE")
if [ -n "$_cpulist" ] ; then
   _taskset_cmd="taskset -c $_cpulist"
else
   # Without a CPU list "taskset -c" would consume the executable name as its
   # list argument, so run the application unpinned instead.
   _taskset_cmd=""
   [[ "$GPURUN_VERBOSE" == "1" || "$GPURUN_VERBOSE" == "2" ]] && \
      >&2 echo "WARNING: no CPUs found for NUMA node $NUMANODE, not using taskset"
fi

# With no executable given there is nothing to pin, so drop the taskset prefix.
[ -z "$*" ] && _taskset_cmd=""
#  FIXME: What if existing ROCR_VISIBLE_DEVICES has multiple non-sequential
#         devices not starting at 0.
[[ $_available_devices != 1 ]] && export ROCR_VISIBLE_DEVICES=$_device_num
export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace

# _gpurun_print_trace <cu mask or "">: write the per-rank trace to stderr.
# Nothing is printed unless GPURUN_VERBOSE is empty/unset, 1 or 2. The line is
#   RANK:<rank> D:<dev-id> PCI:<pci-id> NN:<numanode> [CUMASK:0:<mask>]
# and with GPURUN_VERBOSE=2 a second line shows the taskset command.
# Reads the globals _local_rank_num, _device_num, _bdfidstrc, NUMANODE and
# _taskset_cmd. Always returns 0.
_gpurun_print_trace() {
   local _cumask=$1 _line
   case "$GPURUN_VERBOSE" in
      ""|1|2) ;;
      *) return 0 ;;
   esac
   printf -v _line 'RANK:%02d D:%d PCI:%5s NN:%d ' \
      "$_local_rank_num" "$_device_num" "$_bdfidstrc" "$NUMANODE"
   [ -n "$_cumask" ] && _line+="CUMASK:0:$_cumask "
   printf '%s\n' "$_line" >&2
   [ "$GPURUN_VERBOSE" == "2" ] && printf '  taskset cmd: %s \n' "$_taskset_cmd" >&2
   return 0
}

# The application launch must stay the last command of each branch: the
# script exits with that command's status. "$@" keeps every application
# argument intact; $_taskset_cmd is deliberately unquoted so that it splits
# into the taskset command words (or vanishes when empty).
if [ "$_CUs_per_rplace" == "$_available_CUs_per_device" ] ; then
   # Do not set HSA_CU_MASK when using all CUs
   _gpurun_print_trace ""
   # shellcheck disable=SC2086
   $_taskset_cmd "$@"
else
   # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
   export HSA_CU_MASK=0:$_mask
   _gpurun_print_trace "$_mask"
   # shellcheck disable=SC2086
   $_taskset_cmd "$@"
fi
# Exit with the status of the last command run above, which is the launched
# application when an executable was given.
exit $?
