#!/bin/sh
#
# sgemaster    Gridengine master daemon and scheduler
#
# chkconfig:   - 98 02
# description: The gridengine master daemon and scheduler

### BEGIN INIT INFO
# Provides: sge_qmaster
# Required-Start: $network $local_fs $remote_fs
# Required-Stop: $network $local_fs $remote_fs
# Should-Start:
# Should-Stop:
# Default-Start:
# Default-Stop: 0 1 6
# Short-Description: Gridengine master daemon and scheduler
# Description:      The gridengine master daemon and scheduler
### END INIT INFO

# Source function library.
# NOTE(review): daemon, killproc and status are not defined in this file;
# presumably they come from this library - confirm.
. /etc/rc.d/init.d/functions

master_exec="/usr/bin/sge_qmaster"
master_prog="sge_qmaster"

# Defaults (may be overridden by the configuration file sourced below)
SGE_ROOT=/usr/share/gridengine; export SGE_ROOT
SGE_CELL=default; export SGE_CELL

# Configuration
config=/etc/sysconfig/gridengine
[ -e "$config" ] && . "$config"

# The spool directory is the second field of the "qmaster_spool_dir" line
# of the cell's bootstrap file.
qmaster_spool_dir=$(awk '$1 == "qmaster_spool_dir" { print $2 }' \
    "$SGE_ROOT/$SGE_CELL/common/bootstrap")
master_pidfile=$qmaster_spool_dir/qmaster.pid

# Exit status shared by start() and stop().
retval=0

#---------------------------------------------------------------------------
# CheckIfQmasterHost
#    Arguments: $1 - host name to check
#    Outputs:   "true" if $1 equals the content of the "act_qmaster" file,
#               otherwise "false"
#
CheckIfQmasterHost()
{
    host=$1

    if [ "$host" = "$(cat "$SGE_ROOT/$SGE_CELL/common/act_qmaster")" ]; then
        echo true
    else
        echo false
    fi
}

#---------------------------------------------------------------------------
# CheckIfPrimaryQmasterHost
#    Arguments: $1 - host name to check
#    Outputs:   "true" if the "primary_qmaster" file exists and its content
#               equals $1, otherwise "false"
#
CheckIfPrimaryQmasterHost()
{
    host=$1
    fname=$SGE_ROOT/$SGE_CELL/common/primary_qmaster

    if [ -f "$fname" ] && [ "$host" = "$(cat "$fname")" ]; then
        echo true
    else
        echo false
    fi
}

#---------------------------------------------------------------------------
# CheckIfShadowMasterHost
#    Arguments: $1 - host name to check
#    Globals:   sets shadow_host to "true" if $1 is contained in the
#               "shadow_masters" file, otherwise to "false"
#
CheckIfShadowMasterHost()
{
    host=$1
    fname=$SGE_ROOT/$SGE_CELL/common/shadow_masters
    shadow_host="false"

    # An empty host name can never be a shadow master. Without this guard
    # an empty pattern would match every line of the file.
    if [ -n "$host" ] && [ -f "$fname" ]; then
        # -F: the host name is a literal string, not a regular expression
        #     (otherwise "." would match any character).
        # Redirect stdout first and then stderr, so both are discarded.
        # NOTE(review): this is still a substring match ("node1" also
        # matches "node10"), as before.
        if grep -iF -- "$host" "$fname" > /dev/null 2>&1; then
            shadow_host="true"
        fi
    fi
}

#---------------------------------------------------------------------------
# GetAdminUser
#    Outputs: the name of the admin user taken from the bootstrap file;
#             "root" if the file is missing, has no admin_user entry, or
#             the entry is "none" (compared case-insensitively)
#
GetAdminUser()
{
    cfgname=$SGE_ROOT/$SGE_CELL/common/bootstrap
    user=none

    if [ -f "$cfgname" ]; then
        # Match the key in field 1 only, so that comment lines mentioning
        # admin_user are ignored; stop at the first hit.
        user=$(awk '$1 == "admin_user" { print $2; exit }' "$cfgname")
    fi

    # Fall back to root for "none" as well as for an empty result.
    case "$(printf '%s' "$user" | tr 'A-Z' 'a-z')" in
        none|"") user=root ;;
    esac

    echo "$user"
}

#---------------------------------------------------------------------------
# ResolveQmasterPort
#    Globals: sets SGE_QMASTER_PORT from the "sge_qmaster" service entry
#             if it is not already set (e.g. by the configuration file)
#
ResolveQmasterPort()
{
    if [ -z "$SGE_QMASTER_PORT" ]; then
        SGE_QMASTER_PORT=$("$utilbin_dir/getservbyname" -number sge_qmaster)
    fi
}

#---------------------------------------------------------------------------
# CheckRunningQmaster
#    Polls the qmaster named in "act_qmaster" with qping, up to 30 times
#    with 2 seconds between attempts. Prints a hint if it never answers.
#    Does not change the exit status of the script.
#
CheckRunningQmaster()
{
    masterhost=$(cat "$SGE_ROOT/$SGE_CELL/common/act_qmaster")
    running=false
    loop=0

    ResolveQmasterPort

    while [ "$running" = false ] && [ "$loop" -ne 30 ]; do
        if qping -info "$masterhost" "$SGE_QMASTER_PORT" qmaster 1 > /dev/null 2>&1; then
            running=true
        else
            sleep 2
            # Re-read the file on every attempt.
            masterhost=$(cat "$SGE_ROOT/$SGE_CELL/common/act_qmaster")
            loop=$((loop + 1))
        fi
    done

    if [ "$running" = false ]; then
        echo
        echo "sge_qmaster didn't start!"
        echo "Please check the messages file"
        echo
    fi
}

#---------------------------------------------------------------------------
# usage
#    Prints the help text and terminates the script with status 1.
#
usage()
{
    echo "Grid Engine start/stop script. Valid parameters are:"
    echo ""
    echo " \"start\" start qmaster daemon"
    echo " \"stop\" shutdown qmaster daemon"
    echo " \"-qmaster\" only start/stop qmaster (if applicable)"
    echo " \"-shadowd\" only start/stop shadowd (if applicable)"
    echo " \"-migrate\" shutdown qmaster if it's running on another"
    echo " host and restart it on this host"
    echo " Migration only works if this host is an admin host"
    echo ""
    echo "Only one of the parameters \"start\", \"stop\" is allowed."
    echo "Only one of the parameters beginning with \"-\" is allowed."
    echo
    echo "Default for \"stop\" is shutting down all components."
    echo
    exit 1
}

#---------------------------------------------------------------------------
# CheckArgs
#    Arguments: $1 - one of -qmaster, -shadowd, -migrate
#    Globals:   sets qmaster, shadowd and (for -migrate) migrate_qmaster
#    Any other argument prints the usage text and exits.
#
CheckArgs()
{
    case "$1" in
        -qmaster)
            qmaster=true
            shadowd=false
            ;;
        -shadowd)
            qmaster=false
            shadowd=true
            ;;
        -migrate)
            migrate_qmaster=true
            qmaster=true
            shadowd=false
            ;;
        *)
            usage
            ;;
    esac
}

utilbin_dir=/usr/libexec/gridengine/utilbin
# With the hard-coded path above this check cannot trigger; kept as a
# safeguard in case the assignment is ever changed to "none".
if [ "$utilbin_dir" = "none" ]; then
    echo "can't determine path to Grid Engine utility binaries"
    exit 6
fi

HOST=$("$utilbin_dir/gethostname" -aname)
# Host name up to the first dot; derived from HOST instead of running the
# helper a second time.
UQHOST=${HOST%%.*}

CheckIfShadowMasterHost "$HOST"

lockfile=/var/lock/subsys/sgemaster

# Default actions
qmaster=true
shadowd=true
qstd=false
migrate_qmaster=false

#---------------------------------------------------------------------------
# start
#    Starts sge_qmaster (if selected and this host is, or becomes through
#    migration, the qmaster host) and sge_shadowd (if selected and this
#    host is a shadow master host).
#    Returns: 0 if nothing that was started failed, otherwise the status
#             of the first failing daemon start. Exits directly with
#             1 (migration not possible) or 5 (qmaster binary missing).
#
start() {
    # Do not inherit a status left behind by a previous stop() (restart).
    retval=0

    # qmaster_host=true if act_qmaster names this host
    qmaster_host=$(CheckIfQmasterHost "$HOST")
    primary_qmaster_host=$(CheckIfPrimaryQmasterHost "$HOST")

    if [ "$qmaster" = true ] && [ "$qmaster_host" = true ] &&
       [ "$migrate_qmaster" = true ]; then
        echo " qmaster and scheduler running on this host. Will not migrate qmaster."
        exit 1
    fi

    [ -x "$master_exec" ] || exit 5

    # Take over the qmaster role from another host if this is the primary
    # qmaster host or a migration was requested explicitly.
    if [ "$qmaster" = true ] && [ "$qmaster_host" = false ] &&
       { [ "$primary_qmaster_host" = true ] || [ "$migrate_qmaster" = true ]; }; then
        actual_qmaster_host=$(cat "$SGE_ROOT/$SGE_CELL/common/act_qmaster")
        echo " shutting down qmaster and scheduler on host \"$actual_qmaster_host\" ..."

        qconf_output=$(qconf -ks 2>&1 | grep "denied")
        if [ "$qconf_output" != "" ]; then
            echo " denied: host \"$HOST\" is no admin host."
            exit 1
        fi
        qconf -km > /dev/null 2>&1

        # The port must be known before qping is used. With an empty port
        # qping fails on its arguments, and that failure would wrongly be
        # taken as "old qmaster is down".
        ResolveQmasterPort

        # Wait (up to 10 x 3 seconds) until the old qmaster stops answering.
        qping_count=0
        qping_retries=10
        qping_exit_state=0
        while [ "$qping_count" -lt "$qping_retries" ]; do
            qping -info "$actual_qmaster_host" "$SGE_QMASTER_PORT" qmaster 1 > /dev/null 2>&1
            qping_exit_state=$?
            if [ "$qping_exit_state" -ne 0 ]; then
                break
            fi
            sleep 3
            qping_count=$((qping_count + 1))
        done

        if [ "$qping_exit_state" -eq 0 ]; then
            # qmaster is still running
            echo " qmaster on host $actual_qmaster_host still alive. Cannot migrate qmaster."
            exit 1
        fi

        # Wait (up to 10 x 3 seconds) for the lock file of the old qmaster.
        lock_file_read_retries=10
        lock_file_read_count=0
        lock_file_found=0
        while [ "$lock_file_read_count" -lt "$lock_file_read_retries" ]; do
            if [ -f "$qmaster_spool_dir/lock" ]; then
                lock_file_found=1
                break
            fi
            sleep 3
            lock_file_read_count=$((lock_file_read_count + 1))
        done

        if [ "$lock_file_found" -eq 0 ]; then
            # old qmaster did not write lock file
            echo " old qmaster did not write lock file. Cannot migrate qmaster."
            echo " Please verify that qmaster on host $actual_qmaster_host is down"
            echo " and make sure that the lock file in qmaster spool directory is"
            echo " read-able."
            exit 1
        fi

        qmaster_host=true
    fi

    if [ "$qmaster" = true ] && [ "$qmaster_host" = true ]; then
        echo -n $"Starting $master_prog: "
        daemon --check "$master_prog" --pidfile="$master_pidfile" "$master_exec"
        retval=$?
        CheckRunningQmaster
    elif [ "$qmaster" = true ] && [ "$qmaster_host" = false ]; then
        echo
        echo "sge_qmaster didn't start!"
        echo "This is not a qmaster host!"
        echo "Please, check your act_qmaster file!"
        echo
    fi

    if [ "$shadowd" = true ] && [ "$shadow_host" = true ]; then
        # Prefer the pid file named after the full host name and fall back
        # to the one named after the short host name.
        pidfile=$qmaster_spool_dir/shadowd_$HOST.pid
        [ -f "$pidfile" ] || pidfile=$qmaster_spool_dir/shadowd_$UQHOST.pid

        echo -n $"Starting sge_shadowd: "
        daemon --check sge_shadowd --pidfile="$pidfile" /usr/bin/sge_shadowd
        shadowd_status=$?
        # A successful shadowd start must not hide a failed qmaster start.
        if [ "$retval" -eq 0 ]; then
            retval=$shadowd_status
        fi
    fi

    echo
    [ "$retval" -eq 0 ] && touch "$lockfile"
    return "$retval"
}

#---------------------------------------------------------------------------
# stop
#    Stops sge_shadowd (if selected and this host is a shadow master host)
#    and sge_qmaster (if selected and act_qmaster names this host).
#    Returns: 0 if nothing that was stopped failed, otherwise the status
#             of the first failing killproc call.
#
stop() {
    retval=0

    # Honour the -qmaster / -shadowd selection here as well, like start().
    if [ "$shadowd" = true ] && [ "$shadow_host" = true ]; then
        prog=sge_shadowd
        pidfile=$qmaster_spool_dir/shadowd_$UQHOST.pid
        [ -f "$pidfile" ] || pidfile=$qmaster_spool_dir/shadowd_$HOST.pid

        # Stop shadowd via killproc
        echo -n $"Stopping $prog: "
        killproc -p "$pidfile" "$prog"
        retval=$?
    fi

    if [ "$qmaster" = true ] &&
       [ "$(CheckIfQmasterHost "$HOST")" = true ]; then
        # Stop qmaster via killproc
        echo -n $"Stopping $master_prog: "
        killproc -p "$master_pidfile" "$master_prog"
        qmaster_status=$?
        # Keep the first failure instead of adding exit codes together.
        if [ "$retval" -eq 0 ]; then
            retval=$qmaster_status
        fi
    fi

    echo
    [ "$retval" -eq 0 ] && rm -f "$lockfile"
    return "$retval"
}

# restart: stop followed by start; the status is that of start.
restart() {
    stop
    start
}

# reload: implemented as a full restart.
reload() {
    restart
}

# force_reload: implemented as a full restart.
force_reload() {
    restart
}

# rh_status: report the state of sge_qmaster through the generic status
# helper. Only the qmaster is checked, not sge_shadowd.
rh_status() {
    status "$master_prog"
}

# rh_status_q: same as rh_status, but without any output.
rh_status_q() {
    rh_status > /dev/null 2>&1
}

# Dispatch: $1 is the action, an optional $2 (-qmaster, -shadowd or
# -migrate) limits the components for start and stop. "$1" is then called
# as the function of the same name.
case "$1" in
    start)
        rh_status_q && exit 0
        [ -n "$2" ] && CheckArgs "$2"
        "$1"
        ;;
    stop)
        rh_status_q || exit 0
        [ -n "$2" ] && CheckArgs "$2"
        "$1"
        ;;
    restart)
        "$1"
        ;;
    reload)
        rh_status_q || exit 7
        "$1"
        ;;
    force-reload)
        force_reload
        ;;
    status)
        rh_status
        ;;
    condrestart|try-restart)
        rh_status_q || exit 0
        restart
        ;;
    *)
        echo $"Usage: $0 {start|stop|status|restart|try-restart|reload|force-reload}"
        exit 2
esac
exit $?