#!/bin/bash
#
# Copyright 2003-2004 Red Hat, Inc.
#
# Author(s):
#     Hardy Merrill <hmerrill at redhat.com>
#     Lon Hohberger <lhh at redhat.com>
#     Michael Moon <Michael dot Moon at oracle.com>
#
# This program is Open Source software.  You may modify and/or redistribute
# it pursuant to the terms of the Open Software License version 2.1, which
# is available from the following URL and is included herein by reference:
#
# 	http://opensource.org/licenses/osl-2.1.php
#
# chkconfig: 345 99 01
# description: Service script for starting/stopping      \
#              Oracle(R) Application Server software on: \
#			Red Hat Enterprise Linux 2.1 AS  \
#			Red Hat Enterprise Linux 3 AS    \
#			Red Hat Enterprise Linux 3 ES
#
# NOTES:
#
# (1) You can comment out the LOCKFILE declaration below.  This will prevent
# the need for this script to access anything outside of the ORACLE_HOME 
# path.
#
# (2) You MUST customize ORACLE_USER, ORACLE_HOME, and ORACLE_SID to match
# your installation!
#
# (3) Do NOT place this script in shared storage; place it in ORACLE_USER's
# home directory.
#
# Oracle is a registered trademark of Oracle Corporation.
# Oracle9i is a trademark of Oracle Corporation.
# All other trademarks are property of their respective owners.
#

# Pull in the distribution's init helpers (status, action, echo_success,
# echo_failure, ...).
. /etc/init.d/functions

# Short name of this script, used for log tags and temporary/log file names.
# "$0" is quoted so that a path containing whitespace does not break basename.
declare SCRIPT
SCRIPT="$(basename -- "$0")"


# When invoked as a cluster resource agent, the resource attributes arrive
# as OCF_RESKEY_* environment variables; they take precedence over the
# defaults configured further down.
[ -n "$OCF_RESKEY_user" ] && ORACLE_USER=$OCF_RESKEY_user
[ -n "$OCF_RESKEY_home" ] && ORACLE_HOME=$OCF_RESKEY_home
[ -n "$OCF_RESKEY_name" ] && ORACLE_SID=$OCF_RESKEY_name
[ -n "$OCF_RESKEY_lockfile" ] && LOCKFILE=$OCF_RESKEY_lockfile


######################################################
# Customize these to match your Oracle installation. #
######################################################
#
# 1. Oracle user.  Must be the same across all cluster members.  In the event
#    that this script is run by the super-user, it will automatically switch
#    to the Oracle user and restart.  Oracle needs to run as the Oracle
#    user, not as root.
#
: "${ORACLE_USER:=oracle}"

#
# 2. Oracle home.  Chosen while installing Oracle.  In a cluster this is
#    normally the mount point of the file system that carries the Oracle
#    Infrastructure service.
#
: "${ORACLE_HOME:=/mnt/oracle/oracle-10g}"

#
# 3. The Oracle SID, also chosen while installing Oracle.
#
: "${ORACLE_SID:=asdb}"

#
# 4. Lock file location.  The oracle user usually may not write to
# /var/lock/subsys, so the default lives in that user's home directory.
#
: "${LOCKFILE:=/home/$ORACLE_USER/.oracle-ias.lock}"
#[ -n "$LOCKFILE" ] || LOCKFILE="$ORACLE_HOME/.oracle-ias.lock"
#[ -n "$LOCKFILE" ] || LOCKFILE="/var/lock/subsys/oracle-ias" # Watch privileges

###########################################################################
# Make the settings visible to the Oracle tools started from this script.
export ORACLE_USER
export ORACLE_HOME
export ORACLE_SID
export LOCKFILE


##########################
# Set up paths we'll use.
#
# Oracle's shared libraries and command line tools (sqlplus, lsnrctl, emctl,
# opmnctl, ...) are looked up below ORACLE_HOME.
export LD_LIBRARY_PATH=$ORACLE_HOME/lib:$ORACLE_HOME/opmn/lib
export PATH=$ORACLE_HOME/bin:$ORACLE_HOME/opmn/bin:$ORACLE_HOME/dcm/bin:$PATH

# How many times a failed component is restarted before status reports failure.
declare -i	RESTART_RETRIES=3

# Database background processes to monitor; each one is looked up as
# ora_<name>_$ORACLE_SID.
declare -r	DB_PROCNAMES="pmon"
#declare -r	DB_PROCNAMES="pmonXX" # testing
#declare -r	DB_PROCNAMES="pmon smon dbw0 lgwr"

# Process name of the Oracle listener.
declare -r	LSNR_PROCNAME="tnslsnr"
#declare -r	LSNR_PROCNAME="tnslsnrXX" # testing

#
# LOCKFILE has already been chosen above (from OCF_RESKEY_lockfile or the
# customization section).  Only freeze it here: assigning it a second time
# would silently discard the configured value.
#
declare -r	LOCKFILE

##########################################################
# (Hopefully) No user-serviceable parts below this line. #
##########################################################
meta_data()
{
	# Print the OCF/rgmanager resource-agent description (XML) on stdout.
	# Parameter names must match the OCF_RESKEY_* variables this script
	# reads: name -> ORACLE_SID, user -> ORACLE_USER, home -> ORACLE_HOME.
	cat <<EOT
<?xml version="1.0" ?>
<resource-agent name="oracleas" version="rgmanager 2.0">
    <version>1.0</version>

    <longdesc lang="en">
	Oracle Application 10g Failover Instance
    </longdesc>
    <shortdesc lang="en">
	Oracle Application 10g Failover Instance
    </shortdesc>

    <parameters>
        <parameter name="name" primary="1">
	    <longdesc lang="en">
		Instance name (SID) of oracle instance
	    </longdesc>
            <shortdesc lang="en">
		Oracle SID
            </shortdesc>
	    <content type="string"/>
        </parameter>

        <parameter name="user" unique="1" required="1">
	    <longdesc lang="en">
		Oracle user name.  This is the user name of the Oracle
		user which the Oracle AS instance runs as.
	    </longdesc>
            <shortdesc lang="en">
		Oracle User Name
            </shortdesc>
	    <content type="string"/>
        </parameter>

        <parameter name="home" unique="1" required="1">
	    <longdesc lang="en">
		This is the Oracle (application, not user) home directory.
		This is configured when you install Oracle.
	    </longdesc>
            <shortdesc lang="en">
		Oracle Home Directory
            </shortdesc>
	    <content type="string"/>
        </parameter>

    </parameters>

    <actions>
        <action name="start" timeout="900"/>
	<action name="stop" timeout="90"/>
        <action name="recover" timeout="990"/>

	<!-- Checks the database, listener and OPMN-managed processes -->
	<action name="status" timeout="10"/>
	<action name="monitor" timeout="10"/>

	<!-- Deeper check levels; the same checks are performed -->
	<action name="status" depth="10" timeout="30" interval="5m"/>
	<action name="monitor" depth="10" timeout="30" interval="5m"/>

	<!-- Deeper check levels; the same checks are performed -->
	<action name="status" depth="20" timeout="90" interval="10m"/>
	<action name="monitor" depth="20" timeout="90" interval="10m"/>

	<action name="meta-data" timeout="5"/>
	<action name="verify-all" timeout="5"/>
    </actions>

    <special tag="rgmanager">
	<attributes maxinstances="1"/>
    </special>
</resource-agent>
EOT
}


#
# "action"-like macro supporting functions
#
faction()
{
	# usage: faction <label> <command> [args...]
	#
	# Prints <label> without a newline, runs <command> with its arguments
	# in the current shell (so shell functions work), then prints the
	# init-style success/failure marker.
	# Returns 0 if the command succeeded, 1 otherwise.
	echo -n "$1"
	shift

	# "$@" keeps every argument intact; $* would re-split arguments that
	# contain whitespace.
	if "$@"; then
		echo_success
		echo
		return 0
	fi

	echo_failure
	echo
	return 1
}


#
# Start Oracle9i (database portion)
#
start_db()
{
	# Start the database by feeding "startup" / "quit" to sqlplus.
	#
	# sqlplus output is captured in /tmp/$SCRIPT-start.log and handed to
	# initlog.  Returns 0 on success, 1 if sqlplus exits non-zero or its
	# output contains the word "failure".
	declare logfile
	declare -i rv

	logfile="/tmp/$SCRIPT-start.log"

	#
	# The commands are piped straight into sqlplus.  No predictable,
	# world-visible temporary command file is created in /tmp, so nothing
	# can be left behind on an error path.
	#
	printf 'startup\nquit\n' | sqlplus "/ as sysdba" &> "$logfile"
	rv=$?

	# Dump logfile to /var/log/messages
	initlog -q -c "cat $logfile"

	if [ $rv -ne 0 ]; then
		echo "ORACLE_HOME Incorrectly set?"
		echo "See $logfile for more information."
		return 1
	fi

	#
	# If we see:
	# ORA-.....: failure, we failed
	#
	if grep -q "failure" "$logfile"; then
		echo "ORACLE_SID Incorrectly set?"
		echo "See $logfile for more information."
		return 1
	fi

	return 0
}


#
# Stop Oracle9i (database portion)
#
stop_db()
{
	# Stop the database by feeding "shutdown abort" / "quit" to sqlplus.
	#
	# sqlplus output is captured in /tmp/$SCRIPT-stop.log and handed to
	# initlog.  Returns 0 on success, 1 if sqlplus exits non-zero or its
	# output contains the word "failure".
	declare logfile
	declare -i rv

	logfile="/tmp/$SCRIPT-stop.log"

	#
	# The commands are piped straight into sqlplus.  No predictable,
	# world-visible temporary command file is created in /tmp, so nothing
	# can be left behind on an error path.
	#
	printf 'shutdown abort\nquit\n' | sqlplus "/ as sysdba" &> "$logfile"
	rv=$?

	# Dump logfile to /var/log/messages
	initlog -q -c "cat $logfile"

	if [ $rv -ne 0 ]; then
		echo "ORACLE_HOME Incorrectly set?"
		echo "See $logfile for more information."
		return 1
	fi

	#
	# If we see 'failure' in the log, we're done.
	#
	if grep -q failure "$logfile"; then
		echo_failure
		echo
		echo "Possible reason: ORACLE_SID Incorrectly set."
		echo "See $logfile for more information."
		return 1
	fi

	return 0
}


#
# Destroy any remaining processes with refs to $ORACLE_HOME
#
force_cleanup()
{
	# Send SIGKILL to every process whose "ps ax" line mentions
	# $ORACLE_HOME, logging each successful kill through initlog.
	# Always returns 0.
	declare pids
	declare pid

	# With an empty ORACLE_HOME the match below would select every
	# process on the system; refuse to do anything in that case.
	[ -n "$ORACLE_HOME" ] || return 0

	#
	# -F: treat the path as a literal string, not as a regular expression.
	# The awk stage drops this script's own PID so that the cleanup can
	# not kill itself half way through.
	#
	pids=$(ps ax | grep -F -- "$ORACLE_HOME" | grep -v grep |
		awk -v self="$$" '$1 != self { print $1 }')

	initlog -n "$SCRIPT" -s "<err> Not all Oracle processes exited cleanly, killing"

	# $pids is deliberately unquoted: one PID per word.
	for pid in $pids; do
		if kill -9 "$pid"; then
			initlog -n "$SCRIPT" -s "Killed $pid"
		fi
	done

	return 0
}



#
# Wait for oracle processes to exit.  Time out after 90 seconds
#
exit_idle()
{
	# usage: exit_idle [timeout-seconds]
	#
	# Poll once per second until no process in "ps ax" mentions
	# $ORACLE_HOME.  When the timeout (default: 90 seconds) is reached,
	# force_cleanup is called.  Always returns 0.
	declare -i limit=${1:-90}
	declare -i n=0

	# An empty pattern would match every process; nothing to wait for.
	[ -n "$ORACLE_HOME" ] || return 0

	#
	# The path is quoted and matched literally (-F) so that a home
	# directory containing whitespace or regex characters still works.
	# This script's own PID is ignored so that it never waits for itself.
	#
	while ps ax | grep -F -- "$ORACLE_HOME" | grep -v grep |
		awk -v self="$$" '$1 != self { found = 1 } END { exit !found }'; do
		if [ $n -ge $limit ]; then
			force_cleanup
			return 0
		fi
		sleep 1
		((n++))
	done
	return 0
}


#
# Get database background process status.  Restart it if it failed and
# we have seen the lock file.
#
get_db_status()
{
	# usage: get_db_status <subsys_lock>
	#
	# <subsys_lock> is 0 when the service is expected to be running and
	# non-zero when it is not.
	#
	# Returns 0 if every process in DB_PROCNAMES is running (possibly
	# after a successful restart), 3 if a process is down and the service
	# is not expected to run, 1 if a restart was needed and failed.
	declare -i subsys_lock=$1
	declare -i i=0
	declare procname	# kept local so it does not leak to the caller
	declare ora_procname

	for procname in $DB_PROCNAMES ; do

		ora_procname="ora_${procname}_${ORACLE_SID}"

		if status "$ora_procname"; then
			# This one's okay; go to the next one.
			continue
		fi

		#
		# We're not supposed to be running, and we are,
		# in fact, not running...
		# XXX only works when monitoring one db process; consider
		# extending in future.
		#
		if [ $subsys_lock -ne 0 ]; then
			return 3
		fi

		for (( i=RESTART_RETRIES ; i; i-- )) ; do
			# this db process is down - stop and
			# (re)start all ora_XXXX_$ORACLE_SID processes
			initlog -q -n "$SCRIPT" -s "Restarting Oracle Database..."

			# A failed stop is fatal; do not retry.
			stop_db || return 1

			if start_db; then
				# ora_XXXX_$ORACLE_SID processes started
				# successfully, so leave the retry loop
				# (i stays non-zero).
				break
			fi
		done

		if [ $i -eq 0 ]; then
			# every stop/start attempt failed
			return 1
		fi
	done
	return 0
}


#
# Get the status of the Oracle listener process
#
get_lsnr_status() 
{
	# usage: get_lsnr_status <subsys_lock>
	#
	# <subsys_lock> is 0 when the service is expected to be running and
	# non-zero when it is not.
	#
	# Returns 0 if the listener is running (possibly after a successful
	# restart), 3 if it is down and not expected to run, 1 if it should
	# be running and could not be restarted.
	declare -i subsys_lock=$1
	declare -i i	# retry counter; local so it does not leak to the caller

	if status "$LSNR_PROCNAME"; then
		return 0 # Listener is running fine
	fi

	#
	# We're not supposed to be running, and we are,
	# in fact, not running.  Return 3
	#
	if [ $subsys_lock -ne 0 ]; then
		return 3
	fi

	#
	# Listener is NOT running (but should be) - try to restart
	#
	for (( i=RESTART_RETRIES ; i; i-- )) ; do

		action "Restarting Oracle listener:" lsnrctl start
		if lsnrctl status >& /dev/null; then
			break # Listener was (re)started and is running fine
		fi
	done

	if [ $i -eq 0 ]; then
		# every restart attempt failed
		return 1
	fi

	# Double-check through the process table.
	if ! status "$LSNR_PROCNAME"; then
		return 1 # Problem restarting the Listener
	fi
	return 0 # Success restarting the Listener
}


#
# usage: get_opmn_proc_status <ias-component> [process-type]
#
# Get the status of a specific OPMN-managed process.  If process-type
# is not specified, assume the process-type is the same as the ias-component.
# If the process is not running, try to restart the given process-type
# (up to RESTART_RETRIES times).  The lock file is not consulted here.
#
get_opmn_proc_status()
{
	# Looks up <ias-component>/<process-type> in "opmnctl status" and
	# prints a one-line state report.  A process counts as running when
	# its status column reads "Alive" or "Init".
	#
	# Returns 0 if the process is (or has become) running, 1 if no
	# component was given or the process is still down after
	# RESTART_RETRIES checks.
	declare comp=$1
	declare opmntype=$2
	declare type_pretty
	declare _pid _status
	declare -i i	# retry counter; local so it does not leak to the caller
	
	[ -n "$comp" ] || return 1
	if [ -z "$opmntype" ]; then
		opmntype=$comp
	else
		type_pretty=" [$opmntype]"
	fi

	for (( i=RESTART_RETRIES ; i; i-- )) ; do

		# Reduce the matching table row(s) to "<pid> <status>".
		_status=$(opmnctl status | grep "^$comp " | grep " $opmntype " |
			cut -d '|' -f3,4 | sed -e 's/ //g' -e 's/|/ /g')

		# $_status is deliberately unquoted: several rows are folded
		# onto one line and the first pid/status pair is used.
		_pid=$(echo $_status | cut -f1 -d' ')
		_status=$(echo $_status | cut -f2 -d' ')
		if [ "${_status}" == "Alive" ] || [ "${_status}" == "Init" ]; then
			if [ $i -lt $RESTART_RETRIES ] ; then
				echo "  $comp$type_pretty restarted"
			fi
			echo "  $comp$type_pretty (pid $_pid) is running..."
			break
		else
			echo "  $comp$type_pretty is stopped"

			#
			# Try to restart it, but don't worry if we fail.  OPMN
			# is supposed to handle restarting these anyway.
			#
			# If it's running and you tell OPMN to "start" it,
			# you will get an error.
			#
			# If it's NOT running and you tell OPMN to "restart"
			# it, you will also get an error.
			#
			opmnctl startproc process-type="$opmntype" &> /dev/null
		fi
	done

	if [ $i -eq 0 ]; then
		# restarts failed - return 1 (failure)
		return 1
	fi

	return 0
}


#
# Get the status of the OPMN-managed processes.
#
get_opmn_status()
{
	# usage: get_opmn_status <subsys_lock>
	#
	# Reports whether OPMN itself is up and then checks the OID,
	# HTTP_Server and OC4J [OC4J_SECURITY] processes it manages.
	#
	# Returns 0 if everything is running, 3 if OPMN is down and the
	# service is not expected to run (<subsys_lock> non-zero), 1 in every
	# other failure case.
	declare -i lock_state=$1
	declare -i failed=0
	declare spec

	opmnctl status &> /dev/null
	if [ $? -eq 2 ]; then
		# Exit code 2: OPMN not running??
		echo "opmn is stopped"

		# Expected to be down?  That's okay then.
		[ $lock_state -eq 0 ] || return 3

		# Should be up.  A full opmn restart is not handled here. XXX
		return 1
	fi

	# OPMN answers; list the state of everything it manages.
	echo "opmn is running..."
	echo "opmn components:"

	# "<ias-component> [process-type]" for each managed process.  $spec
	# is expanded unquoted on purpose so that it splits into arguments.
	for spec in "OID" "HTTP_Server" "OC4J OC4J_SECURITY"; do
		if ! get_opmn_proc_status $spec; then
			failed=failed+1
		fi
	done

	# Non-zero: one or more OPMN-managed processes failed and could not
	# be restarted.
	return $(( failed != 0 ))
}


#
# Helps us keep a running status so we know what our ultimate return
# code will be.  Returns 1 if the $1 and $2 are not equivalent, otherwise
# returns $1.  The return code is meant to be the next $1 when this is
# called, so, for example:
#
# update_status 0   <-- returns 0
# update_status $? 0 <-- returns 0
# update_status $? 3 <-- returns 1 (values different - error condition)
# update_status $? 1 <-- returns 1 (same, but happen to be error state!)
#
# update_status 3
# update_status $? 3 <-- returns 3
#
# (and so forth...)
#
update_status()
{
	# usage: update_status <status> [other-status]
	#
	# With one argument, hands that status straight back.  With two,
	# returns the first status if both agree and 1 if they differ.
	declare -i prev=$1
	declare -i cur=$2

	# Only compare when a second status was actually supplied.
	if [ -n "$2" ] && [ $prev -ne $cur ]; then
		return 1
	fi

	return $prev
}


#
# Print an error message to the user and exit.
#
oops()
{
	# usage: oops <setting-name>
	#
	# Tells the user which setting failed validation, then terminates
	# the script with exit code 1.
	printf '%s\n' \
		"Please configure this script ($0) to" \
		"match your installation." \
		"" \
		"    $1 failed validation checks."
	exit 1
}


#
# Do some validation on the user-configurable stuff at the beginning of the
# script.
#
validation_checks()
{
	# usage: validation_checks <original script arguments...>
	#
	# Verifies ORACLE_USER, ORACLE_HOME and ORACLE_SID, re-executes the
	# script as ORACLE_USER when started by root, and finally changes
	# into ORACLE_HOME.  Returns 0 on success; every failure terminates
	# the script with exit code 1.

	#
	# If the oracle user doesn't exist, we're done.
	#
	[ -n "$ORACLE_USER" ] || oops ORACLE_USER
	id -u "$ORACLE_USER" > /dev/null || oops ORACLE_USER
	id -g "$ORACLE_USER" > /dev/null || oops ORACLE_USER

	#
	# If the oracle home isn't a directory, we're done
	#
	[ -n "$ORACLE_HOME" ] || oops ORACLE_HOME
	[ -d "$ORACLE_HOME" ] || oops ORACLE_HOME

	#
	# If the oracle SID is NULL, we're done
	#
	[ -n "$ORACLE_SID" ] || oops ORACLE_SID

	#
	# Super user? Automatically change UID and exec as oracle user.
	# Oracle needs to be run as the Oracle user, not root!
	#
	# NOTE(review): whether the OCF_RESKEY_* / OCF_CHECK_LEVEL variables
	# survive this re-exec depends on the local sudo environment policy
	# - confirm against the sudoers configuration.
	#
	if [ "$(id -u)" = "0" ]; then
		echo "Restarting as $ORACLE_USER."
		#exec su - $ORACLE_USER -c "$0 $*"
		# "$0" and "$@" are quoted so paths and arguments containing
		# whitespace are handed over unchanged.
		exec sudo -u "$ORACLE_USER" "$0" "$@"
	fi

	#
	# If we're not root and not the Oracle user, we're done.
	#
	if [ "$(id -u)" != "$(id -u "$ORACLE_USER")" ] ||
	   [ "$(id -g)" != "$(id -g "$ORACLE_USER")" ]; then
		echo "$0 must be run as root or as $ORACLE_USER." >&2
		exit 1
	fi

	#
	# Go home.  Failing to get there is a configuration problem.
	#
	cd "$ORACLE_HOME" || oops ORACLE_HOME

	return 0
}


#
# Start Oracle9i Application Server Infrastructure
#
start_ias()
{
	# Start, in order: database, listener, Enterprise Manager and the
	# OPMN-managed infrastructure.  Stops at the first step that fails
	# and returns 1.  Returns 0 once everything has been started and the
	# lock file (if one is configured) has been created.
	faction "Starting Oracle Database:" start_db || return 1
	action "Starting Oracle Listener:" lsnrctl start || return 1
	action "Starting Oracle EM:" emctl start em || return 1
	action "Starting iAS Infrastructure:" opmnctl startall || return 1

	# Quoted: the lock file path may contain whitespace.
	if [ -n "$LOCKFILE" ]; then
		touch "$LOCKFILE"
	fi
	return 0
}


#
# Stop Oracle9i Application Server Infrastructure
#
stop_ias()
{
	# Stop, in order: OPMN-managed infrastructure, Enterprise Manager,
	# database and listener, then wait for the remaining Oracle processes
	# to exit.  Returns 1 as soon as stopping the infrastructure, EM or
	# the database fails; otherwise removes the lock file (if one is
	# configured) and returns 0.
	action "Stopping iAS Infrastructure:" opmnctl stopall || return 1
	action "Stopping Oracle EM:" emctl stop em || return 1
	faction "Stopping Oracle Database:" stop_db || return 1

	# A listener that fails to stop is not treated as fatal.
	action "Stopping Oracle Listener:" lsnrctl stop

	if ! faction "Waiting for all Oracle processes to exit:" exit_idle; then
		echo "WARNING: Not all Oracle processes exited cleanly"
	fi

	# Quoted: the lock file path may contain whitespace.
	if [ -n "$LOCKFILE" ]; then
		rm -f "$LOCKFILE"
	fi
	return 0
}


#
# Find and display the status of iAS infrastructure.
#
# This has three parts:
# (1) Oracle database itself
# (2) Oracle listener process
# (3) OPMN and OPMN-managed processes
#
# - If all are (cleanly) down, we return 3.  In order for this to happen,
# $LOCKFILE must not exist.
#
# - If $LOCKFILE exists (or no lock file is configured), we try and restart
# certain failed parts of the service - as this may be running in a
# clustered environment.
#
# - If some but not all are running (and, if $LOCKFILE exists, we could not
# restart the failed portions), we return 1 (ERROR)
#
# - If all are running, return 0.  In the "all-running" case, we recreate
# $LOCKFILE if it does not exist.
#
status_ias()
{
	# usage: status_ias [check-level]
	#
	# Combines the database, listener and OPMN checks into one status:
	# the common value if all three agree, 1 otherwise.  [check-level] is
	# handed on to the individual checks.
	declare -i subsys_lock=1
	declare -i last 
	declare -i depth=$1

	#
	# Check for lock file.  Crude and rudimentary, but it works.
	# subsys_lock=0 means "the service is expected to be running"; that
	# is the case when the lock file exists or none is configured.
	# $LOCKFILE is quoted so a path with whitespace is tested correctly.
	#
	if [ -z "$LOCKFILE" ] || [ -f "$LOCKFILE" ]; then
		subsys_lock=0 
	fi

	# Check database status
	get_db_status $subsys_lock $depth
	update_status $? # Start
	last=$?

	# Check & report listener status
	get_lsnr_status $subsys_lock $depth
	update_status $? $last
	last=$?
	
	# Check & report opmn / opmn-managed process status
	get_opmn_status $subsys_lock $depth
	update_status $? $last
	last=$?

	#
	# No lock file, but everything's running.  Put the lock
	# file back. XXX - this kosher?
	#
	if [ $last -eq 0 ] && [ $subsys_lock -ne 0 ]; then
		touch "$LOCKFILE"
	fi

	return $last
}


########################
# Do some real work... #
########################
# meta-data must work without a valid Oracle installation and without
# switching users, so it is answered before any validation is done.
if [ "$1" = "meta-data" ]; then
	meta_data
	exit 0
fi

# Validates the configuration and, when started as root, re-executes this
# script as ORACLE_USER.  "$@" hands the arguments over unchanged.
validation_checks "$@"

case "$1" in
	start)
		start_ias
		exit $?
		;;
	stop)
		stop_ias
		exit $?
		;;
	status|monitor)
		status_ias $OCF_CHECK_LEVEL
		exit $?
		;;
	restart|recover)
		# "recover" is advertised in the meta-data; treat it like
		# a restart.
		"$0" stop || exit $?
		"$0" start || exit $?
		exit 0
		;;
	verify-all)
		# The configuration was already checked by validation_checks.
		exit 0
		;;
	*)
		echo "usage: $SCRIPT {start|stop|status|restart}"
		exit 1
		;;
esac	
exit 0
