#! /bin/bash
export LANG=en_US.UTF-8
#WARNLIMIT=-1  # warning threshold: a value above it should raise an alert
#ERRORLIMIT=-1 # error threshold: a value above it is treated as an error

# Settings filled in from the command line, plus shared state.
PORT=
IP=localhost
FLAGE=
FLAGEID=
# Whether to output FLAGPID
#FLAGEOPID=1
# Whether to output RUNTIME
#RUNTIME=1
# Whether to output STARTTIME
STARTTIME=0
VERBOSE=0
QUOTASTR=
quota=
zookeeper_acquisition_json=""
CORENUM=1

# Metric names in report order (cpname is not part of this list).
# echo ${#keys[@]} shows how many metrics there are.
keys=(
  FLAGEPID
  Memory
  CPU
  RUNTIME
  STARTTIME
  ConnectionNum
  Thread
  ClusterState
  zkState
  bytesReceived
  bytesSent
  zkConnections
  latencyAvg
  latencyMin
  latencyMax
  znodeCount
  outstandingRequests
  packetsReceived
  packetsSent
)

# Metric name -> collected value; every metric starts out as an empty string.
declare -A map=()
for var in "${keys[@]}"; do
  map[$var]=""
done
# Print the status line that belongs to a result code and terminate the script.
# Arguments: $1 - result code: 0 = all metrics back to normal,
#                 3 = component PID could not be determined,
#                 4 = component process is a zombie
# Globals:   DataResultStr (read; appended to the "back to normal" line)
# Exit code: $1, except that the zombie case (4) exits with 3.
gotErr() {
  local code=$1

  # The zombie case is the only one whose exit code differs from its argument.
  if [ "$code" -eq 4 ]; then
    echo "该组件进程为僵尸进程,请确认并请检查该组件状态"
    exit 3
  fi

  if [ "$code" -eq 0 ]; then
    echo "mailstatedes=各项指标恢复正常 statedes=各项指标恢复正常|$DataResultStr"
  elif [ "$code" -eq 3 ]; then
    echo "无法获取到组件PID,疑似组件故障,请确认并请检查监控脚本和运维平台配置"
  fi
  exit $code
}
# Collect the memory share (%MEM, column 4 of `ps aux`) of the monitored process.
# Globals: FLAGEID (read) - PID of the monitored process
#          CURNUM, zookeeper_acquisition_json, map["Memory"] (written)
getMemory() {
  # Compare the PID column exactly. Filtering with `grep $FLAGEID` also keeps
  # every line that merely contains those digits (longer PIDs, command
  # arguments, the grep process itself) and adds their memory to the total.
  CURNUM=$(ps aux \
    | awk -v pid="$FLAGEID" 'pid != "" && $2 == pid { sum += $4 } END { print sum + 0 }')
  zookeeper_acquisition_json="$zookeeper_acquisition_json,'Memory':$CURNUM"
  map["Memory"]="$CURNUM"
}
# Collect the CPU share (%CPU, column 3 of `ps aux`) of the monitored process,
# divided by CORENUM when CORENUM is non-zero.
# Globals: FLAGEID, CORENUM (read)
#          CURNUM, zookeeper_acquisition_json, map["CPU"] (written)
getCPU() {
  # One awk pass does both the filtering and the division:
  #  - the PID column is compared exactly; `grep $FLAGEID` would also match
  #    longer PIDs, command arguments and the grep process itself;
  #  - no dependency on bc, whose absence silently produced "0.00".
  # With a non-zero core count the result is printed with two decimals,
  # otherwise the plain sum is reported.
  CURNUM=$(ps aux | awk -v pid="$FLAGEID" -v cores="$CORENUM" '
    pid != "" && $2 == pid { sum += $3 }
    END {
      if (cores + 0 != 0) {
        printf "%.2f", sum / cores
      } else {
        print sum + 0
      }
    }')
  zookeeper_acquisition_json="$zookeeper_acquisition_json,'CPU':$CURNUM"
  map["CPU"]="$CURNUM"
}
# Collect how long the monitored process has been running (`ps -eo pid,etime`).
# Globals: FLAGEID (read)
#          CURNUM, zookeeper_acquisition_json, map["RUNTIME"] (written)
getRuntime() {
  # The PID is handed to awk with -v instead of being spliced into the awk
  # program text, so an empty or odd value can no longer yield a syntax error.
  CURNUM=$(ps -eo pid,etime \
    | awk -v pid="$FLAGEID" 'pid != "" && $1 == pid { print $2 }')
  zookeeper_acquisition_json="$zookeeper_acquisition_json,'RUNTIME':'$CURNUM'"
  map["RUNTIME"]="$CURNUM"
}
# Collect the start time of the monitored process (`ps -eo pid,lstart`).
# Globals: FLAGEID (read)
#          CURNUM, zookeeper_acquisition_json, map["STARTTIME"] (written)
getStarttime() {
  # The PID is handed to awk with -v instead of being spliced into the awk
  # program text, so an empty or odd value can no longer yield a syntax error.
  # Fields 2-6 are the five words of the lstart timestamp.
  CURNUM=$(ps -eo pid,lstart \
    | awk -v pid="$FLAGEID" 'pid != "" && $1 == pid { print $2, $3, $4, $5, $6 }')
  zookeeper_acquisition_json="$zookeeper_acquisition_json,'STARTTIME':'$CURNUM'"
  map["STARTTIME"]="$CURNUM"
}
# Count the ESTABLISHED connections owned by the monitored process.
# Globals: FLAGEID (read), CURNUM (written)
getConnNum() {
  # Match the PID against the "PID/Program name" column (field 7) and the state
  # against field 6. `grep -w $FLAGEID` matched the number anywhere on the
  # line, so a PID such as 10 also counted every connection to 10.x.x.x.
  CURNUM=$(sudo netstat -apn | awk -v pid="$FLAGEID" '
    pid != "" && $6 == "ESTABLISHED" && index($7, pid "/") == 1 { count++ }
    END { print count + 0 }')
}
# Count the threads of the monitored process.
# Globals: FLAGEID (read), CURNUM (written)
getThreadNum() {
  local line_total
  line_total=$(ps -mp $FLAGEID | wc -l)
  # Two lines of the `ps -mp` listing are not threads (presumably the header
  # and the process summary line), so they are subtracted.
  CURNUM=$(( line_total - 2 ))
}
# Cluster state, obtained by running zookeeper_monitor.jar (expected next to
# this script) against $ServerAddr.
# Globals:
#   ServerAddr       (read)
#   zookeeperMonitor (written) - full path of the jar
#   monitorDate      (written) - raw output of the jar
#   CURNUM           (written) - 2 if any reported entry starts with "2-"
#                                (or is exactly "2"), otherwise 0
#   ErrorResultStr   (appended with the raw jar output when CURNUM is 2)
getClusterState() {
  local script_dir entry
  local -a entries=()

  # Quote every path component: a script directory containing spaces used to
  # split the jar path into several java arguments.
  script_dir=$(cd "$(dirname "$0")" && pwd)
  zookeeperMonitor="${script_dir}/zookeeper_monitor.jar"
  # $ServerAddr stays unquoted on purpose, exactly as before, so the way the
  # address reaches the jar does not change.
  monitorDate=$(java -jar "$zookeeperMonitor" $ServerAddr 2>/dev/null)

  # Entries are separated by commas or whitespace. Reading them with `read -a`
  # splits without filename expansion. read returns non-zero at end of input,
  # which is expected here.
  IFS=$', \t\n' read -r -d '' -a entries <<<"$monitorDate"

  for entry in "${entries[@]}"; do
    # The text before the first "-" is compared literally; the unquoted
    # right-hand side used before was a glob pattern, so "?" or "*" matched 2.
    if [[ "${entry%%-*}" == "2" ]]; then
      CURNUM=2
      ErrorResultStr="${ErrorResultStr} ${monitorDate}"
      return
    fi
  done
  CURNUM=0
}
# Fetch the status overview: send the "mntr" command to $IP:$PORT with nc and
# store the reply in mntr.txt in the current working directory.
# Globals: IP, PORT (read)
getMntr() {
  printf '%s\n' "mntr" | nc $IP $PORT > mntr.txt
}
# Leader/follower role change since the previous run.
# The role is read from the zk_server_state line of ./mntr.txt and compared
# with the role remembered in the state file.
#   0 - role unchanged (or either the current or the remembered role is unknown)
#   2 - role changed and the node is now a follower (leader -> follower)
#   3 - role changed to anything else (documented as follower -> leader)
# Globals:
#   ZK_STATE_FILE (read, optional) - where the last role is remembered;
#                                    defaults to /root/zkState.txt
#   CURNUM (written)
getzkState() {
  local state_file=${ZK_STATE_FILE:-/root/zkState.txt}
  local current previous=""

  current=$(awk '$1 == "zk_server_state" { print $NF }' mntr.txt)
  if [[ -r "$state_file" ]]; then
    previous=$(<"$state_file")
  fi

  # Remember only a role that was actually observed. Writing an empty reading
  # would erase the history, and a role change that happens across one failed
  # poll would then go unnoticed.
  if [[ -n "$current" ]]; then
    printf '%s\n' "$current" > "$state_file"
  fi

  if [[ -z "$current" || -z "$previous" || "$current" == "$previous" ]]; then
    CURNUM=0
  elif [[ "$current" == "follower" ]]; then
    CURNUM=2
  else
    CURNUM=3
  fi
}
# Total of the "recved=" counters over all client connections listed by the
# "cons" command, divided by 1024*1024 and rounded to a whole number.
# NOTE(review): ZooKeeper documents recved/sent in `cons` as packet counts;
# confirm that the bytes-to-MiB style conversion is intended.
# Globals: IP, PORT (read), CURNUM (written; 0 when nothing could be read)
getBytesReceived() {
  CURNUM=$(
    echo "cons" \
      | nc ${IP} ${PORT} \
      | grep -o "recved=[0-9]\+" \
      | awk -F= '{ total += $2 } END { printf "%.0f\n", total / 1024 / 1024 }'
  )
  CURNUM=${CURNUM:-0}
}
# Total of the "sent=" counters over all client connections listed by the
# "cons" command, divided by 1024*1024 and rounded to a whole number.
# NOTE(review): ZooKeeper documents recved/sent in `cons` as packet counts;
# confirm that the bytes-to-MiB style conversion is intended.
# Globals: IP, PORT (read), CURNUM (written; 0 when nothing could be read)
getBytesSent() {
  CURNUM=$(
    echo "cons" \
      | nc ${IP} ${PORT} \
      | grep -o "sent=[0-9]\+" \
      | awk -F= '{ total += $2 } END { printf "%.0f\n", total / 1024 / 1024 }'
  )
  CURNUM=${CURNUM:-0}
}
# Total number of client connections: the zk_num_alive_connections value of
# the mntr snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getzkConnections() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_num_alive_connections" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Average time the server takes to answer a client request: the zk_avg_latency
# value of the mntr snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getlatencyAvg() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_avg_latency" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Longest time the server took to answer a client request: the zk_max_latency
# value of the mntr snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getlatencyMax() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_max_latency" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Shortest time the server took to answer a client request: the zk_min_latency
# value of the mntr snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getlatencyMin() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_min_latency" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Number of znodes: the zk_znode_count value of the mntr snapshot stored in
# ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getznodeCount() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_znode_count" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Number of queued requests exceeding what the server can process: the
# zk_outstanding_requests value of the mntr snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getoutstandingRequests() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_outstanding_requests" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Number of packets received: the zk_packets_received value of the mntr
# snapshot stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getpacketsReceived() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_packets_received" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Number of packets sent: the zk_packets_sent value of the mntr snapshot
# stored in ./mntr.txt.
# Globals: CURNUM (written; 0 when the key is missing or the file unreadable)
getpacketsSent() {
  # Look the key up by its exact name and sanitise only its value. A substring
  # grep also returns any longer key containing the name (giving a multi-line
  # result), and scraping digits from the whole line mixes in the key's digits.
  CURNUM=$(awk '$1 == "zk_packets_sent" {
    value = $2
    gsub(/[^0-9.]/, "", value)
    print value
    exit
  }' mntr.txt)
  if [[ -z "$CURNUM" ]]; then
    CURNUM=0
  fi
}
# Check that the service is alive: find the process listening on $PORT and
# make sure it is not a zombie.
# Globals:
#   PORT (read)
#   CORENUM  (written) - number of "physical id" lines in /proc/cpuinfo
#   FLAGEID  (written) - PID of the listening process
#   ZOMBIE   (written) - the matching `ps` line when the process is a zombie
#   zookeeper_acquisition_json, map["FLAGEPID"] (written)
# Terminates the script through gotErr 3 when no PID can be determined and
# through gotErr 4 when the process is a zombie.
isAlive() {
  CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null)
  CORENUM=${CORENUM:-0}

  # Pick the first LISTEN socket whose local address ends in ":$PORT" and take
  # the PID from the "PID/Program name" column (field 7). Searching the whole
  # line with `grep -w $PORT` could select a line whose PID merely equals the
  # port number, and taking the last field broke on program names with spaces.
  FLAGEID=$(netstat -lnp | awk -v port="$PORT" '
    $6 == "LISTEN" {
      n = split($4, addr, ":")
      if (addr[n] == port) {
        split($7, owner, "/")
        print owner[1]
        exit
      }
    }')

  # Validate the PID before it is used anywhere. netstat prints "-" when it
  # cannot see the owner, which is just as unusable as an empty result.
  if [[ ! "$FLAGEID" =~ ^[0-9]+$ ]]; then
    gotErr 3
  fi

  # Exact PID comparison: a substring match reported a zombie whenever some
  # other zombie's PID merely contained this PID.
  ZOMBIE=$(ps -A -ostat,pid \
    | awk -v pid="$FLAGEID" '$1 ~ /^[Zz]/ && $2 == pid')
  if [[ -n "$ZOMBIE" ]]; then
    gotErr 4
  fi

  zookeeper_acquisition_json="'cpname':'zookeeper','FLAGEPID':'$FLAGEID'"
  map["FLAGEPID"]="$FLAGEID"
}
# Run every collector and record its result.
# Globals:
#   ServerAddr, IP (read)
#   CURNUM (read after each collector), zookeeper_acquisition_json, map (written)
analysisStat() {
  # These four collectors store their own results.
  getStarttime
  getRuntime
  getMemory
  getCPU

  # Cluster state is only collected when a real cluster address was supplied
  # (not empty and not the unreplaced "<<cluster_ip>>" placeholder).
  if [[ -n "$ServerAddr" && "$ServerAddr" != "<<cluster_ip>>" ]]; then
    getClusterState
    zookeeper_acquisition_json="$zookeeper_acquisition_json,'ClusterState':$CURNUM"
    map["ClusterState"]="$CURNUM"
  fi

  # Nothing else is collected without a usable host address.
  if [[ -z "$IP" || "$IP" == "<<ip>>" ]]; then
    return
  fi

  # "collector:metric" pairs, executed in order. Each collector leaves its
  # result in CURNUM. An entry without a metric name is only executed:
  # getMntr fetches the status overview that the later collectors read.
  local -a plan=(
    getConnNum:ConnectionNum
    getThreadNum:Thread
    getMntr:
    getzkState:zkState
    getBytesReceived:bytesReceived
    getBytesSent:bytesSent
    getzkConnections:zkConnections
    getlatencyAvg:latencyAvg
    getlatencyMin:latencyMin
    getlatencyMax:latencyMax
    getznodeCount:znodeCount
    getoutstandingRequests:outstandingRequests
    getpacketsReceived:packetsReceived
    getpacketsSent:packetsSent
  )
  local step collector metric_name
  for step in "${plan[@]}"; do
    collector=${step%%:*}
    metric_name=${step#*:}
    "$collector"
    if [[ -n "$metric_name" ]]; then
      zookeeper_acquisition_json="$zookeeper_acquisition_json,'$metric_name':$CURNUM"
      map["$metric_name"]="$CURNUM"
    fi
  done
}
# Print the result: one JSON-style object on a single line holding every
# metric listed in keys (in that order) followed by "cpname":"zookeeper".
# All values are emitted as quoted strings.
# Globals: keys, map (read)
analysisResult() {
  local key value
  local -r single_quote="'" double_quote='"'

  # printf '%s' writes the values verbatim. `echo -e` interpreted backslash
  # sequences inside the data, so a value containing "\c" cut the object short.
  printf '{'
  for key in "${keys[@]}"; do
    # Single quotes in a value are turned into double quotes, as before.
    # NOTE(review): a value that really contains a quote therefore yields
    # invalid JSON - confirm whether escaping was intended instead.
    value=${map[$key]}
    value=${value//"$single_quote"/$double_quote}
    printf '"%s":"%s",' "$key" "$value"
  done
  printf '"cpname":"zookeeper"}\n'
}
# Print the usage text (preceded by a "please enter <item>" line when an item
# name is given) and terminate the script with exit status 1.
# Arguments: $1 - optional name of the missing item
showHelp() {
  if [[ -n "$1" ]]; then
    echo "请输入$1"
  fi
  printf '%s\n' \
    "check_zookeeper.sh 可以监听本地的zookeeper的状态 参数如下" \
    "check_zookeeper.sh [-p <port>]" \
    "-I 表示主机IP" \
    "-C 表示zookeeper集群服务地址" \
    "-p <port> 特征字符串 表示监控哪个zookeeper端口"
  exit 1
}
# Command line: -I <host ip>, -p <port>, -C <cluster address>, -h (help).
while getopts "I:p:C:h" arg; do
  case $arg in
    I) IP=$OPTARG ;;
    p) PORT=$OPTARG ;;
    C) ServerAddr=$OPTARG ;;
    # -h and anything getopts rejects both end in the usage text.
    h | ?) showHelp ;;
  esac
done

# The port is mandatory.
if [[ -z "$PORT" ]]; then
  showHelp "特征项"
fi

# Main part of the plugin: confirm the service is alive, collect the metrics,
# print the report.
isAlive
analysisStat
analysisResult