
#! /bin/bash
# NOTE(review): if this shebang is not on the very first line of the file
# (a blank line appears to precede it), the kernel ignores it and the
# script runs under whatever shell invokes it - confirm and move it up.
export LANG=en_US.UTF-8

# ---- Target process ----
FLAGE=""
FLAGEID=""

# ---- Output switches ----
# Whether to output FLAGPID
FLAGEOPID=1
# Whether to output RUNTIME
RUNTIME=1
# Whether to output STARTTIME
STARTTIME=0

# ---- Per-metric switches and limits ----
# Whether to check the connection count
CONN=0
CONNMIN=""
CONNMAX=""
# Whether to check memory
MEMORY=0
MEMORYMIN=""
MEMORYMAX=""
# Whether to check CPU
CPU=0
CPUMIN=""
CPUMAX=""
# Whether to check the thread count
THREADNUM=0
THREADMIN=""
THREADMAX=""
THREADFLAP=0
THREADFLAPMIN=""
THREADFLAPMAX=""
# Whether to use JMX performance metrics
JMX=0
JMXMAX=""
JMXMIN=""

# ---- Restart handling ----
# User the service is restarted as
RESTARTUSER="iflyweb"
# Whether a restart is actually performed
isExcuteRestart=0
# Directory the service runs from
SERVICEDIR=""
RESTARTTHREADLIMITMIN=""
RESTARTTHREADLIMITMAX=""
RESTARTTHREADFLAGLIMIT=""
RETRYCOUNT=""
RETRYTHREADMIN=""
HistoryFlapping=0

# ---- Working state ----
# Current metric value
CURNUM=""
CORENUM=1
# Final warning string
WranResultStr=""
# Final error string
ErrorResultStr=""
# Performance data string
DataResultStr=""
QUOTASTR=""
quota=""

# ---- State file name prefixes ----
TEMPTHREADFILE="./cmd/linux/DUBBO_HISTHREAD_"
TEMPTRYCOUNTFILE="./cmd/linux/DUBBO_RETRYCOUNT_"
TEMPTRYLOGFILE="./cmd/linux/DUBBO_LOG_"
# Print the plugin result for a status code and terminate the script.
# Globals:   DataResultStr (read) - comma-separated 'key':'value' pairs
# Arguments: $1 - status code
#              2     -> print the collected metrics as a JSON-like object, exit 2
#              3     -> print the "metrics unavailable" message, exit 3
#              4     -> print the "zombie process" message, exit 3
#              other -> print nothing, exit with that code
# Outputs:   one line on stdout (none for unknown codes)
# Never returns: every path ends in exit.
function gotErr(){
  local code=$1
  local result
  local sq="'" dq='"'
  if [ "$code" -eq 2 ]; then
    result="{'cpname':'dubbo',$DataResultStr}"
    # Swap single quotes for double quotes without going through an unquoted
    # echo, which would word-split/glob the payload and collapse whitespace
    # inside metric values.
    result=${result//$sq/$dq}
    printf '%s\n' "$result"
  elif [ "$code" -eq 3 ]; then
    echo "无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置"
  elif [ "$code" -eq 4 ]; then
    echo "该组件进程为僵尸进程,请确认并请检查该组件状态"
    # The zombie case is reported with exit status 3, not 4.
    exit 3
  fi
  exit "$code"
}
# Check that the monitored service is alive.
# Globals:   PORT (read); CORENUM, FLAGEID, ZOMBIE (written)
# Behavior:  counts the "physical id" lines of /proc/cpuinfo into CORENUM,
#            resolves the PID of the listener on $PORT into FLAGEID, then
#            calls gotErr 3 when no numeric PID is found, or gotErr 4 when
#            that PID is listed by ps with a state starting with Z/z.
function isAlive(){
  # A missing/unreadable cpuinfo yields 0 instead of an empty value.
  CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null)
  CORENUM=${CORENUM:-0}
  FLAGEID=$(ss -lnpt "( sport == :$PORT )" \
    | tail -n 1 \
    | awk -F " " '{print $6}' \
    | awk -F ',' '{print $2}')
  # Newer ss prints users:(("java",pid=1234,fd=5)); older ones print
  # users:(("java",1234,5)). Accept both by dropping the "pid=" prefix.
  FLAGEID=${FLAGEID#pid=}
  if [[ ! "$FLAGEID" =~ ^[0-9]+$ ]]; then
    gotErr 3
  fi
  # Compare the PID column exactly: a substring match would report PID 1234
  # as a zombie whenever e.g. PID 12345 is one.
  ZOMBIE=$(ps -A -ostat,pid | awk -v pid="$FLAGEID" '$1 ~ /^[Zz]/ && $2 == pid')
  if [[ "$ZOMBIE" != "" ]]; then
    gotErr 4
  fi
  #TEMPTHREADFILE=$TEMPTHREADFILE$FLAGE
  #TEMPTRYCOUNTFILE=$TEMPTRYCOUNTFILE$FLAGE
  #TEMPTRYLOGFILE=$TEMPTRYLOGFILE$FLAGE
  # Read the historical thread count used to compute the flapping rate
  #HistoryThreadNum=`cat $TEMPTHREADFILE|head -n 1`
}
# Connection handling: store in CURNUM the number of lines ss reports for
# established connections with source port $PORT (lines containing
# "Address", i.e. the header, are not counted).
function getConnNum(){
  local established
  established=$(
    ss -oanp state established "( sport == :$PORT )" \
      | grep -v "Address" \
      | wc -l
  )
  CURNUM=$established
}
# Store in CURNUM the number of lines ss reports for connections in the
# close-wait state with source port $PORT (lines containing "Address",
# i.e. the header, are not counted).
function getCloseWaitConnNum(){
  local close_wait
  close_wait=$(
    ss -oanp state close-wait "( sport == :$PORT )" \
      | grep -v "Address" \
      | wc -l
  )
  CURNUM=$close_wait
}
# Store in CURNUM the memory percentage of the monitored process.
# Column 4 of `ps aux` is %MEM; it is summed over the rows whose PID column
# (column 2) equals FLAGEID exactly. A plain `grep $FLAGEID` would also add
# unrelated rows that merely contain the same digits (other PIDs, command
# lines, the grep process itself).
# Globals: FLAGEID (read), CURNUM (written; 0 when no row matches)
function getMemory(){
  CURNUM=$(ps aux | awk -v pid="$FLAGEID" '$2 == pid { sum += $4 } END { print sum + 0 }')
}
# Store in CURNUM the CPU percentage of the monitored process.
# Column 3 of `ps aux` is %CPU; it is summed over the rows whose PID column
# (column 2) equals FLAGEID exactly (a substring grep would also add
# unrelated rows containing the same digits).
# When CORENUM is non-zero the sum is divided by CORENUM and printed with
# two decimals; otherwise the raw sum is reported. The division is done in
# awk, so no external `bc` is required (the value is rounded to two
# decimals).
# Globals: FLAGEID, CORENUM (read), CURNUM (written)
function getCPU(){
  CURNUM=$(ps aux | awk -v pid="$FLAGEID" -v cores="$CORENUM" '
    $2 == pid { sum += $3 }
    END {
      if (cores + 0 != 0) {
        printf "%.2f", sum / cores
      } else {
        print sum + 0
      }
    }')
}
# Count the lines printed by `pstree -p` for FLAGEID and publish the result
# in both THREADNUM and CURNUM.
function getThreadNum() {
  THREADNUM=$(pstree -p $FLAGEID | wc -l)
  CURNUM=$THREADNUM
  # Disabled: persist the count for the flapping calculation
  #echo $THREADNUM > $TEMPTHREADFILE
}
# `ps -eo pid,etime` lists every process with its elapsed running time.
# Store in CURNUM the etime column of the row whose PID equals FLAGEID.
function getRuntime(){
  CURNUM=$(ps -eo pid,etime | awk -v target="$FLAGEID" '$1 == target { print $2 }')
}
# `ps -eo pid,lstart` lists every process with its start timestamp.
# Store in CURNUM fields 2-6 (the five lstart tokens, joined by single
# spaces) of the row whose PID equals FLAGEID.
function getStarttime(){
  CURNUM=$(ps -eo pid,lstart | awk -v target="$FLAGEID" '$1 == target { print $2, $3, $4, $5, $6 }')
}
# Compute the thread-count change relative to the previous sample, as an
# integer percentage: (THREADNUM - HistoryThreadNum) * 100 / HistoryThreadNum.
# Globals: THREADNUM, HistoryThreadNum (read); HistoryFlapping, CURNUM (written)
# NOTE(review): isNumber is not defined in the visible part of this file;
# the calculation only runs when it returns status 1 - confirm that this is
# its "is a number" result.
function getThreadFlapping() {
  isNumber $HistoryThreadNum
  if [[ $? -eq 1 ]]; then
    # A previous count of 0 would divide by zero; keep the last known
    # HistoryFlapping in that case instead of producing an empty value.
    if (( HistoryThreadNum != 0 )); then
      HistoryFlapping=$(( (THREADNUM - HistoryThreadNum) * 100 / HistoryThreadNum ))
    fi
  fi
  CURNUM=$HistoryFlapping
}
# Append every JMX* metric variable to the result string.
# For each metric name, CURNUM is set to the value of the variable with that
# name and parseFileds is called with the name followed by $JMXMIN $JMXMAX
# (left unquoted, so empty limits are not passed at all).
function appendJMXDATA() {
  local metric
  for metric in \
    JMXHeapMemoryUsage \
    JMXNonHeapMemoryUsage \
    JMXThreadCount \
    JMXPeekThreadCount \
    JMXClassLoadCount \
    JMXStartTime \
    JMXISDeadLock
  do
    CURNUM=${!metric}
    parseFileds "$metric" $JMXMIN $JMXMAX
  done
}
# Helper for analysisStat: report a cumulative jstat counter as a per-minute
# rate and remember the current sample for the next run.
# Arguments: $1 - metric name (also used in the state file name)
#            $2 - current counter value
# Globals:   FLAGEID (read), DataResultStr (appended)
# State:     temp_dubbo/dubbo_<metric>_<FLAGEID>.temp holds the previous
#            value on its first line and its epoch timestamp on its last.
function _appendGcRate(){
  local metric=$1 current=$2
  local state_file="temp_dubbo/dubbo_${metric}_${FLAGEID}.temp"
  local previous="" previous_time="" now rate
  # On the first run there is no state file yet; skip reading it instead of
  # letting head/tail print errors.
  if [[ -r "$state_file" ]]; then
    previous=$(head -n 1 "$state_file")
    previous_time=$(tail -n 1 "$state_file")
  fi
  now=$(date +'%s')
  if [[ -n "$previous" && -n "$previous_time" ]]; then
    # rate = delta / (elapsed seconds / 60); nothing is printed when no time
    # has elapsed, which would otherwise be a division by zero.
    rate=$(awk -v cur="$current" -v last="$previous" \
      -v now="$now" -v prev_t="$previous_time" \
      'BEGIN { dt = now - prev_t; if (dt > 0) printf "%.2f", (cur - last) / (dt / 60) }')
    if [[ -n "$rate" ]]; then
      DataResultStr="$DataResultStr ${metric}=${rate};999999;999999"
    fi
  fi
  printf '%s\n%s\n' "$current" "$now" > "$state_file"
}

# Collect every metric through the individual getter functions and append
# each value to DataResultStr.
# Order: ConnectionNum, CloseWaitConnNum, Memory, CPU, ThreadNum, FLAGEPID,
# (optional JMX/jstat data), RUNTIME, STARTTIME.
function analysisStat(){
  getConnNum
  parseFileds "ConnectionNum"
  getCloseWaitConnNum
  parseFileds "CloseWaitConnNum"
  getMemory
  parseFileds "Memory"
  getCPU
  parseFileds "CPU"
  getThreadNum
  parseFileds "ThreadNum"
  #if [ "$THREADFLAP" -eq 1 ];then
  # getThreadFlapping
  # parseFiledFlaps "ThreadFlap" $THREADFLAPMIN $THREADFLAPMAX
  # fi
  CURNUM=$FLAGEID
  parseTimeFileds "FLAGEPID"
  # NOTE(review): JMX is forced to 0 right here, so the JMX/jstat branch
  # below never runs - confirm whether that is intentional.
  JMX=0
  if [ "$JMX" -eq 1 ]; then
    source ./cmd/linux/JMX_Info.sh "$FLAGEID"
    appendJMXDATA
    # Take a single jstat snapshot so all ten columns come from the same
    # sample (and jstat is not launched ten times).
    # NOTE(review): the column-to-name mapping below follows the original
    # code; jstat -gcutil column layout differs between JVM versions - verify.
    local -a gc
    read -r -a gc <<< "$(jstat -gcutil "$FLAGEID" | tail -n 1)"
    local -a gauges=(Survivor0 Survivor1 EdenGen OldGen PermGen)
    local -a rates=(YoungGC YoungGCTime FullGC FullGCTime GCTime)
    local i
    # NOTE(review): these entries use "name=value;999999;999999" rather than
    # the 'key':'value' pairs used for the other metrics - confirm the
    # consumer expects that.
    for i in "${!gauges[@]}"; do
      DataResultStr="$DataResultStr ${gauges[i]}=${gc[i]};999999;999999"
    done
    # -p: the directory already exists on every run after the first.
    mkdir -p temp_dubbo/
    for i in "${!rates[@]}"; do
      _appendGcRate "${rates[i]}" "${gc[i + 5]}"
    done
  fi
  getRuntime
  parseTimeFileds "RUNTIME"
  getStarttime
  parseTimeFileds "STARTTIME"
}
# Append the pair '<name>':'<CURNUM>' to DataResultStr, preceded by a comma
# when DataResultStr already has content.
# Arguments: $1 - field name
function parseTimeFileds(){
  local entry="'$1':'$CURNUM'"
  DataResultStr="${DataResultStr:+${DataResultStr},}${entry}"
}
# Append the current metric to the result string.
# CURNUM holds the current value; the pair '<name>':'<CURNUM>' is added to
# DataResultStr, comma-separated from any earlier entries.
# Arguments: $1 - field name. Any further arguments (callers sometimes pass
#            warning/error limits) are not used by this function.
function parseFileds(){
  local separator=""
  if [[ -n "$DataResultStr" ]]; then
    separator=","
  fi
  DataResultStr="${DataResultStr}${separator}'$1':'$CURNUM'"
}
# Append the pair '<name>':'<CURNUM>' to DataResultStr; a comma separator is
# inserted only when DataResultStr is not empty.
# Arguments: $1 - field name (additional arguments are ignored)
function parseFiledFlaps() {
  if [[ -z "$DataResultStr" ]]; then
    DataResultStr="'$1':'$CURNUM'"
  else
    DataResultStr+=",'$1':'$CURNUM'"
  fi
}
# Final step of the plugin: hand over to gotErr with status 2, the code
# under which gotErr prints the collected metrics.
function analysisResult(){
  local -r report_status=2
  gotErr "$report_status"
}
# Print the usage text (optionally preceded by a "please enter <item>"
# prompt) and terminate the script with exit status 1.
# Arguments: $1 - optional name of the missing item
# Outputs:   usage text on stdout
# The usage lines advertise -p, the option the getopts loop actually
# accepts for selecting the service (the old text documented a
# non-existent -f).
function showHelp(){
  if [ "$1" != "" ]; then
    echo "请输入$1"
  fi
  echo "check_dubbo.sh 可以监听本地的dubbo的状态 参数如下"
  echo "check_dubbo.sh [-p <port>]"
  echo "-p <port> dubbo服务监听端口 表示监控哪个dubbo服务"
  # Help lines for the restart-related options are intentionally not printed
  # (they were disabled in the original):
  #   -d <str>  service directory of the dubbo service
  #   -t <str>  restart thread warning and critical values
  #   -m <str>  lower thread-count bound used for flapping detection
  #   -l <str>  number of checks before a restart is triggered
  #   -a        whether to run the restart logic
  #   -u        user name used for the restart
  exit 1
}
# Restart logic for the dubbo service: decide from the current thread count
# whether the service must be restarted.
#   THREADNUM >= RESTARTTHREADLIMITMAX -> log CRITICAL and restart at once
#   THREADNUM >= RESTARTTHREADLIMITMIN -> log WARNING, add a mark to the
#       counter file and restart once RETRYCOUNT marks have accumulated
#   otherwise                          -> log SAFE and reset the counter file
# Globals: RESTARTTHREADLIMITMIN, RESTARTTHREADLIMITMAX, RETRYCOUNT,
#          THREADNUM, HistoryThreadNum, TEMPTRYLOGFILE, TEMPTRYCOUNTFILE
#          (read); WarnCount (written)
# Returns: 0 without doing anything when any of the four numeric settings is
#          empty, not a plain non-negative integer, or zero.
function serviceRestartCheck(){
  # Validate with a regex before any arithmetic. The previous
  # `[[ $X -eq "" ]]` test arithmetically evaluated the raw option strings,
  # which prints errors on junk and can execute embedded command
  # substitutions (e.g. -l 'x[$(cmd)]').
  local value
  for value in "$RESTARTTHREADLIMITMAX" "$THREADNUM" "$RETRYCOUNT" "$RESTARTTHREADLIMITMIN"; do
    value=${value//[[:space:]]/}
    if [[ ! "$value" =~ ^[0-9]+$ ]] || (( 10#$value == 0 )); then
      return 0
    fi
  done
  # 10# forces base 10 so values with leading zeros are not read as octal.
  local threads=$(( 10#${THREADNUM//[[:space:]]/} ))
  local limit_min=$(( 10#${RESTARTTHREADLIMITMIN//[[:space:]]/} ))
  local limit_max=$(( 10#${RESTARTTHREADLIMITMAX//[[:space:]]/} ))
  local retries=$(( 10#${RETRYCOUNT//[[:space:]]/} ))
  # Record a timestamped entry for this check
  {
    echo ""
    date
    echo "CurrentThread: ${THREADNUM} HistoryThread: ${HistoryThreadNum}"
  } >> "$TEMPTRYLOGFILE"
  # At or above the upper limit the service is restarted immediately
  if (( threads >= limit_max )); then
    echo "CRITICAL!!!" >> "$TEMPTRYLOGFILE"
    restartService
  elif (( threads >= limit_min )); then
    echo "WARNING!!!" >> "$TEMPTRYLOGFILE"
    echo 1 >> "$TEMPTRYCOUNTFILE"
    # Count the restart marks accumulated so far
    WarnCount=$(grep -c 1 "$TEMPTRYCOUNTFILE")
    if (( WarnCount >= retries )); then
      restartService
    fi
  else
    echo "SAFE!!!" >> "$TEMPTRYLOGFILE"
    echo > "$TEMPTRYCOUNTFILE"
  fi
}
# Restart the dubbo service through $SERVICEDIR/bin/restart.sh, run as
# $RESTARTUSER, and record the outcome.
# Does nothing unless isExcuteRestart is 1.
# Globals: isExcuteRestart, RESTARTUSER, SERVICEDIR, FLAGEID,
#          TEMPTRYCOUNTFILE, TEMPTRYLOGFILE (read);
#          RestartLog, IsSuccess, quota, QUOTASTR (written)
# Outputs: the (possibly empty) value of quota on stdout; log lines are
#          appended to $TEMPTRYLOGFILE.
function restartService(){
  if [[ "$isExcuteRestart" != "1" ]]; then
    return 0
  fi
  # Reset the accumulated restart marks before restarting.
  echo > "$TEMPTRYCOUNTFILE"
  echo "CLEAN RESTARTFLAG" >> "$TEMPTRYLOGFILE"
  echo "restart service with port: ${FLAGEID}" >> "$TEMPTRYLOGFILE"
  # Restart the dubbo service. With an empty SERVICEDIR the command would
  # degrade to /bin/restart.sh, so that case is refused and reported as a
  # failed restart.
  if [[ -n "$SERVICEDIR" ]]; then
    RestartLog=$(su - "$RESTARTUSER" -c "$SERVICEDIR/bin/restart.sh")
  else
    RestartLog="SERVICEDIR is not set; restart skipped"
  fi
  printf '%s\n' "$RestartLog" >> "$TEMPTRYLOGFILE"
  echo "" >> "$TEMPTRYLOGFILE"
  # Number of output lines carrying the success marker of restart.sh.
  IsSuccess=$(grep -c "Starting the isservice-provider ....OK" <<< "$RestartLog")
  # Build the alert information
  if [[ ${quota} != "" ]]; then
    quota="${quota},"
  fi
  echo "${quota}"
  if (( IsSuccess != 0 )); then
    QUOTASTR="${QUOTASTR}重启状态=成功 "
  else
    QUOTASTR="${QUOTASTR}重启状态=失败 "
  fi
}
# Split a "min,max" argument into the restart thread limits.
# The first comma-separated field goes to RESTARTTHREADLIMITMIN and the
# second to RESTARTTHREADLIMITMAX; every non-digit character is removed
# from both.
# Arguments: $1 - limits string, e.g. "100,200"
function parseRestartArgs(){
  local low high rest
  IFS=',' read -r low high rest <<< "$1"
  RESTARTTHREADLIMITMIN=${low//[^0-9]/}
  RESTARTTHREADLIMITMAX=${high//[^0-9]/}
}
# ---- Command-line parsing ----
#   -p <port>     port of the monitored service (required)
#   -d <dir>      service directory
#   -t <min,max>  restart thread limits
#   -l <n>        retry count
#   -m <n>        retry thread minimum
#   -a <arg>      enable the restart logic (an argument is required by the
#                 option string; its value is not used)
#   -u <user>     restart user
#   -h            show help
# Anything getopts rejects also ends in showHelp.
while getopts "p:d:t:l:m:a:u:h" arg; do
  case $arg in
    p) PORT="$OPTARG" ;;
    d) SERVICEDIR="$OPTARG" ;;
    t) parseRestartArgs "$OPTARG" ;;
    l) RETRYCOUNT="$OPTARG" ;;
    m) RETRYTHREADMIN="$OPTARG" ;;
    a) isExcuteRestart=1 ;;
    u) RESTARTUSER="$OPTARG" ;;
    h) showHelp ;;
    *) showHelp ;;
  esac
done

# The port is mandatory.
if [[ -z "$PORT" ]]; then
  showHelp "特征项"
fi

# ---- Main flow of the plugin ----
# 1. make sure the service is alive
# 2. collect the metrics
# 3. run the restart check
# 4. print the result
isAlive
analysisStat
serviceRestartCheck
analysisResult