#! /bin/bash
# Global configuration and shared state for the dubbo monitoring plugin.
# Every value below is a plain global that the functions in this file read
# and/or overwrite.
export LANG="en_US.UTF-8"

# --- Monitored process -------------------------------------------------------
FLAGE=""
FLAGEID=""
FLAGEOPID=1                  # whether to output FLAGPID
RUNTIME=1                    # whether to output RUNTIME
STARTTIME=0                  # whether to output STARTTIME

# --- Per-metric switches and limits ------------------------------------------
CONN=0                       # whether to check the connection count
CONNMIN=""
CONNMAX=""

MEMORY=0                     # whether to check memory
MEMORYMIN=""
MEMORYMAX=""

CPU=0                        # whether to check CPU
CPUMIN=""
CPUMAX=""

THREADNUM=0                  # whether to check the thread count
THREADMIN=""
THREADMAX=""

THREADFLAP=0
THREADFLAPMIN=""
THREADFLAPMAX=""

JMX=0                        # whether to use JMX performance parameters
JMXMAX=""
JMXMIN=""

# --- Restart handling --------------------------------------------------------
RESTARTUSER='iflyweb'        # user that performs the restart
isExcuteRestart=0            # whether a restart is carried out
SERVICEDIR=""                # service execution directory
RESTARTTHREADLIMITMIN=""
RESTARTTHREADLIMITMAX=""
RESTARTTHREADFLAGLIMIT=""
RETRYCOUNT=""
RETRYTHREADMIN=""
HistoryFlapping=0

# --- Working values ----------------------------------------------------------
CURNUM=""                    # current performance value
CORENUM=1

# --- Result accumulators -----------------------------------------------------
WranResultStr=""             # warning string assembled at the end
ErrorResultStr=""            # error string assembled at the end
DataResultStr=""             # performance-data string

QUOTASTR=""
quota=""

# --- State files (paths are relative to the working directory) ----------------
TEMPTHREADFILE='./cmd/linux/DUBBO_HISTHREAD_'
TEMPTRYCOUNTFILE='./cmd/linux/DUBBO_RETRYCOUNT_'
TEMPTRYLOGFILE='./cmd/linux/DUBBO_LOG_'

# Prints the plugin's final output for a status code and terminates the script.
# Globals:   DataResultStr (read) - comma-separated 'key':'value' pairs
# Arguments: $1 - status code:
#                 2  print the collected metrics as a JSON object
#                 3  print the "metrics unavailable" message
#                 4  print the "zombie process" message
# Outputs:   the message/JSON on stdout
# Exits:     3 for code 4, otherwise 1 (including the metrics path, code 2).
function gotErr() {
    local code=${1:-}
    local single="'" double='"'
    local result

    case "$code" in
        2)
            result="{'cpname':'dubbo',$DataResultStr}"
            # The pairs are accumulated with single quotes; JSON needs double.
            result=${result//"$single"/$double}
            printf '%s\n' "$result"
            ;;
        3)
            echo "无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置"
            ;;
        4)
            echo "该组件进程为僵尸进程,请确认并请检查该组件状态"
            exit 3
            ;;
    esac

    exit 1
}

# Checks whether the service is alive.
# Looks up the process listening on TCP port $PORT and stores its PID in
# FLAGEID; also refreshes CORENUM from /proc/cpuinfo.
# Globals:   PORT (read), CORENUM (written), FLAGEID (written), ZOMBIE (written)
# Exits:     via `gotErr 3` when no PID can be found for the port,
#            via `gotErr 4` when that PID is listed by ps in a Z state.
function isAlive() {
    # Number of "physical id" lines in /proc/cpuinfo; 0 when none / unreadable.
    CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null) || CORENUM=0

    # Field 6 of the last ss line is the users:(("name",PID,fd)) column; the
    # PID is its second comma-separated item. Newer ss prints "pid=PID", so
    # everything that is not a digit is stripped.
    FLAGEID=$(ss -lnpt "( sport == :$PORT )" \
        | tail -n 1 \
        | awk '{print $6}' \
        | awk -F ',' '{print $2}' \
        | sed 's/[^0-9]//g')
    if [[ -z "$FLAGEID" ]]; then
        gotErr 3
    fi

    # Exact PID comparison: a substring grep would also hit e.g. PID 12345
    # when looking for PID 1234.
    ZOMBIE=$(ps -A -o stat,pid | awk -v pid="$FLAGEID" '$1 ~ /^[Zz]/ && $2 == pid')
    if [[ -n "$ZOMBIE" ]]; then
        gotErr 4
    fi

    # Disabled: per-service suffix for the state files, and loading the
    # previous thread count used for the fluctuation rate.
    #TEMPTHREADFILE=$TEMPTHREADFILE$FLAGE
    #TEMPTRYCOUNTFILE=$TEMPTRYCOUNTFILE$FLAGE
    #TEMPTRYLOGFILE=$TEMPTRYLOGFILE$FLAGE
    #HistoryThreadNum=`cat $TEMPTHREADFILE|head -n 1`
}


# Connection count: number of established connections whose source port is
# $PORT, stored in CURNUM.
# Globals: PORT (read), CURNUM (written)
function getConnNum() {
    # Lines containing "Address" (the ss header) are not counted.
    CURNUM=$(ss -oanp state established "( sport == :$PORT )" | grep -vc "Address")
}

# Stores in CURNUM the number of connections in the close-wait state whose
# source port is $PORT.
# Globals: PORT (read), CURNUM (written)
function getCloseWaitConnNum() {
    # Lines containing "Address" (the ss header) are not counted.
    CURNUM=$(ss -oanp state close-wait "( sport == :$PORT )" | grep -vc "Address")
}

# Memory usage: sums column 4 of `ps aux` for the monitored PID into CURNUM.
# Rows are selected by an exact match on the PID column (column 2); a plain
# grep for the PID would also match other PIDs or command lines that merely
# contain the same digits.
# Globals: FLAGEID (read), CURNUM (written; 0 when no row matches)
function getMemory() {
    CURNUM=$(ps aux | awk -v pid="$FLAGEID" '$2 == pid {sum += $4} END {print sum + 0}')
}

# CPU usage: sums column 3 of `ps aux` for the monitored PID and, when CORENUM
# is a positive integer, divides it by CORENUM.
# Rows are selected by an exact match on the PID column (column 2).
# Globals: FLAGEID (read), CORENUM (read), CURNUM (written)
# Note:    the per-core figure is formatted with two decimals and is rounded;
#          when CORENUM is 0/empty the raw sum is left in CURNUM unformatted.
function getCPU() {
    CURNUM=$(ps aux | awk -v pid="$FLAGEID" '$2 == pid {sum += $3} END {print sum + 0}')
    if [[ "$CORENUM" =~ ^0*[1-9][0-9]*$ ]]; then
        # awk does the division so the script does not depend on bc.
        CURNUM=$(awk -v total="$CURNUM" -v cores="$CORENUM" \
            'BEGIN {printf "%.2f", total / cores}')
    fi
}

# Counts the lines printed by `pstree -p` for the monitored PID and stores the
# count in both CURNUM and THREADNUM.
# Globals: FLAGEID (read), CURNUM (written), THREADNUM (written)
function getThreadNum() {
    CURNUM=$(pstree -p "$FLAGEID" | awk 'END {print NR}')
    THREADNUM=$CURNUM
    # Disabled: persisting the count for the fluctuation calculation.
    #echo $THREADNUM>$TEMPTHREADFILE
}

# Process run time: stores in CURNUM the etime column that `ps -eo pid,etime`
# reports for the monitored PID.
# Globals: FLAGEID (read), CURNUM (written; empty when the PID is not listed)
function getRuntime() {
    # The PID is handed to awk with -v rather than spliced into the program.
    CURNUM=$(ps -eo pid,etime | awk -v pid="$FLAGEID" '$1 == pid {print $2}')
}

# Process start time: stores in CURNUM fields 2-6 of the `ps -eo pid,lstart`
# row for the monitored PID, joined by single spaces.
# Globals: FLAGEID (read), CURNUM (written; empty when the PID is not listed)
function getStarttime() {
    # The PID is handed to awk with -v rather than spliced into the program.
    CURNUM=$(ps -eo pid,lstart \
        | awk -v pid="$FLAGEID" '$1 == pid {print $2, $3, $4, $5, $6}')
}

# Computes the thread-count change relative to the previous sample, in whole
# percent: (THREADNUM - HistoryThreadNum) * 100 / HistoryThreadNum.
# Globals: THREADNUM (read), HistoryThreadNum (read),
#          HistoryFlapping (written when computable), CURNUM (written)
# The calculation only runs when both counts are non-negative integers and the
# previous count is non-zero; otherwise HistoryFlapping keeps its prior value.
# NOTE(review): the numeric check is done inline here; no isNumber helper is
# defined in this file.
function getThreadFlapping() {
    if [[ "$THREADNUM" =~ ^[0-9]+$ && "$HistoryThreadNum" =~ ^[0-9]+$ ]] \
        && (( 10#$HistoryThreadNum != 0 )); then
        # 10# forces base 10 so a leading zero is not read as octal.
        HistoryFlapping=$(( (10#$THREADNUM - 10#$HistoryThreadNum) * 100 / 10#$HistoryThreadNum ))
    fi
    CURNUM=$HistoryFlapping
}

# Appends the JMX values to the result string: for each metric name below,
# the variable of the same name is copied into CURNUM and parseFileds is
# called with the name plus JMXMIN/JMXMAX.
# Globals: JMXHeapMemoryUsage, JMXNonHeapMemoryUsage, JMXThreadCount,
#          JMXPeekThreadCount, JMXClassLoadCount, JMXStartTime, JMXISDeadLock,
#          JMXMIN, JMXMAX (read); CURNUM (written)
# NOTE(review): the JMX* values are not set in this file — presumably by the
# sourced JMX_Info.sh; confirm.
function appendJMXDATA() {
    local metric
    for metric in \
        JMXHeapMemoryUsage \
        JMXNonHeapMemoryUsage \
        JMXThreadCount \
        JMXPeekThreadCount \
        JMXClassLoadCount \
        JMXStartTime \
        JMXISDeadLock; do
        # Indirect expansion: the value of the variable named by $metric.
        CURNUM=${!metric}
        parseFileds "$metric" "$JMXMIN" "$JMXMAX"
    done
}
# Helper: appends a per-minute rate for one cumulative jstat counter.
# Arguments: $1 - metric name (also used in the state-file name)
#            $2 - current counter value
# Globals:   FLAGEID (read), DataResultStr (appended to)
# The previous value and its timestamp are read from
# temp_dubbo/dubbo_<name>_<pid>.temp (line 1 = value, last line = epoch
# seconds); the file is then rewritten with the current sample. Nothing is
# appended on the first run or when no time has elapsed.
function _appendGcRate() {
    local metric=$1 current=$2
    local stateFile="temp_dubbo/dubbo_${metric}_${FLAGEID}.temp"
    local lastNum="" lastTime="" now rate

    if [[ -r "$stateFile" ]]; then
        lastNum=$(head -n 1 "$stateFile")
        lastTime=$(tail -n 1 "$stateFile")
    fi
    now=$(date +'%s')

    if [[ -n "$lastNum" && -n "$lastTime" ]]; then
        rate=$(awk -v cur="$current" -v prev="$lastNum" -v now="$now" -v before="$lastTime" \
            'BEGIN { dt = now - before; if (dt > 0) printf "%.2f", (cur - prev) / (dt / 60) }')
        if [[ -n "$rate" ]]; then
            DataResultStr="$DataResultStr ${metric}=${rate};999999;999999"
        fi
    fi

    printf '%s\n%s\n' "$current" "$now" > "$stateFile"
}

# Collects every metric through the get* functions and appends each one to
# DataResultStr through the parse* functions, in a fixed order:
# ConnectionNum, CloseWaitConnNum, Memory, CPU, ThreadNum, FLAGEPID,
# (optional JMX/jstat data), RUNTIME, STARTTIME.
# Globals: FLAGEID (read), CURNUM (written), JMX (forced to 0),
#          DataResultStr (appended to)
function analysisStat() {
    getConnNum
    parseFileds "ConnectionNum"

    getCloseWaitConnNum
    parseFileds "CloseWaitConnNum"

    getMemory
    parseFileds "Memory"

    getCPU
    parseFileds "CPU"

    getThreadNum
    parseFileds "ThreadNum"

    # Disabled: thread fluctuation metric.
    #if [ "$THREADFLAP" -eq 1 ];then
    #    getThreadFlapping
    #    parseFiledFlaps "ThreadFlap" $THREADFLAPMIN $THREADFLAPMAX
    #fi

    CURNUM=$FLAGEID
    parseTimeFileds "FLAGEPID"

    # JMX collection is switched off unconditionally here, so the branch
    # below does not run. The override is kept as found.
    JMX=0
    if [[ "$JMX" -eq 1 ]]; then
        source ./cmd/linux/JMX_Info.sh "$FLAGEID"
        appendJMXDATA

        # One jstat call; the last output line is split into fields.
        # NOTE(review): the positions below assume the 10-column -gcutil
        # layout (S0 S1 E O P YGC YGCT FGC FGCT GCT) — confirm for the JDK
        # in use.
        local gcLine idx
        local -a gcFields=()
        local -a gaugeNames=(Survivor0 Survivor1 EdenGen OldGen PermGen)
        local -a rateNames=(YoungGC YoungGCTime FullGC FullGCTime GCTime)

        gcLine=$(jstat -gcutil "$FLAGEID" | tail -n 1)
        read -r -a gcFields <<< "$gcLine"

        # Columns 1-5 are reported as-is.
        for idx in "${!gaugeNames[@]}"; do
            CURNUM=${gcFields[idx]}
            DataResultStr="$DataResultStr ${gaugeNames[idx]}=$CURNUM;999999;999999"
        done

        # -p: the directory normally already exists after the first run.
        mkdir -p temp_dubbo

        # Columns 6-10 are reported as change per minute since the last run.
        for idx in "${!rateNames[@]}"; do
            CURNUM=${gcFields[idx + 5]}
            _appendGcRate "${rateNames[idx]}" "$CURNUM"
        done
    fi

    getRuntime
    parseTimeFileds "RUNTIME"

    getStarttime
    parseTimeFileds "STARTTIME"
}
# Appends one 'name':'value' pair to DataResultStr, separated from any
# existing content by a comma.
# Globals:   CURNUM (read) - the value; DataResultStr (appended to)
# Arguments: $1 - field name
function parseTimeFileds() {
    # ${var:+...} emits the existing text plus a comma only when non-empty.
    DataResultStr="${DataResultStr:+${DataResultStr},}'$1':'$CURNUM'"
}

# Appends one 'name':'value' pair to DataResultStr, separated from any
# existing content by a comma.
# Globals:   CURNUM (read) - the current value; DataResultStr (appended to)
# Arguments: $1 - field name
# Any further arguments (callers pass warning/error limits) are accepted but
# not evaluated here.
function parseFileds() {
    # ${var:+...} emits the existing text plus a comma only when non-empty.
    DataResultStr="${DataResultStr:+${DataResultStr},}'$1':'$CURNUM'"
}

# Appends one 'name':'value' pair to DataResultStr, separated from any
# existing content by a comma.
# Globals:   CURNUM (read) - the value; DataResultStr (appended to)
# Arguments: $1 - field name; further arguments are accepted but not used.
function parseFiledFlaps() {
    # ${var:+...} emits the existing text plus a comma only when non-empty.
    DataResultStr="${DataResultStr:+${DataResultStr},}'$1':'$CURNUM'"
}

# Final step: hands over to gotErr with status code 2.
analysisResult() {
    gotErr "2"
}

# Prints an optional "please enter <item>" line followed by the usage text,
# then exits the program with status 1.
# Arguments: $1 - (optional) name of the missing item
# Outputs:   usage text on stdout
# The usage text names -p, the option the getopts loop requires; -f is not
# an accepted option.
function showHelp() {
    if [[ -n "${1:-}" ]]; then
        echo "请输入$1"
    fi
    echo "check_dubbo.sh 可以监听本地的dubbo的状态 参数如下"
    echo "check_dubbo.sh -p <port>"
    echo "-p <port> 端口号 表示监控哪个端口上的dubbo服务"
    # Options that exist in the getopts string but are not advertised:
    #  echo "-d <str> dubbo服务执行目录"
    #  echo "-t <str> dubbo服务重启线程告警值和紧急值"
    #  echo "-m <str> 波动dubbo服务判定的线程数下限值"
    #  echo "-l <str> dubbo服务重启判断次数"
    #  echo "-a 表示是否进行重启逻辑"
    #  echo "-u 表示重启的用户名"
    exit 1
}

# Restart decision for the dubbo service, based on the current thread count.
#   THREADNUM >= RESTARTTHREADLIMITMAX  -> log CRITICAL, restart immediately
#   THREADNUM >= RESTARTTHREADLIMITMIN  -> log WARNING, add a mark to the
#                                          count file; restart once the number
#                                          of marks reaches RETRYCOUNT
#   otherwise                           -> log SAFE, reset the count file
# Globals: RESTARTTHREADLIMITMAX, RESTARTTHREADLIMITMIN, THREADNUM, RETRYCOUNT,
#          HistoryThreadNum, TEMPTRYLOGFILE, TEMPTRYCOUNTFILE (read);
#          WarnCount (written)
# Returns: 0 without doing anything when any of the four numeric settings is
#          empty, zero or not a number.
function serviceRestartCheck() {
    # First decide whether the restart check applies at all.
    local value
    for value in "$RESTARTTHREADLIMITMAX" "$THREADNUM" "$RETRYCOUNT" "$RESTARTTHREADLIMITMIN"; do
        if ! [[ "$value" =~ ^[0-9]+$ ]] || (( 10#$value == 0 )); then
            return 0
        fi
    done

    # 10# forces base 10 so a leading zero is not read as octal.
    local threads=$(( 10#$THREADNUM ))
    local limitMax=$(( 10#$RESTARTTHREADLIMITMAX ))
    local limitMin=$(( 10#$RESTARTTHREADLIMITMIN ))
    local retries=$(( 10#$RETRYCOUNT ))

    # Record the time and the sample.
    {
        echo ""
        date
        echo "CurrentThread: ${THREADNUM}       HistoryThread: ${HistoryThreadNum}"
    } >> "$TEMPTRYLOGFILE"

    if (( threads >= limitMax )); then
        # At or above the upper limit: restart right away.
        echo "CRITICAL!!!" >> "$TEMPTRYLOGFILE"
        restartService
    elif (( threads >= limitMin )); then
        echo "WARNING!!!" >> "$TEMPTRYLOGFILE"
        echo 1 >> "$TEMPTRYCOUNTFILE"
        # Count the marks collected so far.
        WarnCount=$(grep -c 1 "$TEMPTRYCOUNTFILE")
        if (( WarnCount >= retries )); then
            restartService
        fi
    else
        echo "SAFE!!!" >> "$TEMPTRYLOGFILE"
        echo > "$TEMPTRYCOUNTFILE"
    fi
}
# Restarts the service through $SERVICEDIR/bin/restart.sh, run as
# $RESTARTUSER, and records the outcome. Does nothing unless
# isExcuteRestart is 1.
# Globals: isExcuteRestart, RESTARTUSER, SERVICEDIR, FLAGEID, TEMPTRYLOGFILE,
#          TEMPTRYCOUNTFILE (read); RestartLog, IsSuccess (written);
#          quota, QUOTASTR (appended to)
# Outputs: the current value of quota on stdout.
# NOTE(review): quota is not set anywhere else in this file, so this prints an
# empty line ahead of the plugin's final output — confirm whether it is wanted.
function restartService() {
    if [[ "$isExcuteRestart" != "1" ]]; then
        return 0
    fi

    # Reset the warning marks before restarting.
    echo > "$TEMPTRYCOUNTFILE"
    {
        echo "CLEAN RESTARTFLAG"
        echo "restart service with port: ${FLAGEID}"
    } >> "$TEMPTRYLOGFILE"

    # Restart the dubbo service and keep everything the script printed.
    RestartLog=$(su - "$RESTARTUSER" -c "$SERVICEDIR/bin/restart.sh")
    {
        printf '%s\n' "$RestartLog"
        echo ""
    } >> "$TEMPTRYLOGFILE"

    # Success is recognised by this line in the restart script's output.
    IsSuccess=$(printf '%s\n' "$RestartLog" \
        | grep -c "Starting the isservice-provider ....OK")

    # Build the alert information.
    if [[ -n "${quota}" ]]; then
        quota="${quota},"
    fi
    echo "${quota}"

    if [[ "$IsSuccess" -ne 0 ]]; then
        QUOTASTR="${QUOTASTR}重启状态=成功 "
    else
        QUOTASTR="${QUOTASTR}重启状态=失败 "
    fi
}

# Splits a "min,max" argument into the restart thread limits. Every character
# that is not a digit is dropped from each part.
# Arguments: $1 - comma-separated pair, e.g. "300,400"
# Globals:   RESTARTTHREADLIMITMIN, RESTARTTHREADLIMITMAX (written; empty
#            when the corresponding part is missing)
function parseRestartArgs() {
    local lower="" upper="" rest=""
    # Parts after the second comma land in $rest and are ignored.
    IFS=',' read -r lower upper rest <<< "${1:-}"
    RESTARTTHREADLIMITMIN=${lower//[^0-9]/}
    RESTARTTHREADLIMITMAX=${upper//[^0-9]/}
}

# Command-line options:
#   -p <port>     port of the monitored service (required)
#   -d <dir>      service directory (used for <dir>/bin/restart.sh)
#   -t <min,max>  restart thread limits, parsed by parseRestartArgs
#   -l <n>        stored in RETRYCOUNT
#   -m <n>        stored in RETRYTHREADMIN
#   -a <arg>      enable the restart logic (the option takes an argument,
#                 whose value is not used)
#   -u <user>     user that runs the restart script
#   -h            show the help text
while getopts "p:d:t:l:m:a:u:h" arg; do
    case "$arg" in
        t)
            parseRestartArgs "$OPTARG"
            ;;
        d)
            SERVICEDIR=$OPTARG
            ;;
        p)
            PORT=$OPTARG
            ;;
        l)
            RETRYCOUNT=$OPTARG
            ;;
        m)
            RETRYTHREADMIN=$OPTARG
            ;;
        a)
            isExcuteRestart=1
            ;;
        h)
            showHelp
            ;;
        u)
            RESTARTUSER=$OPTARG
            ;;
        *)
            # getopts reports unknown options and missing arguments as "?".
            showHelp
            ;;
    esac
done

if [[ -z "${PORT:-}" ]]; then
    showHelp "特征项"
fi

# Main flow of the plugin:
#   1. make sure the service is alive and find its PID
#   2. collect the metrics
#   3. run the restart check
#   4. print the result and exit
isAlive

analysisStat
serviceRestartCheck
analysisResult

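# NOTE: the lines below are web-page residue (a blog comment form) and are not
# part of the script; they are kept as comments so they cannot run as commands.
# Leave a comment
#
# Your e-mail address will not be published. Required fields are marked with *
#
# Captcha Code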