# check_tfs.sh: view TFS status (查看tfs)
#
# Category: sre
#
# (page image: 5fc8cc38f37b48788.jpg_fo742.png)

#! /bin/bash
export LANG=en_US.UTF-8

# Disabled generic thresholds, kept for reference:
#WARNLIMIT=-1      # warning value: exceeding it means an alert is needed
#ERRORLIMIT=-1     # error value: exceeding it is treated as an error

# --- Monitored process / installation ---
FLAGE=""
FLAGEID=""
FLAGEOPID=1        # whether to output FLAGPID
RUNTIME=1          # whether to output RUNTIME
STARTTIME=0        # whether to output STARTTIME
TFSHOME=""
NAMEPORT=""

# --- Per-metric enable flag plus lower/upper limits ---
# Connection count
CONN=0;   CONNMIN="";   CONNMAX=""
# Memory usage
MEMORY=0; MEMORYMIN=""; MEMORYMAX=""
# CPU usage
CPU=0;    CPUMIN="";    CPUMAX=""
# Space usage
USEAGE=0; USEAGEMIN=""; USEAGEMAX=""

CORENUM=1
CURNUM=""          # current value of the metric being processed

# --- Result accumulators ---
WranResultStr=""   # final warning string
ErrorResultStr=""  # final error string
DataResultStr=""   # performance-data string

quota=""
QUOTASTR=""
tfs_acquisition_json=""

#######################################
# Print the final status line and terminate the script.
# Globals read:    quota, QUOTASTR, WranResultStr, ErrorResultStr,
#                  DataResultStr, FLAGEOPID, FLAGEID
# Globals written: CURNUM (status 3 only)
# Arguments:
#   $1 - status: 0 = all metrics normal, 1 = warning, 2 = error,
#        3 = metrics could not be obtained, 4 = zombie process
# Outputs: one "mailstatedes=... statedes=...|<perf data>" line on stdout
# Exits:   3 for status 4, otherwise with the status passed in.
#######################################
function gotErr(){
    local status=$1

    case "${status}" in
        0)
            echo "mailstatedes=各项指标恢复正常 statedes=各项指标恢复正常|${DataResultStr}"
            ;;
        1)
            echo "quota=${quota} mailstatedes=${WranResultStr} statedes=${WranResultStr} ${QUOTASTR} |${DataResultStr}"
            ;;
        2)
            if [[ -n "${WranResultStr}" ]]; then
                # A space separates statedes from QUOTASTR, as in the other
                # branches; without it the two key=value groups run together.
                echo "quota=${quota} mailstatedes=${WranResultStr},${ErrorResultStr} statedes=${WranResultStr},${ErrorResultStr} ${QUOTASTR}|${DataResultStr}"
            else
                echo "quota=${quota} mailstatedes=${ErrorResultStr} statedes=${ErrorResultStr} ${QUOTASTR}|${DataResultStr}"
            fi
            ;;
        3)
            # Record the PID in the perf data before reporting the failure.
            if [[ "${FLAGEOPID}" == 1 ]]; then
                CURNUM=${FLAGEID}
                parseFileds "FLAGEPID" "${CURNUM}" "${CURNUM}"
            fi
            echo "mailstatedes=无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置 statedes=无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置|${DataResultStr}"
            ;;
        4)
            echo "mailstatedes=该组件进程为僵尸进程,请确认并请检查该组件状态 statedes=该组件进程为僵尸进程,请确认并请检查该组件状态|${DataResultStr}"
            exit 3
            ;;
    esac

    # NOTE(review): the damaged source read "exit 1"; every "$1" in this
    # function had lost its "$", so this is restored as "exit $1" - confirm.
    exit "${status:-1}"
}

#######################################
# Locate the monitored nameserver process and record basic host facts.
# Globals read:    TFSHOME
# Globals written: CORENUM - number of "physical id" lines in /proc/cpuinfo
#                            (0 when none are found)
#                  FLAGEID - PID of the first process whose command line
#                            contains "$TFSHOME/nameserver" (empty if none)
#                  ZOMBIE  - "stat pid" line when that PID is in state Z
# The liveness (gotErr 3) and zombie (gotErr 4) checks were commented out in
# the original and remain disabled here; this function only collects data.
#######################################
function isAlive(){
    CORENUM=$(grep -c "physical id" /proc/cpuinfo)
    CORENUM=${CORENUM:-0}

    FLAGEID=$(ps aux | grep -F -- "${TFSHOME}/nameserver" | grep -v grep \
        | awk '{print $2}' | head -n 1)

    # Match the PID column exactly: a substring match would also hit longer
    # PIDs, and an empty PID would match every zombie on the host.
    ZOMBIE=""
    if [[ -n "${FLAGEID}" ]]; then
        ZOMBIE=$(ps -A -ostat,pid \
            | awk -v pid="${FLAGEID}" '$1 ~ /^[Zz]/ && $2 == pid')
    fi
}


#######################################
# Count the ESTABLISHED connections owned by the monitored process.
# Globals read:    FLAGEID
# Globals written: CURNUM (the count; 0 when FLAGEID is empty),
#                  tfs_acquisition_json (appends ",'ConnectionNum':<count>")
#######################################
function getConnNum(){
    CURNUM=0
    if [[ -n "${FLAGEID}" ]]; then
        # Compare against the "PID/Program name" column instead of grepping
        # the whole line, so a port number containing the PID is not counted.
        CURNUM=$(netstat -apn | awk -v pid="${FLAGEID}" '
            $6 == "ESTABLISHED" && index($7, pid "/") == 1 { count++ }
            END { print count + 0 }')
    fi
    tfs_acquisition_json="${tfs_acquisition_json},'ConnectionNum':${CURNUM}"
}

#######################################
# Sum column 4 of `ps aux` (memory) over every process whose line
# contains $TFSHOME.
# Globals read:    TFSHOME
# Globals written: CURNUM (the sum),
#                  tfs_acquisition_json (appends ",'Memory':<sum>")
#######################################
function getMemory(){
    CURNUM=$(ps aux | grep -F -- "${TFSHOME}" \
        | awk '{ sum += $4 } END { print sum + 0 }')
    tfs_acquisition_json="${tfs_acquisition_json},'Memory':${CURNUM}"
}

#######################################
# Sum column 3 of `ps aux` (CPU) over every process whose line contains
# $TFSHOME, then divide by CORENUM when CORENUM is non-zero.
# Globals read:    TFSHOME, CORENUM
# Globals written: CURNUM - sum/CORENUM with two decimals, or the raw sum
#                           when CORENUM is 0,
#                  tfs_acquisition_json (appends ",'CPU':<value>")
# The division is done in awk so the script does not depend on bc; awk
# rounds to two decimals where bc with scale=2 truncated.
#######################################
function getCPU(){
    CURNUM=$(ps aux | grep -F -- "${TFSHOME}" \
        | awk -v cores="${CORENUM}" '
            { sum += $3 }
            END {
                if (cores + 0 != 0) {
                    printf "%.2f", sum / cores
                } else {
                    print sum + 0
                }
            }')
    tfs_acquisition_json="${tfs_acquisition_json},'CPU':${CURNUM}"
}
#######################################
# Read the space usage from the "TOTAL:" line printed by
# "$TFSHOME/ssm -s 127.0.0.1:$NAMEPORT -i server".
# Globals read:    TFSHOME, NAMEPORT
# Globals written: CURNUM - field 5 of the TOTAL: line coerced to a number,
#                  tfs_acquisition_json (appends ",'USEAGE':<value>")
#######################################
function getUSEAGE(){
    # "*100/100" forces awk to treat the field as a number, which drops any
    # non-numeric suffix it carries.
    CURNUM=$("${TFSHOME}/ssm" -s "127.0.0.1:${NAMEPORT}" -i server \
        | grep "TOTAL:" | awk '{ print $5*100/100 }')
    tfs_acquisition_json="${tfs_acquisition_json},'USEAGE':${CURNUM}"
}

#######################################
# Look up the elapsed running time of the monitored process with
# `ps -eo pid,etime`.
# Globals read:    FLAGEID
# Globals written: CURNUM (the etime value; empty when the PID is not found),
#                  tfs_acquisition_json (appends ",'RUNTIME':'<etime>'")
#######################################
function getRuntime(){
    # The PID is handed to awk with -v rather than spliced into the program
    # text, so an empty FLAGEID cannot produce an awk syntax error.
    CURNUM=$(ps -eo pid,etime \
        | awk -v pid="${FLAGEID}" 'pid != "" && $1 == pid { print $2 }')
    tfs_acquisition_json="${tfs_acquisition_json},'RUNTIME':'${CURNUM}'"
}

#######################################
# Look up the start time of the monitored process with
# `ps -eo pid,lstart`.
# Globals read:    FLAGEID
# Globals written: CURNUM - fields 2-6 of the matching line joined by single
#                           spaces (empty when the PID is not found),
#                  tfs_acquisition_json (appends ",'STARTTIME':'<value>'")
#######################################
function getStarttime(){
    # The PID is handed to awk with -v rather than spliced into the program
    # text, so an empty FLAGEID cannot produce an awk syntax error.
    CURNUM=$(ps -eo pid,lstart \
        | awk -v pid="${FLAGEID}" \
            'pid != "" && $1 == pid { print $2, $3, $4, $5, $6 }')
    tfs_acquisition_json="${tfs_acquisition_json},'STARTTIME':'${CURNUM}'"
}
#######################################
# Collect every metric and print the result as one JSON object.
# Calls getConnNum, getMemory, getCPU, getUSEAGE, getRuntime and
# getStarttime, each of which appends a ",'key':value" fragment to
# tfs_acquisition_json.
# Globals read:    tfs_acquisition_json
# Globals written: result (the JSON text; left global as in the original)
# Outputs: {"cpname":"tfs",...} on stdout
# The per-metric switches (CONN, MEMORY, CPU, USEAGE, FLAGEOPID, RUNTIME,
# STARTTIME) and the parseFileds/parseTimeFileds threshold calls were
# commented out in the original; all metrics are collected unconditionally.
#######################################
function analysisStat(){
    local single_quote="'"
    local double_quote='"'

    getConnNum
    getMemory
    getCPU
    getUSEAGE
    getRuntime
    getStarttime

    # "#*," removes the leading comma left by the first appended fragment.
    result="{'cpname':'tfs',${tfs_acquisition_json#*,}}"
    # The fragments are built with single quotes; JSON needs double quotes.
    result=${result//${single_quote}/${double_quote}}
    # Quoted so the JSON is not word-split or glob-expanded on output.
    printf '%s\n' "${result}"
}
#######################################
# Append an informational entry to the performance-data string without
# evaluating any threshold.
# Globals read:    CURNUM
# Globals written: DataResultStr (appends " <name>=<CURNUM>;<$2>;<$3>;0;0")
# Arguments:
#   $1 - metric name
#   $2 - value written to the second slot of the entry
#   $3 - value written to the third slot of the entry
# The original also built two scratch strings (val1, val2) that nothing
# read; they are omitted.
#######################################
function parseTimeFileds(){
    DataResultStr="${DataResultStr} ${1}=${CURNUM};${2};${3};0;0"
}

#######################################
# Compare the current metric value with its warning and error limits and
# record the outcome.
# Globals read:    CURNUM (current value)
# Globals written: DataResultStr (always appends " <name>=<cur>;<warn>;<err>;0;0"),
#                  WranResultStr / ErrorResultStr, quota, QUOTASTR
#                  (only when a limit is exceeded)
# Arguments:
#   $1 - metric name
#   $2 - warning limit
#   $3 - error limit
# Returns: 0 if CURNUM <= warning limit,
#          1 if it exceeds the warning limit but not the error limit,
#          2 otherwise.
#######################################
function parseFileds(){
    local name=$1 warn_limit=$2 error_limit=$3
    local within_warn within_error

    # awk performs the comparison so decimal values are handled.
    within_warn=$(printf '%s %s\n' "${CURNUM}" "${warn_limit}" \
        | awk '{ if ($1 <= $2) print 1; else print 0 }')
    within_error=$(printf '%s %s\n' "${CURNUM}" "${error_limit}" \
        | awk '{ if ($1 <= $2) print 1; else print 0 }')

    DataResultStr="${DataResultStr} ${name}=${CURNUM};${warn_limit};${error_limit};0;0"

    if [[ "${within_warn}" == 1 ]]; then
        return 0
    fi

    # Entries in quota are comma-separated.
    if [[ -n "${quota}" ]]; then
        quota="${quota},"
    fi
    quota="${quota}${name}"

    if [[ "${within_error}" == 1 ]]; then
        if [[ -n "${WranResultStr}" ]]; then
            WranResultStr="${WranResultStr},"
        fi
        WranResultStr="${WranResultStr}${name}当前值为${CURNUM}超过告警值${warn_limit}"
        QUOTASTR="${QUOTASTR}${name}=${name}当前值为${CURNUM}超过告警值${warn_limit} "
        return 1
    fi

    if [[ -n "${ErrorResultStr}" ]]; then
        ErrorResultStr="${ErrorResultStr},"
    fi
    ErrorResultStr="${ErrorResultStr}${name}当前值为${CURNUM}超过紧急值${error_limit}"
    QUOTASTR="${QUOTASTR}${name}=${name}当前值为${CURNUM}超过紧急值${error_limit} "
    return 2
}

#######################################
# Derive the overall status from the accumulated result strings and hand
# it to gotErr.
# Globals read: WranResultStr, ErrorResultStr
# Calls: gotErr 0 when both strings are empty,
#        gotErr 1 when only the warning string is set,
#        gotErr 2 when the error string is set.
#######################################
function analysisResult(){
    if [[ -n "${ErrorResultStr}" ]]; then
        gotErr 2
    elif [[ -n "${WranResultStr}" ]]; then
        gotErr 1
    else
        gotErr 0
    fi
}

#######################################
# Print the usage text and exit with status 1.
# Arguments:
#   $1 - optional description of what the user should supply; when
#        non-empty it is printed first as "请输入<$1>"
# Outputs: usage text on stdout
# NOTE(review): the usage text advertises -w and -c, but the getopts string
# in the main section only accepts b, P and h.
#######################################
function showHelp(){
    if [[ -n "$1" ]]; then
        echo "请输入$1"
    fi
    echo "check_tfs.sh 可以监听本地的tfs的状态 参数如下"
    echo "check_tfs.sh [-w 连接数,CPU,内存,空间使用率] [-c 连接数,CPU,内存,空间使用率] [-b <str>] [-P <str>]"
    echo "-w 表示监控警告的值"
    echo "-c 表示异常值"
    echo "-b <str> tfs的安装目录"
    echo "-P <str> namenode的端口号"
    exit 1
}

#######################################
# Check whether the argument is a non-negative integer or decimal.
# Globals written: isNum (1 = number, 0 = not a number)
# Arguments:
#   $1 - value to check
# Returns: 1 when $1 is a number, 0 when it is not.
#          This is the reverse of the usual shell convention; isLarge
#          relies on it, so it is kept.
#######################################
function isNumber(){
    # Anchored at both ends: the original pattern only anchored the start,
    # so input such as "12abc" was accepted.
    local number_re='^[0-9]+(\.[0-9]*)?$'

    if [[ "$1" =~ ${number_re} ]]; then
        isNum=1
    else
        isNum=0
    fi
    return "${isNum}"
}

#######################################
# Check that both arguments are numbers and that the first is greater
# than or equal to the second.
# Arguments:
#   $1 - value expected to be the larger one
#   $2 - value expected to be the smaller one
# Returns: 0 when both are numbers and $1 >= $2, 1 otherwise.
#######################################
function isLarge(){
    local candidate

    for candidate in "$1" "$2"; do
        # isNumber returns 0 for "not a number" (reversed convention).
        isNumber "${candidate}"
        if [[ $? -eq 0 ]]; then
            return 1
        fi
    done

    # awk compares numerically, so decimals work and "9" is not treated as
    # larger than "10".
    if awk -v a="$1" -v b="$2" 'BEGIN { exit !(a + 0 >= b + 0) }'; then
        return 0
    fi
    return 1
}

#######################################
# Validate one lower/upper limit pair and switch the metric on.
# Arguments:
#   $1 - name of the global flag variable to set to 1 (e.g. CONN)
#   $2 - lower limit
#   $3 - upper limit
# Returns: 1 when both limits are given but isLarge rejects them (or $1 is
#          not a valid variable name), 0 otherwise. The flag is only set
#          when both limits are non-empty and valid.
#######################################
function checkARG(){
    local __flag=$1 __low=$2 __high=$3
    local name_re='^[A-Za-z_][A-Za-z0-9_]*$'

    if [[ -n "${__low}" && -n "${__high}" ]]; then
        if ! isLarge "${__high}" "${__low}"; then
            return 1
        fi
        [[ "${__flag}" =~ ${name_re} ]] || return 1
        # "$1=1" is not an assignment in bash (it runs a command named
        # "<name>=1"); printf -v assigns to the variable named by $1.
        printf -v "${__flag}" '%s' 1
    fi
    return 0
}

#######################################
# Validate the configured limits of every metric and enable the metrics
# whose limits are valid.
# For each of CONN, CPU, MEMORY and USEAGE the globals <NAME>MIN and
# <NAME>MAX are read:
#   - both empty: the metric is skipped;
#   - exactly one empty: invalid;
#   - both set: isLarge must accept MAX >= MIN, then <NAME> is set to 1.
# Returns: 0 when every configured pair is valid, 1 at the first invalid one.
# The original CONN stanza tested CONNMAX twice and never CONNMIN, so a
# missing lower limit was only caught indirectly; all four metrics now go
# through the same checks.
#######################################
function checkALL(){
    local metric min_var max_var low high

    for metric in CONN CPU MEMORY USEAGE; do
        min_var="${metric}MIN"
        max_var="${metric}MAX"
        # Indirect expansion: read the variable whose name is stored in
        # min_var / max_var.
        low=${!min_var}
        high=${!max_var}

        if [[ -z "${low}" && -z "${high}" ]]; then
            continue
        fi
        if [[ -z "${low}" || -z "${high}" ]]; then
            return 1
        fi
        if ! isLarge "${high}" "${low}"; then
            return 1
        fi
        printf -v "${metric}" '%s' 1
    done
    return 0
}

#######################################
# Split a comma-separated limit list into the per-metric limit globals.
# Arguments:
#   $1 - 1 to fill the *MIN (warning) globals, anything else to fill the
#        *MAX globals
#   $2 - list in the order: connections,CPU,memory,space usage
# Globals written: CONNMIN, CPUMIN, MEMORYMIN, USEAGEMIN  (when $1 is 1)
#                  CONNMAX, CPUMAX, MEMORYMAX, USEAGEMAX  (otherwise)
# Every character other than a digit or "." is removed from each field.
# The original filter also removed ".", turning 80.5 into 805, although
# isNumber and the note above checkALL allow decimal limits.
#######################################
function parseARG(){
    local suffix conn cpu memory useage _rest

    if [ "$1" -eq 1 ] 2>/dev/null; then
        suffix=MIN      # warning limits
    else
        suffix=MAX
    fi

    # _rest absorbs any fields beyond the fourth.
    IFS=',' read -r conn cpu memory useage _rest <<< "$2"

    printf -v "CONN${suffix}"   '%s' "${conn//[^0-9.]/}"
    printf -v "CPU${suffix}"    '%s' "${cpu//[^0-9.]/}"
    printf -v "MEMORY${suffix}" '%s' "${memory//[^0-9.]/}"
    printf -v "USEAGE${suffix}" '%s' "${useage//[^0-9.]/}"
}

# ---------------------------------------------------------------------------
# Entry point.
# Options: -b <dir>  TFS installation directory (default /usr/local/tfs)
#          -P <port> nameserver port            (default 9108)
#          -h        print usage and exit
# ---------------------------------------------------------------------------
while getopts "b:P:h" arg
do
    case "${arg}" in
        b)
            TFSHOME=${OPTARG}
            ;;
        P)
            NAMEPORT=${OPTARG}
            ;;
        h)
            showHelp
            ;;
        *)
            # getopts reports an unknown option as "?".
            showHelp
            ;;
    esac
done

# Fall back to the defaults when an option was not given.
if [[ -z "${TFSHOME}" ]]; then
    TFSHOME="/usr/local/tfs"
fi
if [[ -z "${NAMEPORT}" ]]; then
    NAMEPORT="9108"
fi

# Limit validation is disabled, as in the original. The original left an
# "if [[ $? == 1 ]]" behind the commented-out checkALL call; with the call
# disabled it tested the status of the NAMEPORT check above and could never
# fire, so it is commented out together with the call.
#if ! checkALL; then
#    showHelp "正确参数"
#fi

# Main part of the plugin: find the process, then collect and print the
# metrics. Threshold analysis (analysisResult) is disabled.
isAlive
analysisStat
#analysisResult

# (web page residue, not part of the script)
# LEAVE A COMMENT
#
# Captcha Code