check_flume.sh查看flume状态

  sre

DHBZDJ.jpg

#!/bin/bash
# Pin LANG for this script and every command it runs.
LANG=en_US.UTF-8
export LANG

# --- Process identification -------------------------------------------------
FLAGE=""
FLAGEID=""

# --- Output switches --------------------------------------------------------
# Whether to output FLAGPID
FLAGEOPID=1
# Whether to output RUNTIME
RUNTIME=1
# Whether to output STARTTIME
STARTTIME=0

# --- Check switches and their limits ----------------------------------------
# Whether to check the connection count
CONN=0
CONNMIN=""
CONNMAX=""
# Whether to check memory
MEMORY=0
MEMORYMIN=""
MEMORYMAX=""
# Whether to check CPU
CPU=0
CPUMIN=""
CPUMAX=""
CORENUM=1

# --- Working state ----------------------------------------------------------
# Current value of the metric being processed
CURNUM=""
PORT=""
# Final warning string
WranResultStr=""
# Final error string
ErrorResultStr=""
# Accumulated metric (performance parameter) string
DataResultStr=""

QUOTASTR=""
quota=""

#######################################
# Print the final plugin output for a result code and terminate the script.
# Globals:   DataResultStr (read) - comma-separated 'key':'value' entries
# Arguments: $1 - result code:
#              2 - emit the collected metrics as one JSON object
#              3 - metrics could not be collected
#              4 - the monitored process is a zombie
# Outputs:   one line on stdout (nothing for any other code)
# Returns:   never returns; exits 3 for code 4 and 1 for everything else
#######################################
function gotErr(){
    local result

    # String match instead of `[ -eq ]` so a missing argument does not raise
    # "integer expression expected".
    case "$1" in
        2)
            result="{'cpname':'flume',$DataResultStr}"
            # Entries are assembled with single quotes; turn them into the
            # double quotes JSON requires.
            result=${result//\'/\"}
            # Quoted so whitespace inside values is emitted unchanged.
            printf '%s\n' "$result"
            ;;
        3)
            echo "无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置"
            ;;
        4)
            echo "该组件进程为僵尸进程,请确认并请检查该组件状态"
            exit 3
            ;;
    esac

    # NOTE(review): the metrics path (code 2) also ends with exit status 1;
    # kept as-is because the consumer of this plugin may rely on it - confirm.
    exit 1
}

#######################################
# Check that a process is listening on PORT and that it is not a zombie.
# Globals:   PORT (read); CORENUM, FLAGEID, ZOMBIE (written)
# Arguments: none
# Outputs:   nothing itself; gotErr prints the failure message
# Returns:   0 when the process is alive; otherwise gotErr ends the script
#            (gotErr 3 when no PID is found, gotErr 4 for a zombie)
#######################################
function isAlive(){
    # Number of "physical id" lines in /proc/cpuinfo; 0 when the file or the
    # field is absent.
    CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null)
    CORENUM=${CORENUM:-0}

    # The 6th column of the last `ss` row is expected to look like
    # users:(("java",pid=1234,fd=5)); its 2nd comma-separated item carries
    # the PID.
    FLAGEID=$(ss -lnpt "( sport == :$PORT )" | tail -n 1 \
        | awk '{print $6}' | awk -F ',' '{print $2}')
    # Depending on the iproute2 version the item is "pid=1234" or a bare
    # "1234"; normalise to the bare number.
    FLAGEID=${FLAGEID#pid=}
    if [[ ! "$FLAGEID" =~ ^[0-9]+$ ]]; then
        gotErr 3
    fi

    # Compare the PID column exactly: a substring grep would let PID 4321
    # match an unrelated zombie with PID 43210.
    ZOMBIE=$(ps -A -o stat,pid \
        | awk -v pid="$FLAGEID" '$2 == pid && $1 ~ /^[Zz]/')
    if [[ -n "$ZOMBIE" ]]; then
        gotErr 4
    fi
}

#######################################
# Count established connections whose local (source) port is PORT.
# Globals:   PORT (read); CURNUM (written)
# Arguments: none
#######################################
function getConnNum(){
    # `grep -vc "Address"` counts every row except the column-header row,
    # which is the line expected to contain "Address".
    CURNUM=$(ss -oanp state established "( sport == :$PORT )" \
        | grep -vc "Address")
}

#######################################
# Read the memory percentage of the monitored process.
# Column 4 of `ps aux` is %MEM; it is summed over the rows whose PID column
# equals FLAGEID.
# Globals:   FLAGEID (read); CURNUM (written, 0 when no row matches)
# Arguments: none
#######################################
function getMemory(){
    # Exact PID comparison: a substring grep would also add up any other
    # process whose row merely contains the same digits.
    CURNUM=$(ps aux \
        | awk -v pid="$FLAGEID" '$2 == pid { sum += $4 } END { print sum + 0 }')
}

#######################################
# Read the CPU percentage of the monitored process.
# Column 3 of `ps aux` is %CPU; it is summed over the rows whose PID column
# equals FLAGEID. When CORENUM is a positive number the sum is divided by it
# and printed with two decimals; otherwise the raw sum is kept.
# Globals:   FLAGEID, CORENUM (read); CURNUM (written)
# Arguments: none
#######################################
function getCPU(){
    # The division is done inside awk, so neither `bc` nor an integer test on
    # a possibly empty CORENUM is needed. awk rounds to two decimals.
    CURNUM=$(ps aux | awk -v pid="$FLAGEID" -v cores="${CORENUM:-0}" '
        $2 == pid { sum += $3 }
        END {
            if (cores + 0 > 0)
                printf "%.2f", sum / cores
            else
                print sum + 0
        }')
}

#######################################
# Read how long the monitored process has been running.
# `ps -eo pid,etime` lists each PID with its elapsed time; the elapsed-time
# column of the row whose PID equals FLAGEID is stored.
# Globals:   FLAGEID (read); CURNUM (written, empty when no row matches)
# Arguments: none
#######################################
function getRuntime(){
    # The PID is handed to awk with -v instead of being spliced into the
    # program text.
    CURNUM=$(ps -eo pid,etime | awk -v pid="$FLAGEID" '$1 == pid { print $2 }')
}

#######################################
# Read the start time of the monitored process.
# `ps -eo pid,lstart` lists each PID followed by its start timestamp; the
# five fields after the PID are stored, joined by single spaces.
# Globals:   FLAGEID (read); CURNUM (written, empty when no row matches)
# Arguments: none
#######################################
function getStarttime(){
    # The PID is handed to awk with -v instead of being spliced into the
    # program text.
    CURNUM=$(ps -eo pid,lstart \
        | awk -v pid="$FLAGEID" '$1 == pid { print $2, $3, $4, $5, $6 }')
}

#######################################
# Measure the data backlog as the size of LOG_PATH.
# The largest byte count reported by `du -b` is divided by 1024 three times
# and stored with two decimals.
# Globals:   LOG_PATH (read); CURNUM (written, empty when LOG_PATH is empty)
# Arguments: none
#######################################
function getBacklogData(){
    if [[ -z "$LOG_PATH" ]]; then
        CURNUM=
        return
    fi

    # Quoted and preceded by `--` so paths with spaces or a leading dash are
    # passed to du as a single operand.
    CURNUM=$(du -b -- "$LOG_PATH" | sort -rn | head -n 1 \
        | awk '{printf "%.2f\n", ($1/1024/1024/1024)}')
}

#######################################
# Collect every metric in a fixed order and append each one to the result
# string: connection count, memory, CPU, PID, run time, start time, backlog.
# Each getter leaves its value in CURNUM; the parse* helper called right
# after it records that value under the given key.
# Globals:   FLAGEID, LOG_PATH (read); CURNUM (written)
# Arguments: none
#######################################
function analysisStat(){
    getConnNum
    parseFileds "ConnectionNum"

    getMemory
    parseFileds "Memory"

    getCPU
    parseFileds "CPU"

    CURNUM=$FLAGEID
    parseTimeFileds "FLAGEPID"

    getRuntime
    parseTimeFileds "RUNTIME"

    getStarttime
    parseTimeFileds "STARTTIME"

    # An empty path, or the literal "<<log_path>>" (which looks like a
    # placeholder that was never substituted), yields an empty backlog value.
    if [[ -z "$LOG_PATH" || "$LOG_PATH" == "<<log_path>>" ]]; then
        CURNUM=""
    else
        getBacklogData
    fi
    parseTimeFileds "backlogData"
}

#######################################
# Append the current value to the result string as 'key':'value'.
# A comma is inserted first when the result string already has content.
# Globals:   CURNUM (read); DataResultStr (read and written)
# Arguments: $1 - key under which CURNUM is recorded
#######################################
function parseTimeFileds(){
    if [[ -n "$DataResultStr" ]]; then
        DataResultStr="$DataResultStr,"
    fi
    DataResultStr="$DataResultStr'$1':'$CURNUM'"
}

#######################################
# Append the current value to the result string as 'key':'value'.
# A comma is inserted first when the result string already has content.
# No warning or error threshold is compared here; the value is only recorded.
# Globals:   CURNUM (read) - current value
#            DataResultStr (read and written)
# Arguments: $1 - key under which CURNUM is recorded
#######################################
function parseFileds(){
    # ${var:+...} expands to "<existing>," only when DataResultStr is
    # non-empty, so the first entry gets no leading comma.
    DataResultStr="${DataResultStr:+$DataResultStr,}'$1':'$CURNUM'"
}


# Final step: hand over to gotErr with result code 2.
analysisResult() { gotErr 2; }

#######################################
# Print the usage text and terminate the script.
# Arguments: $1 - optional name of the missing input; when given, a line
#                 asking for it is printed before the usage text
# Outputs:   usage text on stdout
# Returns:   never returns; always exits 1
#######################################
function showHelp(){
    if [[ -n "$1" ]]; then
        echo "请输入$1"
    fi
    echo "check_flume.sh 可以监听本地的flume的状态参数如下"
    echo "check_flume.sh [-p <port>]"
    echo "-p <port> 端口"
    echo "-l 表示数据存储路径"
    exit 1
}

# Command-line options:
#   -p <port>  port the monitored process listens on (required)
#   -l <path>  data storage path used for the backlog metric
#   -h         print the usage text
while getopts "p:l:h" arg
do
    case "$arg" in
        p)
            PORT=$OPTARG
            ;;
        l)
            LOG_PATH=$OPTARG
            ;;
        h)
            showHelp
            ;;
        *)
            # Unknown option or missing option argument.
            showHelp
            ;;
    esac
done

# The port is mandatory: without it the process cannot be located.
if [[ -z "$PORT" ]]; then
    showHelp "特征项"
fi

# Main flow of the plugin:
#   1. make sure the process is alive
#   2. collect the metrics
#   3. print the result
isAlive
analysisStat
analysisResult

LEAVE A COMMENT

Captcha Code