
#! /bin/bash
# Shared defaults and state used by the functions of this script.
LANG=en_US.UTF-8
export LANG
#WARNLIMIT=-1 # warning limit: a value above it should raise a warning
#ERRORLIMIT=-1 # error limit: a value above it is treated as an error
FLAGE=""
FLAGEID=""
# Whether to output FLAGPID
FLAGEOPID=1
# Whether to output RUNTIME
RUNTIME=1
# Whether to output STARTTIME
STARTTIME=0
TFSHOME=""
NAMEPORT=""
# Connection count
CONN=0
CONNMIN=""
CONNMAX=""
# Memory usage
MEMORY=0
MEMORYMIN=""
MEMORYMAX=""
# CPU usage
CPU=0
CPUMIN=""
CPUMAX=""
# Space usage
USEAGE=0
USEAGEMIN=""
USEAGEMAX=""
CORENUM=1
# Current metric value
CURNUM=""
# Accumulated warning message
WranResultStr=""
# Accumulated error message
ErrorResultStr=""
# Accumulated performance-data string
DataResultStr=""
quota=""
QUOTASTR=""
tfs_acquisition_json=""
# Print the status line for the given state and terminate the script.
# Arguments:
#   $1 - state code: 0 = all metrics normal, 1 = warnings only,
#        2 = errors (optionally with warnings), 3 = metrics unavailable,
#        4 = monitored process is a zombie.
# Globals read: quota, QUOTASTR, WranResultStr, ErrorResultStr,
#               DataResultStr, FLAGEOPID, FLAGEID.
# Globals written: CURNUM (state 3 only, when FLAGEOPID is 1).
# Exit status: 3 for state 4, 1 for every other state (including unknown
# state codes, which print nothing).
function gotErr() {
  local state="${1:-}"
  case "$state" in
    0)
      printf '%s\n' "mailstatedes=各项指标恢复正常 statedes=各项指标恢复正常|${DataResultStr}"
      ;;
    1)
      printf '%s\n' "quota=${quota} mailstatedes=${WranResultStr} statedes=${WranResultStr} ${QUOTASTR} |${DataResultStr}"
      ;;
    2)
      if [[ -n "${WranResultStr}" ]]; then
        # A space separates statedes from QUOTASTR, as in the sibling branch,
        # so the key=value pairs do not run into each other.
        printf '%s\n' "quota=${quota} mailstatedes=${WranResultStr},${ErrorResultStr} statedes=${WranResultStr},${ErrorResultStr} ${QUOTASTR}|${DataResultStr}"
      else
        printf '%s\n' "quota=${quota} mailstatedes=${ErrorResultStr} statedes=${ErrorResultStr} ${QUOTASTR}|${DataResultStr}"
      fi
      ;;
    3)
      if [[ "${FLAGEOPID}" == "1" ]]; then
        # Record the PID in the performance data before reporting the failure.
        CURNUM=${FLAGEID}
        parseFileds "FLAGEPID" "${CURNUM}" "${CURNUM}"
      fi
      printf '%s\n' "mailstatedes=无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置 statedes=无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置|${DataResultStr}"
      ;;
    4)
      printf '%s\n' "mailstatedes=该组件进程为僵尸进程,请确认并请检查该组件状态 statedes=该组件进程为僵尸进程,请确认并请检查该组件状态|${DataResultStr}"
      exit 3
      ;;
  esac
  exit 1
}
# Locate the nameserver process and record the facts the collectors need.
# Globals:
#   TFSHOME (read)  - directory prefix of the nameserver command line.
#   CORENUM (write) - number of "physical id" lines in /proc/cpuinfo
#                     (0 when the file is missing or has no such lines).
#   FLAGEID (write) - PID of the first process whose command line contains
#                     "$TFSHOME/nameserver"; empty when none is found.
#   ZOMBIE  (write) - the "stat pid" row reported by ps for that PID when its
#                     state starts with Z; empty otherwise.
# The liveness abort (gotErr 3) and the zombie abort (gotErr 4) are currently
# disabled, so this function only gathers data and never exits.
function isAlive() {
  CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null)
  CORENUM=${CORENUM:-0}

  FLAGEID=$(ps aux \
    | grep -F -- "${TFSHOME}/nameserver" \
    | grep -v grep \
    | awk '{print $2}' \
    | head -n 1)

  # Compare the PID column exactly; a substring match would also hit other
  # PIDs that merely contain the same digits, and an empty PID would break
  # the filter altogether.
  ZOMBIE=""
  if [[ -n "${FLAGEID}" ]]; then
    ZOMBIE=$(ps -A -o stat,pid \
      | awk -v pid="${FLAGEID}" '$1 ~ /^[Zz]/ && $2 == pid')
  fi
}
# Count the ESTABLISHED connections owned by the monitored process.
# Globals:
#   FLAGEID (read)               - PID of the monitored process.
#   CURNUM (write)               - number of matching connections (0 when the
#                                  PID is unknown).
#   tfs_acquisition_json (write) - gets ",'ConnectionNum':<count>" appended.
function getConnNum() {
  CURNUM=0
  if [[ -n "${FLAGEID}" ]]; then
    # Match the PID against the "PID/Program name" column only, so the same
    # digits inside an address or port are not counted.
    CURNUM=$(netstat -apn \
      | awk -v pid="${FLAGEID}" \
          '$6 == "ESTABLISHED" && index($7, pid "/") == 1 { n++ } END { print n + 0 }')
  fi
  tfs_acquisition_json="${tfs_acquisition_json},'ConnectionNum':${CURNUM}"
}
# Sum column 4 of `ps aux` (%MEM) over every process whose line mentions
# TFSHOME.
# Globals:
#   TFSHOME (read)               - text to look for in the process list.
#   CURNUM (write)               - the summed value (0 when nothing matches).
#   tfs_acquisition_json (write) - gets ",'Memory':<sum>" appended.
function getMemory() {
  # The helper grep is filtered out so it never contributes to the sum.
  CURNUM=$(ps aux \
    | grep -F -- "${TFSHOME}" \
    | grep -v grep \
    | awk '{ sum += $4 } END { print sum + 0 }')
  tfs_acquisition_json="${tfs_acquisition_json},'Memory':${CURNUM}"
}
# Sum column 3 of `ps aux` (%CPU) over every process whose line mentions
# TFSHOME and, when CORENUM is a positive integer, divide the sum by it.
# Globals:
#   TFSHOME, CORENUM (read)
#   CURNUM (write)               - the raw sum, or the per-core value with two
#                                  decimals when CORENUM is usable.
#   tfs_acquisition_json (write) - gets ",'CPU':<value>" appended.
function getCPU() {
  local total
  # The helper grep is filtered out so it never contributes to the sum.
  total=$(ps aux \
    | grep -F -- "${TFSHOME}" \
    | grep -v grep \
    | awk '{ sum += $3 } END { print sum + 0 }')
  CURNUM=${total}

  # The division is done in awk so the script does not depend on bc; an empty
  # or zero core count leaves the raw sum untouched.
  if [[ "${CORENUM}" =~ ^[0-9]+$ ]] && (( 10#${CORENUM} != 0 )); then
    CURNUM=$(awk -v total="${total}" -v cores="${CORENUM}" \
      'BEGIN { printf "%.2f", total / cores }')
  fi
  tfs_acquisition_json="${tfs_acquisition_json},'CPU':${CURNUM}"
}
# Read the usage figure from the "TOTAL:" line printed by the ssm tool.
# Globals:
#   TFSHOME, NAMEPORT (read)     - location of ssm and the port passed to it.
#   CURNUM (write)               - field 5 of the TOTAL: line as a number.
#   tfs_acquisition_json (write) - gets ",'USEAGE':<value>" appended.
# NOTE(review): field 5 is presumably the usage percentage; confirm against
# the actual ssm output.
function getUSEAGE() {
  # "* 100 / 100" forces a numeric conversion in awk, which drops any
  # trailing non-numeric suffix from the field.
  CURNUM=$("${TFSHOME}/ssm" -s "127.0.0.1:${NAMEPORT}" -i server \
    | awk '/TOTAL:/ { print $5 * 100 / 100 }')
  tfs_acquisition_json="${tfs_acquisition_json},'USEAGE':${CURNUM}"
}
# Look up the elapsed running time of the monitored process
# (`ps -eo pid,etime`).
# Globals:
#   FLAGEID (read)               - PID of the monitored process.
#   CURNUM (write)               - etime column of that PID; empty when the
#                                  PID is unknown or not listed.
#   tfs_acquisition_json (write) - gets ",'RUNTIME':'<etime>'" appended.
function getRuntime() {
  # The PID is passed with -v rather than spliced into the awk program, so an
  # empty PID cannot produce an awk syntax error.
  CURNUM=$(ps -eo pid,etime \
    | awk -v pid="${FLAGEID}" '$1 == pid { print $2 }')
  tfs_acquisition_json="${tfs_acquisition_json},'RUNTIME':'${CURNUM}'"
}
# Look up the start time of the monitored process (`ps -eo pid,lstart`).
# Globals:
#   FLAGEID (read)               - PID of the monitored process.
#   CURNUM (write)               - columns 2-6 of that PID's row joined by
#                                  single spaces; empty when not listed.
#   tfs_acquisition_json (write) - gets ",'STARTTIME':'<start>'" appended.
function getStarttime() {
  # The PID is passed with -v rather than spliced into the awk program, so an
  # empty PID cannot produce an awk syntax error.
  CURNUM=$(ps -eo pid,lstart \
    | awk -v pid="${FLAGEID}" '$1 == pid { print $2, $3, $4, $5, $6 }')
  tfs_acquisition_json="${tfs_acquisition_json},'STARTTIME':'${CURNUM}'"
}
# Run every collector and print the gathered metrics as one JSON object.
# Each collector appends ",'<key>':<value>" to tfs_acquisition_json; this
# function strips the leading comma, wraps the list in
# {"cpname":"tfs",...} and converts the single quotes to double quotes.
# Per-metric threshold checks (parseFileds / parseTimeFileds) and the
# CONN/MEMORY/CPU/USEAGE/RUNTIME/STARTTIME switches are currently disabled:
# all collectors run unconditionally.
# Globals:
#   tfs_acquisition_json (read, extended by the collectors)
#   result (write) - the JSON text that is printed to stdout.
function analysisStat() {
  local single_quote="'" double_quote='"'

  getConnNum
  getMemory
  getCPU
  getUSEAGE
  getRuntime
  getStarttime

  # "#*," removes everything up to and including the first comma, i.e. the
  # separator that precedes the first collected entry.
  result="{'cpname':'tfs',${tfs_acquisition_json#*,}}"
  result=${result//"${single_quote}"/${double_quote}}
  printf '%s\n' "${result}"
}
# Append a time-like metric to the performance-data string without any
# threshold evaluation.
# Arguments:
#   $1 - metric name
#   $2 - value written to the warning slot
#   $3 - value written to the critical slot
# Globals:
#   CURNUM (read)         - current metric value.
#   val1, val2 (write)    - "<CURNUM> <$2>" and "<CURNUM> <$3>".
#   DataResultStr (write) - gets " <name>=<CURNUM>;<$2>;<$3>;0;0" appended.
function parseTimeFileds() {
  val1="${CURNUM} ${2:-}"
  val2="${CURNUM} ${3:-}"
  DataResultStr="${DataResultStr} ${1:-}=${CURNUM};${2:-};${3:-};0;0"
}
# Compare the current metric value with its warning and critical limits and
# extend the result strings accordingly.
# Arguments:
#   $1 - metric name
#   $2 - warning limit
#   $3 - critical limit
# Globals:
#   CURNUM (read)          - current metric value.
#   val1, val2 (write)     - 1 when CURNUM <= warning / critical limit, else 0.
#   DataResultStr (write)  - always gets " <name>=<CURNUM>;<warn>;<crit>;0;0".
#   WranResultStr, ErrorResultStr, quota, QUOTASTR (write) - extended when a
#                            limit is exceeded.
# Returns: 0 when within the warning limit, 1 when only the warning limit is
#          exceeded, 2 when the critical limit is exceeded as well.
function parseFileds() {
  local name="${1:-}" warn_limit="${2:-}" crit_limit="${3:-}"

  # Values are compared numerically; awk handles decimals, which shell
  # integer tests cannot.
  val1=$(awk -v cur="${CURNUM}" -v lim="${warn_limit}" \
    'BEGIN { r = (cur + 0 <= lim + 0) ? 1 : 0; print r }')
  val2=$(awk -v cur="${CURNUM}" -v lim="${crit_limit}" \
    'BEGIN { r = (cur + 0 <= lim + 0) ? 1 : 0; print r }')

  DataResultStr="${DataResultStr} ${name}=${CURNUM};${warn_limit};${crit_limit};0;0"

  if [[ "${val1}" == "1" ]]; then
    return 0
  fi

  if [[ "${val2}" == "1" ]]; then
    # Above the warning limit but still within the critical limit.
    WranResultStr="${WranResultStr:+${WranResultStr},}${name}当前值为${CURNUM}超过告警值${warn_limit}"
    quota="${quota:+${quota},}${name}"
    QUOTASTR="${QUOTASTR}${name}=${name}当前值为${CURNUM}超过告警值${warn_limit} "
    return 1
  fi

  # Above the critical limit.
  ErrorResultStr="${ErrorResultStr:+${ErrorResultStr},}${name}当前值为${CURNUM}超过紧急值${crit_limit}"
  quota="${quota:+${quota},}${name}"
  QUOTASTR="${QUOTASTR}${name}=${name}当前值为${CURNUM}超过紧急值${crit_limit} "
  return 2
}
# Pick the final state from the accumulated warning/error strings and hand it
# to gotErr: 0 when both are empty, 1 when only warnings exist, 2 when any
# error exists.
# Globals read: WranResultStr, ErrorResultStr.
function analysisResult() {
  if [[ -n "${ErrorResultStr}" ]]; then
    gotErr 2
  elif [[ -n "${WranResultStr}" ]]; then
    gotErr 1
  else
    gotErr 0
  fi
}
# Print the usage text and exit with status 1.
# Arguments:
#   $1 - optional hint; when non-empty it is printed first, prefixed with
#        "请输入".
function showHelp() {
  if [[ -n "${1:-}" ]]; then
    echo "请输入${1}"
  fi
  echo "check_tfs.sh 可以监听本地的tfs的状态 参数如下"
  echo "check_tfs.sh [-w 连接数,CPU,内存,空间使用率] [-c 连接数,CPU,内存,空间使用率] [-b <str>] [-P <str>]"
  echo "-w 表示监控警告的值"
  echo "-c 表示异常值"
  echo "-b <str> tfs的安装目录"
  echo "-P <str> namenode的端口号"
  exit 1
}
# Check whether the argument is an unsigned integer or decimal number.
# Arguments:
#   $1 - text to check
# Globals:
#   isNum (write) - 1 when $1 is a number, 0 otherwise.
# Returns: the value of isNum. Note this is the reverse of the usual shell
# convention: status 1 means "is a number" and status 0 means "is not";
# callers in this file test for status 0 to detect invalid input.
function isNumber() {
  # Anchored at both ends so trailing garbage such as "12abc" is rejected.
  local number_re='^[0-9]+(\.[0-9]*)?$'
  if [[ "${1:-}" =~ ${number_re} ]]; then
    isNum=1
  else
    isNum=0
  fi
  return "${isNum}"
}
# Check that both arguments are numbers and that the first is not smaller
# than the second.
# Arguments:
#   $1 - value expected to be the larger (or equal) one
#   $2 - value expected to be the smaller (or equal) one
# Globals:
#   isLarge (write) - 1 when $1 >= $2, 0 otherwise (only set when both
#                     arguments are numbers).
# Returns: 0 when both are numbers and $1 >= $2, 1 otherwise.
function isLarge() {
  local candidate
  for candidate in "${1:-}" "${2:-}"; do
    isNumber "${candidate}"
    # isNumber returns 0 for "not a number".
    if [[ $? -eq 0 ]]; then
      return 1
    fi
  done

  # awk compares numerically and copes with decimals.
  isLarge=$(awk -v first="${1}" -v second="${2}" \
    'BEGIN { r = (first + 0 >= second + 0) ? 1 : 0; print r }')
  if [[ "${isLarge}" == "1" ]]; then
    return 0
  fi
  return 1
}
# Validate one warning/critical pair and, when it is valid, switch the named
# metric flag on.
# Arguments:
#   $1 - name of the flag variable to set to 1 (e.g. CONN)
#   $2 - lower (warning) limit
#   $3 - upper (critical) limit
# Returns: 0 when the pair is valid or when either limit is empty (nothing is
#          checked or set in that case); 1 when isLarge rejects the pair or
#          $1 is not a valid variable name.
function checkARG() {
  if [[ -n "${2:-}" && -n "${3:-}" ]]; then
    if ! isLarge "${3}" "${2}"; then
      return 1
    fi
    if [[ ! "${1:-}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
      return 1
    fi
    # "$1=1" would be run as a command, not an assignment; printf -v assigns
    # to the variable whose name is held in $1.
    printf -v "${1}" '%s' 1
  fi
  return 0
}
# Validate the configured limits, i.e. check that they are integers or
# decimals, and enable each metric whose pair is valid.
# (Original note: for the daemon-process selection this value does not need
# to be validated.)
# For each of CONN, CPU, MEMORY and USEAGE:
#   - both <NAME>MIN and <NAME>MAX empty: the metric is left untouched;
#   - only one of them set: the configuration is rejected;
#   - both set: isLarge must confirm MAX >= MIN, then <NAME> is set to 1.
# Returns: 0 when every configured pair is valid, 1 on the first invalid pair.
function checkALL() {
  local metric min_name max_name min_value max_value
  for metric in CONN CPU MEMORY USEAGE; do
    min_name="${metric}MIN"
    max_name="${metric}MAX"
    min_value="${!min_name}"
    max_value="${!max_name}"

    if [[ -z "${min_value}" && -z "${max_value}" ]]; then
      continue
    fi
    # Both ends are required once either one is given.
    if [[ -z "${min_value}" || -z "${max_value}" ]]; then
      return 1
    fi
    if ! isLarge "${max_value}" "${min_value}"; then
      return 1
    fi
    printf -v "${metric}" '%s' 1
  done
  return 0
}
# Split a comma-separated limit list into the per-metric limit variables.
# Arguments:
#   $1 - 1 to fill the warning limits (*MIN); any other value fills the
#        critical limits (*MAX)
#   $2 - "connections,cpu,memory,usage"; missing entries become empty
# Globals written: CONNMIN/CPUMIN/MEMORYMIN/USEAGEMIN or
#                  CONNMAX/CPUMAX/MEMORYMAX/USEAGEMAX.
function parseARG() {
  local -a fields=()
  local idx
  IFS=',' read -r -a fields <<< "${2:-}"

  # Keep digits and the decimal point only (units such as "%" are dropped);
  # stripping the point as well would turn 2.5 into 25.
  for idx in 0 1 2 3; do
    fields[idx]=${fields[idx]:-}
    fields[idx]=${fields[idx]//[^0-9.]/}
  done

  if [[ "${1:-}" == "1" ]]; then
    # Warning limits.
    CONNMIN=${fields[0]}
    CPUMIN=${fields[1]}
    MEMORYMIN=${fields[2]}
    USEAGEMIN=${fields[3]}
  else
    # Critical limits.
    CONNMAX=${fields[0]}
    CPUMAX=${fields[1]}
    MEMORYMAX=${fields[2]}
    USEAGEMAX=${fields[3]}
  fi
}
# Entry point: parse the command line, apply defaults, then collect and print
# the metrics.
# Options handled here:
#   -b <dir>   value stored in TFSHOME
#   -P <port>  value stored in NAMEPORT
#   -h         print the usage text and exit
# Any other option also prints the usage text and exits.
while getopts "b:P:h" arg; do
  case "${arg}" in
    b)
      TFSHOME=${OPTARG}
      ;;
    P)
      NAMEPORT=${OPTARG}
      ;;
    h)
      showHelp
      ;;
    *)
      showHelp
      ;;
  esac
done

if [[ -z "${TFSHOME}" ]]; then
  TFSHOME="/usr/local/tfs"
fi
if [[ -z "${NAMEPORT}" ]]; then
  NAMEPORT="9108"
fi

# Limit validation is disabled. The former "$? == 1" test that followed the
# commented-out checkALL call only inspected the status of the preceding
# default assignment, so it could never fire. Re-enable both together:
#   checkALL || showHelp "正确参数"

# Main part of the plugin: locate the process, then collect and print.
isAlive
analysisStat
#analysisResult