
#! /bin/bash
# Force an English UTF-8 locale for this script and the commands it runs.
export LANG=en_US.UTF-8
#WARNLIMIT=-1 # warning threshold: values above it are treated as a warning
#ERRORLIMIT=-1 # error threshold: values above it are treated as an error
logPath= # log directory; set from the -l option
# NOTE(review): FLAGE is not referenced anywhere in the visible part of this script.
FLAGE=
# PID of the process listening on PORT; filled in by isAlive.
FLAGEID=
# Whether to output FLAGPID
# NOTE(review): FLAGEOPID, RUNTIME and STARTTIME are never read in the visible script.
FLAGEOPID=1
# Whether to output RUNTIME
RUNTIME=1
# Whether to output STARTTIME
STARTTIME=0
# NOTE(review): the CONN*/MEMORY*/CPU*/THREAD* switches and MIN/MAX limits below
# are declared but never read in the visible script.
CONN=0 # whether to check the connection count
CONNMIN=
CONNMAX=
MEMORY=0 # whether to check memory
MEMORYMIN=
MEMORYMAX=
CPU=0 # whether to check CPU
CPUMIN=
CPUMAX=
THREAD=0 # whether to check threads
THREADMIN=
THREADMAX=
# TCP port to inspect; set from the -p option.
PORT=
CURNUM= # the metric value most recently collected
# Divisor used by getCPU; overwritten by isAlive with the number of
# "physical id" lines found in /proc/cpuinfo.
CORENUM=1
# NOTE(review): the three result strings below are never written or read in the visible script.
WranResultStr= # final warning string
ErrorResultStr= # final error string
DataResultStr= # performance metric string
QUOTASTR=
quota=
# Accumulates ",'key':value" fragments appended by each collector function.
# NOTE(review): no function visible here prints this string; output is built from `map`.
tomcat_acquisition_json=""
# Metric names in the order analysisResult prints them
# ("cpname" is appended separately and is not part of this list).
keys=(
  FLAGEPID ConnectionNum Memory CPU Thread
  RUNTIME STARTTIME accessLog accessLog10
)
# Debug aid: `echo ${#keys[@]}` shows how many metrics there are (cpname excluded).
# Pre-seed every metric with an empty value so each key always exists in the map.
declare -A map=()
for metric in "${keys[@]}"; do
  map["$metric"]=""
done
#######################################
# Print a diagnostic for a fatal monitoring condition and terminate.
# Arguments: $1 - exit code; 3 (no metrics / no listener) and
#                 4 (zombie process) print a message first.
# Outputs:   the diagnostic, on stdout, for codes 3 and 4 only.
# Returns:   never; the script exits with the given code.
#######################################
gotErr() {
  local code=$1
  local message=""

  if [ "$code" -eq 3 ]; then
    message="无法获取到指标,疑似组件故障,请确认并请检查监控脚本和运维平台配置"
  elif [ "$code" -eq 4 ]; then
    message="该组件进程为僵尸进程,请确认并请检查该组件状态"
  fi

  if [ -n "$message" ]; then
    echo "$message"
  fi
  # Left unquoted on purpose so an empty code behaves like a bare `exit`.
  exit $code
}
#######################################
# Check that the service is alive: find the process listening on $PORT
# and make sure it is not a zombie.
# Globals:
#   PORT    (read)    - TCP port to look up
#   CORENUM (written) - number of "physical id" lines in /proc/cpuinfo
#   FLAGEID (written) - numeric PID of the listening process
# Returns:
#   Only when a live, non-zombie PID was found. Otherwise exits through
#   gotErr 3 (no PID could be determined) or gotErr 4 (process is a zombie).
#######################################
isAlive() {
  local proc_state

  CORENUM=$(grep -c "physical id" /proc/cpuinfo 2>/dev/null)
  CORENUM=${CORENUM:-0}

  # Column 6 of the last `ss -lnpt` line is the users:(...) field; its second
  # comma-separated item is the PID. Older ss prints the bare number, newer
  # releases print "pid=<number>", so strip that prefix when present.
  FLAGEID=$(ss -lnpt "( sport == :$PORT )" \
    | tail -n 1 \
    | awk '{ split($6, parts, ","); print parts[2] }')
  FLAGEID=${FLAGEID#pid=}

  # With no listener only the header line is printed and no PID is extracted.
  if [[ ! "$FLAGEID" =~ ^[0-9]+$ ]]; then
    gotErr 3
  fi

  # Query the state of exactly this PID; a substring grep over all zombies
  # would also match unrelated PIDs that merely contain the same digits.
  proc_state=$(ps -o stat= -p "$FLAGEID" | tr -d '[:space:]')
  if [[ "$proc_state" == [Zz]* ]]; then
    gotErr 4
  fi
}
#######################################
# Collect the connection count: established TCP connections whose
# local port is $PORT.
# Globals:
#   PORT (read); CURNUM, map[ConnectionNum], tomcat_acquisition_json (written)
#######################################
getConnNum() {
  local established

  # Every output line that does not contain "Address" (i.e. is not the
  # header) is one connection.
  established=$(ss -oanp state established "( sport == :$PORT )" | grep -c -v "Address")

  CURNUM=$established
  map["ConnectionNum"]=$CURNUM
  tomcat_acquisition_json+=",'ConnectionNum':$CURNUM"
}
#######################################
# Collect memory usage: sum of column 4 (%MEM) of `ps aux` for the
# monitored process.
# Globals:
#   FLAGEID (read); CURNUM, map[Memory], tomcat_acquisition_json (written)
#######################################
getMemory() {
  # Keep the original "tomcat" and "bootstrap" filters on the whole line, but
  # match the PID column exactly: a substring match on the PID would also
  # count other processes whose PID or arguments contain the same digits.
  CURNUM=$(ps aux | awk -v pid="$FLAGEID" '
    BEGIN { sum = 0 }
    $2 == pid && /tomcat/ && /bootstrap/ { sum += $4 }
    END { print sum }')

  tomcat_acquisition_json="$tomcat_acquisition_json,'Memory':$CURNUM"
  map["Memory"]="$CURNUM"
}
#######################################
# Collect CPU usage: sum of column 3 (%CPU) of `ps aux` for the monitored
# process, divided by CORENUM when CORENUM is non-zero.
# Globals:
#   FLAGEID, CORENUM (read); CURNUM, map[CPU], tomcat_acquisition_json (written)
#######################################
getCPU() {
  local per_core

  # Keep the original "tomcat" and "bootstrap" filters on the whole line, but
  # match the PID column exactly: a substring match on the PID would also
  # count other processes whose PID or arguments contain the same digits.
  CURNUM=$(ps aux | awk -v pid="$FLAGEID" '
    BEGIN { sum = 0 }
    $2 == pid && /tomcat/ && /bootstrap/ { sum += $3 }
    END { print sum }')

  if [ "$CORENUM" -ne 0 ]; then
    # bc truncates to two decimals; printf then pads the result to "N.NN".
    per_core=$(echo "scale=2;$CURNUM/$CORENUM" | bc)
    CURNUM=$(printf "%.2f" "${per_core:-0}")
  fi

  tomcat_acquisition_json="$tomcat_acquisition_json,'CPU':$CURNUM"
  map["CPU"]="$CURNUM"
}
#######################################
# Collect the thread count of the monitored process.
# `ps -mp <pid>` output is counted line by line and 2 is subtracted
# (presumably the header line and the process summary line).
# Globals:
#   FLAGEID (read); CURNUM, map[Thread], tomcat_acquisition_json (written)
#######################################
getThreadNum() {
  local threads

  threads=$(ps -mp "$FLAGEID" | awk 'END { print NR - 2 }')

  CURNUM=$threads
  map["Thread"]=$CURNUM
  tomcat_acquisition_json+=",'Thread':$CURNUM"
}
#######################################
# Collect the elapsed running time of the monitored process
# (the etime column of `ps -eo pid,etime`).
# Globals:
#   FLAGEID (read); CURNUM, map[RUNTIME], tomcat_acquisition_json (written)
#######################################
getRuntime() {
  # Pass the PID with -v instead of splicing it into the awk program, so an
  # empty or malformed FLAGEID cannot produce an awk syntax error. The exact
  # match on column 1 makes a separate grep unnecessary.
  CURNUM=$(ps -eo pid,etime | awk -v pid="$FLAGEID" '$1 == pid { print $2 }')

  tomcat_acquisition_json="$tomcat_acquisition_json,'RUNTIME':'$CURNUM'"
  map["RUNTIME"]="$CURNUM"
}
#######################################
# Collect the start time of the monitored process
# (the five lstart fields of `ps -eo pid,lstart`, joined by single spaces).
# Globals:
#   FLAGEID (read); CURNUM, map[STARTTIME], tomcat_acquisition_json (written)
#######################################
getStarttime() {
  # Pass the PID with -v instead of splicing it into the awk program, so an
  # empty or malformed FLAGEID cannot produce an awk syntax error. The exact
  # match on column 1 makes a separate grep unnecessary.
  CURNUM=$(ps -eo pid,lstart \
    | awk -v pid="$FLAGEID" '$1 == pid { print $2, $3, $4, $5, $6 }')

  tomcat_acquisition_json="$tomcat_acquisition_json,'STARTTIME':'$CURNUM'"
  map["STARTTIME"]="$CURNUM"
}
#######################################
# Count the lines of today's access log from the last minute whose last
# field (the request processing time) is greater than 1.
# The original notes say that when the count exceeds the alert value the
# thread should be dumped and the dump kept next to the log; that part is
# not implemented here.
# Globals:
#   logPath (read) - directory holding localhost_access_log.<YYYY-MM-DD>.txt
#   CURNUM, map[accessLog], tomcat_acquisition_json (written)
# Result is 0 when logPath is empty or the log file is not readable.
#######################################
getAccessLog() {
  local now day window_start window_end log_file

  CURNUM=0
  if [[ -n "$logPath" ]]; then
    # Derive every timestamp from one clock reading so they cannot disagree.
    now=$(date +%s)
    day=$(date -d "@$now" +"%Y-%m-%d")
    window_start=$(date -d "@$((now - 60))" +"%d/%b/%Y:%H:%M:%S")
    window_end=$(date -d "@$now" +"%d/%b/%Y:%H:%M:%S")

    # Timestamps are compared as strings, which is only valid within one day.
    # Just after midnight clamp the window to the start of today; today's
    # file holds no earlier entries anyway.
    if [[ "${window_start%%:*}" != "${window_end%%:*}" ]]; then
      window_start="${window_end%%:*}:00:00:00"
    fi

    log_file="$logPath/localhost_access_log.$day.txt"
    if [[ -r "$log_file" ]]; then
      # Field 4 looks like "[dd/Mon/yyyy:HH:MM:SS"; drop the leading bracket.
      CURNUM=$(awk -v st="$window_start" -v et="$window_end" '
        { t = substr($4, 2) }
        t >= st && t <= et && $NF > 1 { n++ }
        END { print n + 0 }' "$log_file")
    fi
  fi

  tomcat_acquisition_json="$tomcat_acquisition_json,'accessLog':$CURNUM"
  map["accessLog"]="$CURNUM"
}
#######################################
# Count the lines of today's access log from the last minute whose last
# field (the request processing time) is greater than 10.
# Globals:
#   logPath (read) - directory holding localhost_access_log.<YYYY-MM-DD>.txt
#   CURNUM, map[accessLog10], tomcat_acquisition_json (written)
# Result is 0 when logPath is empty or the log file is not readable.
#######################################
getAccessLog10() {
  local now day window_start window_end log_file

  CURNUM=0
  if [[ -n "$logPath" ]]; then
    # Derive every timestamp from one clock reading so they cannot disagree.
    now=$(date +%s)
    day=$(date -d "@$now" +"%Y-%m-%d")
    window_start=$(date -d "@$((now - 60))" +"%d/%b/%Y:%H:%M:%S")
    window_end=$(date -d "@$now" +"%d/%b/%Y:%H:%M:%S")

    # Timestamps are compared as strings, which is only valid within one day.
    # Just after midnight clamp the window to the start of today; today's
    # file holds no earlier entries anyway.
    if [[ "${window_start%%:*}" != "${window_end%%:*}" ]]; then
      window_start="${window_end%%:*}:00:00:00"
    fi

    log_file="$logPath/localhost_access_log.$day.txt"
    if [[ -r "$log_file" ]]; then
      # Field 4 looks like "[dd/Mon/yyyy:HH:MM:SS"; drop the leading bracket.
      CURNUM=$(awk -v st="$window_start" -v et="$window_end" '
        { t = substr($4, 2) }
        t >= st && t <= et && $NF > 10 { n++ }
        END { print n + 0 }' "$log_file")
    fi
  fi

  tomcat_acquisition_json="$tomcat_acquisition_json,'accessLog10':$CURNUM"
  map["accessLog10"]="$CURNUM"
}
#######################################
# Run every collector in turn; each one stores its value in `map` and
# appends a fragment to tomcat_acquisition_json.
# The two access-log collectors run only when logPath is set to something
# other than the literal placeholder "<<log_path>>".
# Globals:
#   FLAGEID, logPath (read)
#   CURNUM, map[FLAGEPID], tomcat_acquisition_json (written)
#######################################
analysisStat() {
  getConnNum
  getMemory
  getCPU
  getThreadNum

  # The PID itself is reported as the FLAGEPID metric.
  CURNUM=$FLAGEID
  map["FLAGEPID"]=$CURNUM
  tomcat_acquisition_json+=",'FLAGEPID':'$CURNUM'"

  getRuntime
  getStarttime

  if [[ "$logPath" != "" && "$logPath" != "<<log_path>>" ]]; then
    getAccessLog
    getAccessLog10
  fi
}
#######################################
# Print the result: one line holding a JSON-style object with every metric
# from `keys` (in that order) followed by "cpname":"tomcat".
# Globals:   keys, map (read)
# Outputs:   the object on stdout, terminated by a newline
#######################################
analysisResult() {
  local key value
  local single_quote="'" double_quote='"'

  printf '{'
  for key in "${keys[@]}"; do
    # Kept from the original: single quotes in a value become double quotes.
    # NOTE(review): values are not JSON-escaped, so a value containing a
    # quote yields invalid JSON; confirm what the consumer expects.
    value=${map[$key]}
    value=${value//"$single_quote"/$double_quote}
    # printf '%s' emits the value verbatim; `echo -e` would interpret
    # backslash sequences inside it and could truncate the output.
    printf '"%s":"%s",' "$key" "$value"
  done
  printf '"cpname":"tomcat"}\n'
}
#######################################
# Print an optional "please provide <item>" hint followed by the usage
# text, then exit.
# Arguments: $1 - (optional) name of the missing item
# Outputs:   hint and usage on stdout
# Returns:   never; the script exits with status 1.
#######################################
showHelp() {
  if [[ -n "$1" ]]; then
    echo "请输入$1"
  fi
  # The usage lists exactly the options the script's getopts string
  # ("p:l:h") accepts; -w and -c were advertised before but are not parsed.
  cat <<'EOF'
check_tomcat.sh 可以监听本地的tomcat的状态 参数如下
check_tomcat.sh -p <port> [-l <日志路径>] [-h]
-p 表示tomcat端口号
-l 表示日志路径
-h 显示帮助信息
EOF
  exit 1
}
# Option parsing: -p <port> and -l <log dir> set globals; -h and anything
# getopts rejects (reported as "?") print the usage text and exit.
while getopts "p:l:h" opt; do
  case "$opt" in
    p) PORT=$OPTARG ;;
    l) logPath=$OPTARG ;;
    h | \?) showHelp ;;
  esac
done

# The port is mandatory.
if [[ -z "$PORT" ]]; then
  showHelp "特征项"
fi

# Main part of the plugin:
#   1. confirm the service is alive,
#   2. collect the metrics,
#   3. print the result.
isAlive
analysisStat
analysisResult