下面脚本是阿里云服务器收集性能指标的脚本:
#!/usr/bin/python ######################################### # Function: sample linux performance indices # Usage: python sampler.py # Author: CMS DEV TEAM # Company: Aliyun Inc. # Version: 1.1 ######################################### import os import os.path import sys import time import operator import httplib import logging import socket import random from shutil import copyfile from subprocess import Popen, PIPE from logging.handlers import RotatingFileHandler logger = None REMOTE_HOST = None REMOTE_PORT = None REMOTE_MONITOR_URI = None UUID = None def get_mem_usage_percent(): try: f = open('/proc/meminfo', 'r') for line in f: if line.startswith('MemTotal:'): mem_total = int(line.split()[1]) elif line.startswith('MemFree:'): mem_free = int(line.split()[1]) elif line.startswith('Buffers:'): mem_buffer = int(line.split()[1]) elif line.startswith('Cached:'): mem_cache = int(line.split()[1]) elif line.startswith('SwapTotal:'): vmem_total = int(line.split()[1]) elif line.startswith('SwapFree:'): vmem_free = int(line.split()[1]) else: continue f.close() except: return None physical_percent = usage_percent(mem_total - (mem_free + mem_buffer + mem_cache), mem_total) virtual_percent = 0 if vmem_total > 0: virtual_percent = usage_percent((vmem_total - vmem_free), vmem_total) return physical_percent, virtual_percent black_list = ('iso9660',) def usage_percent(use, total): try: ret = (float(use) / total) * 100 except ZeroDivisionError: raise Exception("ERROR - zero division error") return ret def get_disk_partition(): return_list = [] pd = [] try: f = open("/proc/filesystems", "r") for line in f: if not line.startswith("nodev"): fs_type = line.strip() if fs_type not in black_list: pd.append(fs_type) f.close() f = open('/etc/mtab', "r") for line in f: if line.startswith('none'): continue tmp = line.strip().split() ft = tmp[2] if ft not in pd: continue return_list.append(tmp[1]) f.close() except: return None return return_list def check_disk(): try: return_dict = {} p_list = get_disk_partition() for i in p_list: dt = os.statvfs(i) use = (dt.f_blocks - dt.f_bfree) * dt.f_frsize all = dt.f_blocks * dt.f_frsize return_dict[i] = ('%.2f' % (usage_percent(use, all),), ('%.2f' % (all * 1.0 / (1024 * 1000000)))) except: return None return return_dict _CLOCK_TICKS = os.sysconf("SC_CLK_TCK") def get_cpu_time(): need_sleep = False if not os.path.isfile('/tmp/cpu_stat') or os.path.getsize('/tmp/cpu_stat') == 0: copyfile('/proc/stat', '/tmp/cpu_stat') need_sleep = True try: f1 = open('/tmp/cpu_stat', 'r') values1 = f1.readline().split() total_time1 = 0 for i in values1[1:]: total_time1 += int(i) idle_time1 = int(values1[4]) iowait_time1 = int(values1[5]) finally: f1.close() if need_sleep: time.sleep(1) f2 = open('/proc/stat', 'r') try: values2 = f2.readline().split() total_time2 = 0 for i in values2[1:]: total_time2 += int(i) idle_time2 = int(values2[4]) iowait_time2 = int(values2[5]) finally: f2.close() idle_time = idle_time2 - idle_time1 iowait_time = iowait_time2 - iowait_time1 total_time = total_time2 - total_time1 cpu_percentage = int(100.0 * (total_time - idle_time - iowait_time) / total_time) # compensate logic if total_time < 0 or idle_time < 0 or iowait_time < 0 or cpu_percentage < 0 or cpu_percentage > 100: time.sleep(1) f3 = open('/proc/stat', 'r') try: values3 = f3.readline().split() total_time3 = 0 for i in values3[1:]: total_time3 += int(i) idle_time3 = int(values3[4]) iowait_time3 = int(values3[5]) finally: f3.close() idle_time = idle_time3 - idle_time2 iowait_time = iowait_time3 - iowait_time2 total_time = total_time3 - total_time2 cpu_percentage = int(100.0 * (total_time - idle_time - iowait_time) / total_time) copyfile('/proc/stat', '/tmp/cpu_stat') return cpu_percentage def network_io_kbitps(): """Return network I/O statistics for every network interface installed on the system as a dict of raw tuples. """ f1 = open("/proc/net/dev", "r") try: lines1 = f1.readlines() finally: f1.close() retdict1 = {} for line1 in lines1[2:]: colon1 = line1.find(':') assert colon1 > 0, line1 name1 = line1[:colon1].strip() fields1 = line1[colon1 + 1:].strip().split() bytes_recv1 = float('%.4f' % (float(fields1[0]) * 0.0078125)) bytes_sent1 = float('%.4f' % (float(fields1[8]) * 0.0078125)) retdict1[name1] = (bytes_recv1, bytes_sent1) time.sleep(1) f2 = open("/proc/net/dev", "r") try: lines2 = f2.readlines() finally: f2.close() retdict2 = {} for line2 in lines2[2:]: colon2 = line2.find(':') assert colon2 > 0, line2 name2 = line2[:colon2].strip() fields2 = line2[colon2 + 1:].strip().split() bytes_recv2 = float('%.4f' % (float(fields2[0]) * 0.0078125)) bytes_sent2 = float('%.4f' % (float(fields2[8]) * 0.0078125)) retdict2[name2] = (bytes_recv2, bytes_sent2) retdict = merge_with(retdict2, retdict1) return retdict def disk_io_Kbps(): iostat = Popen("iostat -d -k 1 2 | sed '/Device/|Linux/|^$/d' > /tmp/disk_io", shell=True, stdout=PIPE, stderr=PIPE) iostat_error = iostat.communicate()[1].strip() if iostat_error: logger.error("iostat not exists, %s" % iostat_error) return None retdict = {} exception = None try: try: f = open('/tmp/disk_io', 'r') except Exception, ex: exception = ex logger.error(exception) if exception: return None lines = f.readlines() for line in lines: name, _, readkps, writekps, _, _, = line.split() if name: readkps = float(readkps) writekps = float(writekps) retdict[name] = (readkps, writekps) return retdict finally: f.close() def merge_with(d1, d2, fn=lambda x, y: tuple(map(operator.sub, x, y))): res = d1.copy() # "= dict(d1)" for lists of tuples for key, val in d2.iteritems(): # ".. in d2" for lists of tuples try: res[key] = fn(res[key], val) except KeyError: res[key] = val return res def get_load(): try: f = open('/proc/loadavg', 'r') tmp = f.readline().split() lavg_1 = float(tmp[0]) lavg_5 = float(tmp[1]) lavg_15 = float(tmp[2]) f.close() except: return None return lavg_1, lavg_5, lavg_15 def get_tcp_status(): check_cmd = "command -v ss" check_proc = Popen(check_cmd, shell=True, stdout=PIPE) ss = check_proc.communicate()[0].rstrip('/n') if ss: cmd = "ss -ant | awk '{if(NR != 1) print $1}' | awk '{state=$1;arr[state]++} END{for(i in arr){printf /"%s=%s /", i,arr[i]}}' | sed 's/-/_/g' | sed 's/ESTAB=/ESTABLISHED=/g' | sed 's/FIN_WAIT_/FIN_WAIT/g'" else: cmd = "netstat -anp | grep tcp | awk '{print $6}' | awk '{state=$1;arr[state]++} END{for(i in arr){printf /"%s=%s /", i,arr[i]}}' | tail -n 1" tcp_proc = Popen(cmd, shell=True, stdout=PIPE) tcp_status = tcp_proc.communicate()[0].rstrip('/n') return tcp_status def get_proc_number(): cmd = "ps axu | wc -l | tail -n 1" proc_func = Popen(cmd, shell=True, stdout=PIPE) proc_number = proc_func.communicate()[0].rstrip('/n') return proc_number def all_index(): return ( int(time.time() * 1000), get_cpu_time(), get_mem_usage_percent(), check_disk(), disk_io_Kbps(), network_io_kbitps(), get_load(), get_tcp_status(), get_proc_number() ) def collector(): timestamp, cpu, mem, disk, disk_io, net, load, tcp_status, process_number = all_index() disk_utilization = '' disk_io_read = '' disk_io_write = '' internet_networkrx = '' internet_networktx = '' tcp_status_count = '' period_1 = '' period_5 = '' period_15 = '' if UUID: cpu_utilization = 'vm.CPUUtilization ' + str(timestamp) + ' ' + str(cpu) + ' ns=ACS/ECS unit=Percent instanceId=%s/n' % UUID memory_utilization = 'vm.MemoryUtilization ' + str(timestamp) + ' ' + str(mem[0]) + ' ns=ACS/ECS unit=Percent instanceId=%s/n' % UUID if load: period_1 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[0]) + ' ns=ACS/ECS unit=count' + ' instanceId=%s period=1min/n' % UUID period_5 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[1]) + ' ns=ACS/ECS unit=count' + ' instanceId=%s period=5min/n' % UUID period_15 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[2]) + ' ns=ACS/ECS unit=count' + ' instanceId=%s period=15min/n' % UUID if disk: for name, value in disk.items(): disk_utilization = disk_utilization + 'vm.DiskUtilization ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Percent instanceId=%s mountpoint=%s/n' % (UUID, name) if disk_io: for name, value in disk_io.items(): disk_io_read = disk_io_read + 'vm.DiskIORead ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Kilobytes/Second instanceId=%s diskname=%s/n' % (UUID, name) disk_io_write = disk_io_write + 'vm.DiskIOWrite ' + str(timestamp) + ' ' + str(value[1]) + ' ns=ACS/ECS unit=Kilobytes/Second instanceId=%s diskname=%s/n' % (UUID, name) for name, value in net.items(): internet_networkrx = internet_networkrx + 'vm.InternetNetworkRX ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Kilobits/Second instanceId=%s netname=%s/n' % (UUID, name) internet_networktx = internet_networktx + 'vm.InternetNetworkTX ' + str(timestamp) + ' ' + str(value[1]) + ' ns=ACS/ECS unit=Kilobits/Second instanceId=%s netname=%s/n' % (UUID, name) if tcp_status: status_count = tcp_status.split() for element in status_count: key_value = element.split('=') tcp_status_count = tcp_status_count + 'vm.TcpCount ' + str(timestamp) + ' ' + key_value[1] + ' ns=ACS/ECS unit=Count instanceId=%s state=%s/n' % (UUID, key_value[0]) process_count = 'vm.ProcessCount ' + str(timestamp) + ' ' + process_number + ' ns=ACS/ECS unit=Count instanceId=%s/n' % UUID else: cpu_utilization = 'vm.CPUUtilization ' + str(timestamp) + ' ' + str(cpu) + ' ns=ACS/ECS unit=Percent/n' memory_utilization = 'vm.MemoryUtilization ' + str(timestamp) + ' ' + str(mem[0]) + ' ns=ACS/ECS unit=Percent/n' if load: period_1 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[0]) + ' ns=ACS/ECS unit=count period=1min/n' period_5 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[1]) + ' ns=ACS/ECS unit=count period=5min/n' period_15 = 'vm.LoadAverage ' + str(timestamp) + ' ' + str(load[2]) + ' ns=ACS/ECS unit=count period=15min/n' if disk: for name, value in disk.items(): disk_utilization = disk_utilization + 'vm.DiskUtilization ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Percent mountpoint=%s/n' % name if disk_io: for name, value in disk_io.items(): disk_io_read = disk_io_read + 'vm.DiskIORead ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Kilobytes/Second diskname=%s/n' % name disk_io_write = disk_io_write + 'vm.DiskIOWrite ' + str(timestamp) + ' ' + str(value[1]) + ' ns=ACS/ECS unit=Kilobytes/Second diskname=%s/n' % name for name, value in net.items(): internet_networkrx = internet_networkrx + 'vm.InternetNetworkRX ' + str(timestamp) + ' ' + str(value[0]) + ' ns=ACS/ECS unit=Kilobits/Second netname=%s/n' % name internet_networktx = internet_networktx + 'vm.InternetNetworkTX ' + str(timestamp) + ' ' + str(value[1]) + ' ns=ACS/ECS unit=Kilobits/Second netname=%s/n' % name if tcp_status: status_count = tcp_status.split() for element in status_count: key_value = element.split('=') tcp_status_count = tcp_status_count + 'vm.TcpCount ' + str(timestamp) + ' ' + key_value[1] + ' ns=ACS/ECS unit=Count state=%s/n' % key_value[0] process_count = 'vm.ProcessCount ' + str(timestamp) + ' ' + process_number + ' ns=ACS/ECS unit=Count/n' data_post = cpu_utilization + memory_utilization + period_1 + period_5 + period_15 + disk_utilization + disk_io_read + disk_io_write + internet_networkrx + internet_networktx + tcp_status_count + process_count print data_post interval = random.randint(0, 5000) time.sleep(interval / 1000.0) headers = {"Content-Type": "text/plain", "Accept": "text/plain"} exception = None http_client = None try: try: http_client = httplib.HTTPConnection(REMOTE_HOST, REMOTE_PORT) http_client.request(method="POST", url=REMOTE_MONITOR_URI, body=data_post, headers=headers) response = http_client.getresponse() if response.status == 200: return else: logger.warn("response code %d" % response.status) logger.warn("response code %s" % response.read()) except Exception, ex: exception = ex finally: if http_client: http_client.close() if exception: logger.error(exception) if __name__ == '__main__': REMOTE_HOST = 'open.cms.aliyun.com' REMOTE_PORT = 80 # get report address if not os.path.isfile("../cmscfg"): pass else: props = {} prop_file = file("../cmscfg", 'r') for line in prop_file.readlines(): kv = line.split('=') props[kv[0].strip()] = kv[1].strip() prop_file.close() if props.get('report_domain'): REMOTE_HOST = props.get('report_domain') if props.get('report_port'): REMOTE_PORT = props.get('report_port') # get uuid if not os.path.isfile("../aegis_quartz/conf/uuid"): pass else: uuid_file = file("../aegis_quartz/conf/uuid", 'r') UUID = uuid_file.readline() UUID = UUID.lower() REMOTE_MONITOR_URI = "/metrics/putLines" MONITOR_DATA_FILE_DIR = "/tmp" LOG_FILE = "/tmp/" + "vm.log" LOG_LEVEL = logging.INFO LOG_FILE_MAX_BYTES = 1024 * 1024 LOG_FILE_MAX_COUNT = 3 logger = logging.getLogger('sampler') logger.setLevel(LOG_LEVEL) handler = RotatingFileHandler(filename=LOG_FILE, mode='a', maxBytes=LOG_FILE_MAX_BYTES, backupCount=LOG_FILE_MAX_COUNT) formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s') handler.setFormatter(formatter) logger.addHandler(handler) socket.setdefaulttimeout(10) try: collector() except Exception, e: logger.error(e) sys.exit(1)