Graphing Prstat raw data with Python, matplotlib and pandas

On a recent performance testing engagement, the customer didn’t have a resource monitoring solution in place.  Luckily, most Unix operating systems come with excellent built-in tools; it just takes a little tinkering.

For the project requirements, I needed the following two system-level KPIs:

  • CPU usage
  • Java HEAP usage

As you may know, prstat can’t provide data for the second KPI – Java heap usage.  Not to fear: for that I used the verbose GC logs generated by the Weblogic JVM, and I wrote another blog entry dedicated to graphing them.  This article covers the first KPI.
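
For reference, on a HotSpot JVM of that era, verbose GC logging is typically switched on with options along the following lines (the log path is only an example, and if your Weblogic domain runs on JRockit instead of HotSpot the flags differ):

-verbose:gc -Xloggc:/path/to/gc.log -XX:+PrintGCDetails -XX:+PrintGCDateStamps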

First, let’s provide some details on how the prstat data was harvested.

Harvesting prstat raw data

The following prstat command was used to harvest the raw data.

prstat -u <target_process_owner> 1 > ~/prstat-<timestamp>.log </dev/null 2>/dev/null &
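
In practice, this command was wrapped in a small piece of Python so that each run writes to a timestamped log file and keeps collecting in the background after the session ends.  A minimal sketch of such a wrapper (the process owner below is just a placeholder) might look like this:

import subprocess
import time

# Placeholder: the user that owns the target Java processes.
target_process_owner = "weblogic"

# Timestamp used to give each prstat log a unique name.
timestr = time.strftime("%Y%m%d-%H%M%S")

# Sample the target user's processes every second, stream the output to a
# log file in the home directory, and detach from the terminal so the
# collector keeps running in the background.
cmd = "prstat -u %s 1 > ~/prstat-%s.log </dev/null 2>/dev/null &" % (target_process_owner, timestr)
subprocess.call(cmd, shell=True)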

Post-processing prstat raw data

To make the graphing part of the task as easy as possible, some post-processing of the raw data is required.  The code below does the job; I won’t cover it in detail, but it should be reusable as-is.
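
Before looking at the code, here is roughly what a raw prstat sample looks like (the numbers below are made up, but the column layout is the standard prstat one).  This explains why the script skips the repeated "PID" header and "Total:" summary lines and strips the %, M and K suffixes:

   PID USERNAME  SIZE   RSS STATE  PRI NICE      TIME  CPU PROCESS/NLWP
  2345 weblogic 2048M 1536M sleep   59    0   1:23:45 5.4% java/152
  2346 weblogic 1024M  768M sleep   59    0   0:45:12 2.1% java/98
Total: 87 processes, 512 lwps, load averages: 1.25, 1.40, 1.37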


'''
Created on Feb 24, 2014

@author: Ouray Viney

Description: This script should be used to post-process prstat output files.
This should work for any format of prstat output.

'''
import csv
import re
from os import listdir
from os.path import isfile, join

# Let's create a function that reads in a directory
# and creates a list of files to be processed.
def list_files(prstat_dir):
    onlyfiles = [f for f in listdir(prstat_dir) if isfile(join(prstat_dir, f))]
    return onlyfiles

def filter_files(list_filter, list_to_filter):
    r = re.compile(list_filter)
    filtered_list = filter(r.match, list_to_filter)
    return filtered_list

# Build a list of prstat files to process.
raw_prstat_files_dir = "C:\\tmp\\prstat_logs\\"
list_of_files = list_files(raw_prstat_files_dir)
# Only keep the raw prstat logs; anything already post-processed is skipped.
list_filter = "prstat.*\.log$"
filter_list_of_files = filter_files(list_filter, list_of_files)
print filter_list_of_files

bucket = []

# Open each raw prstat file for reading.
for prstat_file in filter_list_of_files:
    with open(raw_prstat_files_dir + prstat_file, "r") as f:
        # Grab the column headers from the first line.
        first_line = f.readline().split()
        # Iterate over the remaining lines and keep the ones we want.
        for line in f:
            # Skip the extra headers and summary lines printed throughout the file.
            if "Total:" not in line and "PID" not in line:
                # Only trap the java processes.
                if "java" in line:
                    # The characters enclosed in brackets constitute a character
                    # class: any '%', 'M' or 'K' in 'line' is replaced with an
                    # empty string. This makes it easier when we need to plot
                    # the data.
                    line = re.sub('[%MK]', '', line)
                    bucket.append(line.strip())

    # Write the filtered rows out as a CSV next to the raw file.
    with open("%s%s_postprocessed.csv" % (raw_prstat_files_dir, prstat_file), 'wb') as fp:
        csvwriter = csv.writer(fp, dialect='excel')
        # Write headers
        csvwriter.writerow(first_line)

        # Write the rest to the file
        for item in bucket:
            csvwriter.writerow(item.split())

    # Clear the bucket before processing the next file.
    bucket = []
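
Before moving on to the graphs, a quick sanity check that one of the post-processed files loads cleanly into pandas doesn’t hurt.  The file name below is just an example of what the script produces:

import pandas as pd

# Load one post-processed file and eyeball the first few rows: SIZE and RSS
# should now be plain numbers (M/K suffixes stripped) and CPU should be a
# plain number with the % sign removed.
df = pd.read_csv("C:\\tmp\\prstat_logs\\prstat-20140224-120000.log_postprocessed.csv")
print df.head()
print df.dtypes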

Graphing the post-processed prstat data

Here is the Python code that I wrote to graph the post-processed prstat data files.


'''
Created on Feb 13, 2015

@author: oviney
'''

import matplotlib.pyplot as plt
import pandas as pd
import re
from os import listdir
from os.path import isfile, join

# Let's create a function that reads in a directory
# and creates a list of files to be processed.
def list_files(prstat_dir):
    onlyfiles = [f for f in listdir(prstat_dir) if isfile(join(prstat_dir, f))]
    return onlyfiles

def filter_files(list_filter, list_to_filter):
    r = re.compile(list_filter)
    filtered_list = filter(r.match, list_to_filter)
    return filtered_list

def graph(post_process_prstat_file, output_graph_dir, raw_prstat_files_dir):

    # Read in the post-processed csv file.
    prstatData = pd.read_csv(raw_prstat_files_dir + post_process_prstat_file)

    # Regex pattern match the name of the server from the file name
    # ("static_string" is a placeholder for your real host naming convention).
    hostname = re.search("static_string\d+", post_process_prstat_file)
    hostname = hostname.group(0)

    # Group the data by PID so each Java process gets its own series.
    pidGroups = prstatData.groupby("PID")

    ############################################################
    # Create a chart for each desired column.

    # The columns that we want to graph, along with some chart metadata.
    desired_columns = {
        "SIZE": {
            "chart_title": "Weblogic VM 'Memory Usage' for host [%s]" % hostname,
            "ylabel": "Total virtual memory (used by Java process) MB",
        },
        "RSS": {
            "chart_title": "Weblogic VM 'resident set size' for host [%s]" % hostname,
            "ylabel": "Resident set size (used by Java process) MB",
        },
        "CPU": {
            "chart_title": "Weblogic VM 'CPU usage' for host [%s]" % hostname,
            "ylabel": "CPU % (used by Java process)",
        },
    }
    # Loop over the desired columns.
    for desired_column, metadata in desired_columns.iteritems():
        # Create a new figure for each desired column that we want to plot.
        plt.figure()
        # Iterate over the pids and each associated dataframe.
        for pid, df in pidGroups:
            plt.plot(df[desired_column], label="Java PID: %s" % pid)

        # Chart formatting and styles; the legend is placed outside the plot area.
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.title(metadata["chart_title"])
        plt.ylabel(metadata["ylabel"])
        plt.xlabel('sample every 10 seconds')
        plt.savefig("%s%s_%s.png" % (output_graph_dir, post_process_prstat_file, desired_column),
                    bbox_inches='tight')

        # Clean up the figure from memory.
        plt.clf()
        plt.cla()
        plt.close()

# Build a list of post-processed prstat files to process.
raw_prstat_files_dir = "C:\\tmp\\prstat_postprocessed_logs\\"
list_of_files = list_files(raw_prstat_files_dir)
list_filter = ".*_postprocessed\.csv$"  # let's filter post-processed files only
filter_list_of_files = filter_files(list_filter, list_of_files)
output_graph_dir = raw_prstat_files_dir

# Loop over each post-processed prstat file and generate
# the desired prstat graphs.
for prstat_post_processed_file in filter_list_of_files:
    print "Graphing the following file: %s" % prstat_post_processed_file
    graph(prstat_post_processed_file, output_graph_dir, raw_prstat_files_dir)
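
As a small extension (not part of the script above), the same per-PID grouping can also be turned into a numeric summary, which is handy to include in a test report next to the graphs.  The file name is again just an example:

import pandas as pd

# Example only: summarise the CPU column per Java PID from one post-processed file.
prstatData = pd.read_csv("C:\\tmp\\prstat_postprocessed_logs\\prstat-20140224-120000.log_postprocessed.csv")
print prstatData.groupby("PID")["CPU"].describe()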

Here is a sample CPU graph:

[Figure: Weblogic VM CPU usage]
