Graphing verbose GC logs using Python and Matplotlib

I was recently working on a project that required some resource automation. Since the target application server was running on Java, I wanted to provide Java Heap statistics. I didn’t have access to instrument the JVM, so I was forced to use the verbose GC logs generated by the application server.

One challenge was that the GC logs don’t logrotate, so all the data in each file represents all GC events since the VM was started. Something to keep in mind when you need to figure out timescales.

In most projects in the past, I used commercial or open source tools to graph the verbose GC logs. This time there was a need to automate it.

I did some research on what Python libraries I should use, most articles pointed towards Python matplotlib for the graphing and pandas for data analysis.

My thoughts were I would match each line, extracting fragments (metrics) into budgets.  Each bucket would be a data set or series (excel graphing term).  I wrote a basic directory parsing function to reading in a directory of files.  Read in each file into memory, parse each line extracting incremental and full GC metrics.

There are two patterns we have to look for:

2203.169: [GC 1859411K->1063883K(2001856K), 0.7195652 secs]
2203.889: [Full GC 1063883K->634282K(2001856K), 15.9175307 secs]

We’ll be extracting the following key metrics:

tsoffset, usedBefore, usedAfter, elapsed, heapSize, isFullGc

Once we have read to the end of each file, a graph will be produced.  We want to draw a graph of the GC behavior we are interested in. There are three line graphs and one series of points.

    – Heap in use

    – Total JVM heap size

    – Times when full GC happens (blue dots on X-axis)

    – The Y-axis (for memory usage) numbers are shown in MB

    – The X-axis (for time) is plotted in minutes since start

    – The second Y-axis is for GC response times is plotted in seconds

 Sample output after passing the buckets of data to matplot.

Verbose GC graph

Verbose GC graph

Here is the sample Python code. Not pretty or cleaned up but worth sharing to help someone else with a similar need.

'''
Created on Feb 17, 2015

@author: oviney
'''

#!/usr/bin/python
import os
import sys
import re
import time
from stat import *
from pylab import *
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt

def parse(line):

    '''
    Parses an input line from gc log file into a set of tokens and returns them.
    There are two patterns we have to look for:
      2203.169: [GC 1859411K->1063883K(2001856K), 0.7195652 secs]
      2203.889: [Full GC 1063883K->634282K(2001856K), 15.9175307 secs]
    '''
    pgre = re.compile("(^\d+\.\d+):\ \[GC\ (\d+)K->(\d+)K\((\d+)K\),\s(\d+\.\d+)\ secs\]")
    fre = re.compile("(^\d+\.\d+):\ \[Full\ GC\ (\d+)K->(\d+)K\((\d+)K\),\s(\d+\.\d+)\ secs\]")
    # First try matching with the partial GC pattern pgre
    isFullGc = False
    # clean up log entries like
    # 162399.256: [GC-- 2025577K->2035237K(2042688K), 1.6687504 secs]
    if "[GC--" in line:
        print "Found odd entry in gc log."
        print line
        print "Fixing line to avoid regex issues."
        line = line.replace("[GC--", "[GC")
        print line

    mo = pgre.match(line)
    # Then match with the full GC pattern
    if (mo == None):
        mo = fre.match(line)
        isFullGc = True
    #print line
    return float(mo.group(1)), int(mo.group(2)), int(mo.group(3)), float(mo.group(5)), int(mo.group(4)), isFullGc

def drawGraph(mergedBeforeAfterGCTimeStamps, mergedBeforeAfterGCMem,
              heapSizes, gcDurations, timeStampsBeforeGc, timeStampsForFullGc, fullGcIndicators,
              logStartTS, logEndTS, output, gc_log_dir, gc_log_file):

    '''
    Draws a graph of the GC behavior we are interested in. There are three
    line graphs and one series of points.
    - Heap in use
    - Total JVM heap size
    - Times when full GC happens (blue dots on X-axis)
    - The Y-axis (for memory usage) numbers are shown in MB
    - The X-axis (for time) is plotted in minutes since start
    - The second Y-axis is for GC response times is plotted in seconds
    '''

    # lets plot the two lists as one line giving
    # us the traditional GC line patterns
    # we have come to expect from most GC viewers.
#    merged_list_xvals = x1vals + x2vals
#    merged_list_yvals = y1vals + y2vals

    fig, ax1 = plt.subplots()
    plt.xlabel("Time (minutes)")
    plt.ylabel("Heap (Mb)")
    plt.title("GC Log")
    plt.title("GC Log for server: %s" % (gc_log_file.replace("-gc.log", "")))

    ax2 = ax1.twinx()

    # Heap in use graph over time
    ax1.plot(mergedBeforeAfterGCTimeStamps, mergedBeforeAfterGCMem, 'b', label='used heap')
    # Total heap size over time
    ax1.plot(timeStampsBeforeGc, heapSizes, 'y', label='total JVM heap size')
    # Full GC over time
    ax1.plot(timeStampsForFullGc, fullGcIndicators, 'bo', label='times when full gc')

#    ax1.set_xlabel('Time (minutes)')
#    ax1.set_ylabel('Heap (MB)', color='g')
    ax2.set_ylabel('Time (seconds)', color='b')

    # Plot the time of each GC
    ax2.plot(timeStampsBeforeGc, gcDurations, 'g', label='GC times')

    # Add the legend
    # ask matplotlib for the plotted objects and their labels
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
#    ax2.legend(lines + lines2, labels + labels2, loc='lower center', shadow=True)
    ax2.legend(lines + lines2, labels + labels2, loc='best', shadow=True)
    plt.savefig("%s/%s" % (gc_log_dir, output))
    plt.close()

def convertISOToUnixTS(isots):
    '''
    Takes a timestamp (supplied from the command line) in ISO format, ie
    yyyy-MM-dd HH:mm and converts it to seconds since the epoch.
    '''
    isore = re.compile("(\d{4})-(\d{2})-(\d{2})\s(\d{2}):(\d{2})")
    mo = isore.match(isots)
    return time.mktime([int(mo.group(1)), int(mo.group(2)), int(mo.group(3)), int(mo.group(4)), int(mo.group(5)), 0, 0, 0, -1])

def baseTimeStamp(gc_log_file, gc_log_dir):
    '''
    Since the timestamps in the gc.log file are probably in seconds since
    server startup, we want to get an indication of the time the first log
    line was written. We do this by getting the ctime of the gc.log file.
    '''
#    print os.lstat("%s/%s" % (gc_log_dir, gc_log_file))
    # This code is buggy on Linux/Unix since it doesn't return the original file creation time
    # therefore can't be trusted to provide the start of the GC log.
    return os.lstat("%s/%s" % (gc_log_dir, gc_log_file))[ST_CTIME]

def minutesElapsed(currentTS, baseTS):
    '''
    Convert the timestamp (in seconds since JVM startup) to mins elapsed
    since first timestamp entry.
    '''
    return (currentTS - baseTS) / 60

def timeString(ts):
    '''
    Return printable version of time represented by seconds since epoch
    '''
    return time.strftime("%Y-%m-%d %H:%M", time.localtime(ts))

def main(gc_log_file, gc_log_dir):

    input_gc_file = gc_log_file
    output = "%s.png" % gc_log_file

    sliceLogFile = False
    startTime = 0
    endTime = 0

    # The base time is the ctime for the log file
    baseTS = baseTimeStamp(input_gc_file, gc_log_dir)
    # initialize local variables
    mergedBeforeAfterGCMem = []
    timeStampsBeforeGc = []
    usedBeforeGc = []
    timeStampsAfterGc = []
    usedAfterGc = []
    heapSizes = []
    timeStampsForFullGc = []
    fullGcIndicators = []
    gcStartTS = -1
    gcEndTS = -1
    mergedBeforeAfterGCTimeStamps = []
    gcDurations = []

    # read input_gc_file and parse line by line
    fin = open("%s/%s" % (gc_log_dir, gc_log_file), 'r')
    while (True):
        line = fin.readline()
        if (line == ""):
            break
        (tsoffset, usedBefore, usedAfter, elapsed, heapSize, isFullGc) = parse(line.rstrip())
        # Set the first timestamp once for the very first record, and keep
        # updating the last timestamp until we run out of lines to read
        if (gcStartTS == -1):
            gcStartTS = tsoffset
        gcEndTS = tsoffset
        # If start and end times are specified, then we should ignore data
        # that are outside the range
        if (sliceLogFile):
            actualTime = baseTS - gcStartTS + tsoffset
            if (actualTime < startTime or actualTime > endTime):
                continue
        # X and Y arrays for before GC line, X will need postprocessing
        timeStampsBeforeGc.append(tsoffset)
        usedBeforeGc.append(usedBefore / 1024)
        # X and Y arrays for after GC line, X will need postprocessing
        timeStampsAfterGc.append(tsoffset + elapsed)
        usedAfterGc.append(usedAfter / 1024)

        # Let's create a bucket for before and after
        mergedBeforeAfterGCMem.append(usedBefore / 1024)
        mergedBeforeAfterGCMem.append(usedAfter / 1024)
        mergedBeforeAfterGCTimeStamps.append(tsoffset)
        mergedBeforeAfterGCTimeStamps.append(tsoffset + elapsed)

        # Let's create the bucket for the gc times
        gcDurations.append(elapsed)

        # Y array for heap size (use minOffSetBeforeGC for X), will use
        # Y axis for after GC line
        heapSizes.append(heapSize / 1024)
        # X and Y arrays for Full GC line, X will need postprocessing
        if (isFullGc):
            timeStampsForFullGc.append(tsoffset)
            fullGcIndicators.append(1)
    fin.close()
    # Convert log start and end time stamps to printable format
    if (sliceLogFile):
        logStartTS = sys.argv[3]
        logEndTS = sys.argv[4]
    else:
#        print baseTS
        logStartTS = timeString(baseTS)
        logEndTS = timeString(baseTS + gcEndTS - gcStartTS)
#        print logStartTS, logEndTS
    # convert timestamps from seconds since JVM startup to minutes elapsed
    # since first timestamp entry
    startTime = timeStampsBeforeGc[0]
    for i in range(len(timeStampsBeforeGc)):
        timeStampsBeforeGc[i] = minutesElapsed(timeStampsBeforeGc[i], startTime)
        timeStampsAfterGc[i] = minutesElapsed(timeStampsAfterGc[i], startTime)
    for i in range(len(timeStampsForFullGc)):
        timeStampsForFullGc[i] = minutesElapsed(timeStampsForFullGc[i], startTime)
    for i in range(len(mergedBeforeAfterGCTimeStamps)):
        mergedBeforeAfterGCTimeStamps[i] = minutesElapsed(mergedBeforeAfterGCTimeStamps[i], startTime)
    # Send off to graph results
    drawGraph(mergedBeforeAfterGCTimeStamps, mergedBeforeAfterGCMem,
              heapSizes, gcDurations, timeStampsBeforeGc,
              timeStampsForFullGc, fullGcIndicators,
              logStartTS, logEndTS, output, gc_log_dir, gc_log_file)

# Let's create a function that reads in a directory
# and creates a list of files to be processed.
def list_files(prstat_dir):
    onlyfiles = [ f for f in listdir(prstat_dir) if isfile(join(prstat_dir, f)) ]
    return onlyfiles

def filter_files(list_filter, list_to_filter):
#    filtered_list = filter(filter, list_to_filter)
    r = re.compile(list_filter)
    filtered_list = filter(r.match, list_to_filter)
    return filtered_list

# Build a list of prstat files to process.
gc_log_dir = r"C:\tmp\verbose_gc_logs/"
list_of_files = list_files(gc_log_dir)
list_filter = ".*.-gc.log$" # let's filter out processed files
filter_list_of_files = filter_files(list_filter, list_of_files)

if __name__ == "__main__":
    for gc_file in filter_list_of_files:
        main(gc_file, gc_log_dir)

You must be logged in to post a comment.

Proudly powered by WordPress   Premium Style Theme by www.gopiplus.com
WordPress Appliance - Powered by TurnKey Linux