Combining SQL TRACE & SYSTEMTAP Part 2: No more Unaccounted-for Time due to time spent on CPU run queue

I my previous post i showed how we can eliminate one of the causes for Unaccounted-for Time,which is CPU double-counting, from SQL trace file using systemtap. But we can do more,The other important causes of missing data in an Extended SQL trace file is “Time Spent Not Executing” (Cary Millsap) which is time spent on CPU run queue.So how to measure it ?

Here is an excerpt of what we are going to achieve :

Old trace file :

Capture 12

New trace file  showing cpu consumption inside wait event and time spent on CPU run queue :

Capture 11

Test database :
Oracle 12.1.0.2.6 with THREADED_EXECUTION set to FALSE

Step 1 :

For the wait events we will add instrumentation code to the two functions “kslwtbctx” and “kslwtectx” marking the begin and end of a wait event like we did in the later blog post.But what about db call ? which opi call to instrument ? (i will only treat parse,exec and fetch)

Here is some call graph i used on my investigation :

PARSE :

Capture 2

Capture 4

EXEC :

Capture 3

FETCH :

Capture 1

After some diagnostics using gdb and systemtap it seems that :

  • opifch2  -> FETCH
  • opiexe -> EXEC
  • opiosq0 -> PARSE

And here is the script i used for displaying the call graph (Duration is not correctly calculated for the cases including recursive calls.Sorry too busy :D) :

global begin_time,begin_time_write

probe process("oracle").function("opi*").call
         {
        if (pid() == target()) {

     begin_time[ppfunc()] = gettimeofday_us()
     printf("%s%s%s \n",
         thread_indent (1),
         "->",
         ppfunc ()
         )
   }
}

probe process("oracle").function("opi*").return
         {
        if (pid() == target()) {

     end_time = gettimeofday_us() - begin_time[ppfunc()]
     printf("%s%s%s Duration : %d \n",
         thread_indent (-1),
         "<-",          ppfunc (),end_time          )    } } probe nd_syscall.write.call {  if (pid() == target()) {   begin_time_write = gettimeofday_us()  printf("%s->write args :%s \n",thread_indent (1),argstr)
}
}

probe nd_syscall.write.return {
 if (pid() == target()) {
  end_time_write = gettimeofday_us() - begin_time_write
 printf("%s<-write Duration : %d \n",thread_indent (-1),end_time_write)
}
}

Based on the script developed in my previous post lets add the new section for measuring the time spent on the CPU Run queue. (This script contains excerpt of code developed by : Jason Baron  and  Josh Stone   https://sourceware.org/systemtap/examples/process/schedtimes.stp)
Note : As you have seen on the call graph if we base our instrumentation on the preceding opi calls the the write to trace file will also be included on the calculation.In fact this script can be improved in many ways,it’s just an example of what we can achieve with systemtap in a very short time.

global DEAD=-1, RUNNING=1, QUEUED=2, SLEEPING=3
global wait_ev_queued_time,opiexe_queued_time,opifch2_queued_time,opiosq0_queued_time
global pid_state
global previous_timestamp=0
global cputime_begin=-1
global in_wait_event
global parse_recu_level=0
global exec_recu_level=0
global fech_recu_level=0

function __cputime:long()
%{ /* pure */
   STAP_RETVALUE = ((current->utime + current->stime)* (1000000 / HZ));
%}

probe process("oracle").function("kslwtbctx") {
 if (pid() == target()) {
   in_wait_event = 1
   wait_ev_queued_time = 0
   cputime_begin =  __cputime()
}
}

probe process("oracle").function("kslwtectx") {
 if (pid() == target()) {
   in_wait_event = 0
   cputime_end =  __cputime()
   if(cputime_begin != -1) {
      cpu_consumed = cputime_end - cputime_begin
      printf ("c=%d q=%d\n",cpu_consumed,wait_ev_queued_time)
    }
    else
    {
      printf ("c=0 q=0\n");
    }
}
}

probe process("oracle").function("opiexe").call {
 if (pid() == target()) {
	 opiexe_queued_time[exec_recu_level] = 0
	 exec_recu_level = exec_recu_level + 1
}
}

probe process("oracle").function("opiexe").return {
 if (pid() == target()) {
    if(exec_recu_level > 0 ) {
	 exec_recu_level = exec_recu_level - 1
	 printf ("q=%d\n",opiexe_queued_time[exec_recu_level])
	} else {
         printf ("q=0\n");
	}
}
}

probe process("oracle").function("opifch2").call {
 if (pid() == target()) {
	 opifch2_queued_time[fech_recu_level] = 0
	 fech_recu_level = fech_recu_level + 1
}
}

probe process("oracle").function("opifch2").return {
 if (pid() == target()) {
    if(fech_recu_level > 0 ) {
	 fech_recu_level = fech_recu_level - 1
	 printf ("q=%d\n",opifch2_queued_time[fech_recu_level])
	} else {
         printf ("q=0\n");
	}
}
}

probe process("oracle").function("opiosq0").call {
 if (pid() == target()) {
	 opiosq0_queued_time[parse_recu_level] = 0
	 parse_recu_level = parse_recu_level + 1
}
}

probe process("oracle").function("opiosq0").return {
 if (pid() == target()) {
    if(parse_recu_level > 0 ) {
	 parse_recu_level = parse_recu_level - 1
	 printf ("q=%d\n",opiosq0_queued_time[parse_recu_level])
	} else {
         printf ("q=0\n");
	}
}
}

function timestamp()
{
  return gettimeofday_us()
}

function update_times(now)
{
  if (previous_timestamp != 0) {
  delta = now - previous_timestamp
  previous_timestamp = now

  if ( pid_state == QUEUED ) {
         if (in_wait_event == 1) {
            wait_ev_queued_time += delta
	 }
	 if (exec_recu_level != 0 ) {
		for(i=0;i<exec_recu_level;i+=1) {
			opiexe_queued_time[i] += delta
		}
	 }
	  if (fech_recu_level != 0 ) {
		for(i=0;i<fech_recu_level;i+=1) {
			opifch2_queued_time[i] += delta
		}
	 }
	  if (parse_recu_level != 0 ) {
		for(i=0;i<parse_recu_level;i+=1) { 			opiosq0_queued_time[i] += delta 		} 	 }   }   } else {     previous_timestamp = now   } } probe scheduler.ctxswitch {   // Task $prev is scheduled off this cpu   if ($prev->pid == target()) {

    state = $prev->state
    update_times(timestamp())

    if (state > 0) {
      pid_state = SLEEPING
    } else if (state == 0) {
      pid_state = QUEUED
    } else {
      pid_state = DEAD
    }
  }

  // Task $next is scheduled onto this cpu
  if ($next->pid == target() ) {
    update_times(timestamp())
    pid_state = RUNNING
  }
}

probe scheduler.wakeup
{
  // Task $p is awakened
  if ($p->pid == target()) {
    update_times(timestamp())
    pid_state = QUEUED
  }
}

Now launch the script.

Capture 7

Step 2 :

The second and final thing to do is to combine the two trace files generated by systemtap and SQL trace.

#!/bin/sh
trace_filename="$1"
stap_filename="$2"
i=1
j=1

while read -r line
do
    if [[ $line == "WAIT"* ]]
    then
     echo "$line" | sed  "s/ela=/`grep  c= $stap_filename | sed -n $i'p' | cat` ela=/g"
     i=$((i+1))
    elif [[ $line == "PARSE"* || $line == "EXEC"* || $line == "FETCH"* ]]
	then
	  echo "$line" | sed  "s/c=/`grep -v c= $stap_filename | sed -n $j'p'`,c=/g"
      j=$((j+1))
	else
     echo "$line"
    fi
done < "$trace_filename"

Now launch the combine script.

capture-4

You can do a lot more with systemtap !

That’s it and have fun 😀

One thought on “Combining SQL TRACE & SYSTEMTAP Part 2: No more Unaccounted-for Time due to time spent on CPU run queue

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s