adjusted.boxplot <- function(x,a=-4,b=3,style="single",boxcol=3,ylab="Values",title=" ") { ################################################################################### # The ADJUSTED BOXPLOT draws the skewness-adjusted boxplot for data sampled from a continuous, # unimodal, univariate data set. # Typical for this boxplot are its skewness-adjusted whiskers, which are based on # the medcouple, a robust measure of skewness. At skewed data, the standard boxplot # typically marks too many regular observations as outliers. The adjusted boxplot on the other # hand makes a better distinction between regular observations and real outliers. # # The ADJUSTED BOXPLOT is constructed as follows: # - A line is put at the height of the sample median. # - The box is drawn from the first to the third quartile. # - At right skewed data (MC >= 0), all points outside the interval # [Q1 - 1.5*e^(a*MC)*IQR ; Q3 + 1.5*e^(b*MC)*IQR] are marked as outliers, # where Q1 and Q3 denote the first and third quartile respectively, # IQR stands for the interquartile range and MC is an abbreviation # for the medcouple. # At left skewed data (MC < 0),the interval [Q1 - 1.5*e^(-b*MC)*IQR ; Q3 + 1.5*e^(-a*MC)*IQR] # is used, because of symmetry reasons. # - Finally, the whiskers are drawn, going from the ends of the box to the # most remote points that are no outliers. # Note that the standard boxplot has the same box, but other whiskers. In that case, # all points outside the interval [Q1 - 1.5*IQR ; Q3 + 1.5*IQR] are marked as outliers. # The adjusted boxplot is thus similar to the standard boxplot at symmetric distributions # where MC=0. # # The medcouple is described in: # Brys, G., Hubert, M., Struyf, A. (2004), "A robust measure of # skewness", Journal of Computational and Graphical Statistics, 13, 996-1017. # # The skewness-adjusted boxplot is described in: # Hubert, M. and Vandervieren, E. (2006), # "An adjusted boxplot for skewed distributions", # submitted for publication, available at wis.kuleuven.be/stat/robust.html. # # Required input argument: # x : a univariate data vector. Missing values (NaN's) and infinite values # (Inf's) are allowed, since observations (rows) with missing or infinite # values will automatically be excluded from the computations. # # Optional input arguments: # a,b : numbers, defining the outlier cutoffs from the adjusted boxplot. # The default is to use a = -4, b = 3. # This means that for right skewed data the interval # [Q1 - 1.5*e^(-4*MC)*IQR ; Q3 + 1.5*e^(3*MC)*IQR] and # for left skewed data the interval # [Q1 - 1.5*e^(-3*MC)*IQR ; Q3 + 1.5*e^(4*MC)*IQR] is used. # style : 'string', name of the style that has to be used. Four different styles are available: # - "single" : gives one plot with the adjusted boxplot. # - "separate" : generates 2 plots. The adjusted boxplot is drawn on the first # graph and the standard boxplot on the second one. # - "imposed" : shows the adjusted boxplot, with dotted lines at the height of # the standard whiskers superimposed. # - "both" : gives one plot with the adjusted boxplot and the standard boxplot # next to each other. # The default is to use the "single" style. # boxcol : number, filled box color. The box is filled with the indicated color. # A color of 0 can be used to designate filling with the background color. # A specification of boxcol=-1 is used to designate "no fill" at all. # The default is to fill with color 3. # ylab : 'string', label for plotting next to the y-axis. The default is to use 'Values' as string. # title : 'string', main title. # # This function calls the 'mc' function which in turn calls a C-routine for fast # computation of the medcouple. It is therefore required to load the 'mc' function # as well. This can be done as follows (if 'mc.ssc' and 'medcouple.dll' are in the working chapter # directory): # > source("mc.ssc") # > dyn.open("medcouple.dll") # # If this does not work well, the 'mc' function can be replaced with 'mcplus' which does # not use C-code but which is much more time-consuming. # To work with 'mcplus', one must replace 'medc <- mc(x)' by 'medc <- mcplus(x)'. # Afterwards, 'mcplus' can be loaded as follows (if 'mcplus.ssc' is in the working chapter directory): # > source("mcplus.ssc") # # # Examples : # adjusted.boxplot(lottery.payoff) # adjusted.boxplot(lottery.payoff,style="separate",boxcol=2) # adjusted.boxplot(lottery.payoff,boxcol=4,style="imposed",ylab="Lottery values") # adjusted.boxplot(lottery.payoff,style="both",ylab="Lottery values",boxcol=6,title="Lottery.payoff") # # Written by Ellen Vandervieren, Mia Hubert # Created on : 19/04/2005 # Last Update: 21/07/2005 #################################################################################### # Observations with missing or infinite values are omitted. x <- x[is.finite(x)==T] # Internal variables. q <- quantile(x, c(0.25, 0.5, 0.75)) med <- q[2] iqr <- q[3] - q[1] medc <- mc(x) std.cutoff.low <- q[1] - 1.5 * iqr std.cutoff.high <- q[3] + 1.5 * iqr if (medc>=0) { # Right skewed data. adj.cutoff.low <- q[1] - 1.5 * exp(a * medc) * iqr adj.cutoff.high <- q[3] + 1.5 * exp(b * medc) * iqr } else { # Left skewed data. adj.cutoff.low <- q[1] - 1.5 * exp((-1) * b * medc) * iqr adj.cutoff.high <- q[3] + 1.5 * exp((-1) * a * medc) * iqr } eps <- 0.070000000000000007 * iqr # Looking for outliers. std.outliers <- x[(x < std.cutoff.low) | (x > std.cutoff.high)] adj.outliers <- x[(x < adj.cutoff.low) | (x > adj.cutoff.high)] if (style=="both") { std.xloc <- 1.25*eps adj.xloc <- std.xloc + 2.5*eps xmin <- 0 xmax <- 5*eps ymin <- min(x)-10*eps ymax <- max(x)+eps plot(c(adj.xloc-eps/4,adj.xloc+eps/4),c(med,med),xlab=" ",ylab=ylab,xaxt="n",xlim=c(xmin,xmax),ylim=c(ymin,ymax),pch=22,lwd=2,lab=c(0,5,7)) mtext("Standard boxplot Adjusted boxplot",side=1,line=1) # Warning if medcouple is large. if (medc>0.5) { text(adj.xloc-eps/6,min(x)-8.5*eps,"MC =") text(adj.xloc+eps/5,min(x)-8.5*eps,charToString(round(medc,3))) } # Drawing the outliers of the standard boxplot. for (i in 1:length(std.outliers)) { lines(c(std.xloc-eps/6.5,std.xloc+eps/6.5),c(std.outliers[i],std.outliers[i]),type="l") } # Drawing the whiskers of the standard boxplot. lines(c(std.xloc,std.xloc),c(q[3],max(x[x<=std.cutoff.high])),lty=2) lines(c(std.xloc,std.xloc),c(q[1],min(x[x>=std.cutoff.low])),lty=2) lines(c(std.xloc-eps/3,std.xloc+eps/3),c(max(x[x<=std.cutoff.high]),max(x[x<=std.cutoff.high]))) lines(c(std.xloc-eps/3,std.xloc+eps/3),c(min(x[x>=std.cutoff.low]),min(x[x>=std.cutoff.low]))) # Drawing the standard box. polygon(c(std.xloc-eps/2,std.xloc+eps/2,std.xloc+eps/2,std.xloc-eps/2),c(q[1],q[1],q[3],q[3]),col=boxcol,border=1,lwd=2) # Drawing the median of the standard boxplot. lines(c(std.xloc-eps/2,std.xloc+eps/2),c(med,med),col=0,lwd=3) title(main=title) } else { adj.xloc <- 1 xmin <- 1-1.5*eps xmax <- 1+1.5*eps ymin <- min(x)-7*eps ymax <- max(x)+eps plot(c(adj.xloc-eps/4,adj.xloc+eps/4),c(med,med),xlab="",ylab=ylab,xaxt="n",xlim=c(xmin,xmax),ylim=c(ymin,ymax),pch=22,lwd=2,lab=c(0,5,7)) # Warning if medcouple is large. if (medc>0.5) { text(adj.xloc-eps/6,min(x)-5.5*eps,"MC =") text(adj.xloc+eps/5,min(x)-5.5*eps,charToString(round(medc,3))) } if (style=="single") { title(main=title) mtext(" Adjusted boxplot ",side=1,line=1) } if (style=="imposed") { # Drawing dotted lines at the height of the standard whiskers. lines(c(adj.xloc-eps/3,adj.xloc+eps/3),c(max(x[x<=std.cutoff.high]),max(x[x<=std.cutoff.high])),lwd=2,lty=3) lines(c(adj.xloc-eps/3,adj.xloc+eps/3),c(min(x[x>=std.cutoff.low]),min(x[x>=std.cutoff.low])),lwd=2,lty=3) # Illustrating the difference between the standard and the adjusted boxplot. if (medc>=0) { xtra.outliers <- std.outliers[(std.outliers<=adj.cutoff.high)&(std.outliers>=q[3])] } else { xtra.outliers <- std.outliers[(std.outliers>=adj.cutoff.low)&(std.outliers<=q[1])] } if (length(xtra.outliers)!=0) { for (k in 1:length(xtra.outliers)) { points(adj.xloc,xtra.outliers[k],type="p",pch=16) } } title(main=title) mtext(" Adjusted ( ____ ) and Standard ( _ . _ . _ ) boxplot ",side=1,line=1) } } # Plotting the outliers of the adjusted boxplot. for (i in 1:length(adj.outliers)) { lines(c(adj.xloc-eps/6.5,adj.xloc+eps/6.5),c(adj.outliers[i],adj.outliers[i]),type="l") } # Drawing the adjusted whiskers. lines(c(adj.xloc,adj.xloc),c(q[3],max(x[x<=adj.cutoff.high])),lty=2) lines(c(adj.xloc,adj.xloc),c(q[1],min(x[x>=adj.cutoff.low])),lty=2) lines(c(adj.xloc-eps/3,adj.xloc+eps/3),c(max(x[x<=adj.cutoff.high]),max(x[x<=adj.cutoff.high]))) lines(c(adj.xloc-eps/3,adj.xloc+eps/3),c(min(x[x>=adj.cutoff.low]),min(x[x>=adj.cutoff.low]))) # Drawing the adjusted box. polygon(c(adj.xloc-eps/2,adj.xloc+eps/2,adj.xloc+eps/2,adj.xloc-eps/2),c(q[1],q[1],q[3],q[3]),col=boxcol,border=1,lwd=2) # Drawing the median of the adjusted boxplot. lines(c(adj.xloc-eps/2,adj.xloc+eps/2),c(med,med),col=0,lwd=3) title(main=title) if (style=="separate") { mtext(" Adjusted boxplot ",side=1,line=1) # Generating an extra plot with the standard boxplot. std.xloc <- 1 plot(c(std.xloc-eps/4,std.xloc+eps/4),c(med,med),xlab=" ",ylab=ylab,xaxt="n",xlim=c(xmin,xmax),ylim=c(ymin,ymax),pch=22,lwd=2,lab=c(0,5,7)) # Drawing the outliers of the standard boxplot. for (i in 1:length(std.outliers)) { lines(c(std.xloc-eps/6.5,std.xloc+eps/6.5),c(std.outliers[i],std.outliers[i]),type="l") } # Drawing the whiskers of the standard boxplot. lines(c(std.xloc,std.xloc),c(q[3],max(x[x<=std.cutoff.high])),lty=2) lines(c(std.xloc,std.xloc),c(q[1],min(x[x>=std.cutoff.low])),lty=2) lines(c(std.xloc-eps/3,std.xloc+eps/3),c(max(x[x<=std.cutoff.high]),max(x[x<=std.cutoff.high]))) lines(c(std.xloc-eps/3,std.xloc+eps/3),c(min(x[x>=std.cutoff.low]),min(x[x>=std.cutoff.low]))) # Drawing the standard box. polygon(c(std.xloc-eps/2,std.xloc+eps/2,std.xloc+eps/2,std.xloc-eps/2),c(q[1],q[1],q[3],q[3]),col=boxcol,border=1,lwd=2) # Drawing the median of the standard boxplot. lines(c(std.xloc-eps/2,std.xloc+eps/2),c(med,med),col=0,lwd=3) title(main=title) mtext(" Standard boxplot ",side=1,line=1) } }