################################################################### ### Hemant Ishwaran ################################################################### ### Plot ROC curve for ordinal categorical data (works for ### arbitrary number of categores, thus applies to "continuous" ### ordinal categorical data) ### ### Originally written for S-PLUS, but now converted to R ### ### Function does: ### ------------- ### -- Plot ROC curve and compute ROC area using trapezoidal rule ### -- Plot partial area under the curve ### -- Plot CDF for diseased and non-diseased groups ### -- Plot stacked histogram for diseased and non-diseased groups ### ### ### Example 1 Degree of suspicion of stage C/D prostate ### --------- cancer on ordinal scale {1,2,...100} ### ### -- true disease status (gold standard information) ### -- value of 1 corresponds to a diseased patient: ### ### group <- c( ### 1,0,1,0,1,1,0,1,1,1,1,0,1,1,0,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0,1,0,1,0, ### 0,0,1,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,0,1,1,0,1,1, ### 1,1,0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,1,1,1,1,1,1) ### ### -- ordinal value for degree of suspicion: ### ### ordinal <- c( ### 99,5,40,20,10,30,5,50,5,25,50,5,60,10,5,5,15,15,5,5,40,60,5,5,20,5,60, ### 5,15,5,5,30,25,5,5,5,5,60,40,5,5,10,10,5,0,40,5,50,5,5,5,70,5,50,0,5, ### 30,10,20,60,5,10,5,10,5,70,10,15,40,10,80,40,5,60,5,60,5,10,10,50,15, ### 5,60,20,70,10,5,30,10,50,10,60,5,40,50,80,5,15,25) ### ### -- generic call: ### ### rocPlot(group,ordinal) ### ### ### Example 2 Randomly generated continuous data: ### --------- ### ### rocPlot(rep(c(0,1),c(50,50)),rnorm(100,rep(c(90,100),c(50,50)),10)) ### ### -------------------------------------------------------------- ### Written by: ### ### Hemant Ishwaran hemant.ishwaran@gmail.com ### Department of QHS / Wb4 phone: 216.444.9932 ### The Cleveland Clinic Foundation fax: 216.636.0585 ### 9500 Euclid Avenue ### Cleveland, OH 44195 ### ### http://www.bio.ri.ccf.org/Resume/Pages/Ishwaran ### ------------------------------------------------------------- ### THIS PROGRAM SHOULD NOT BE COPIED, USED, MODIFIED, OR ### DISSEMINATED IN ANY WAY WITHOUT SPECIFIC WRITTEN PERMISSION ### FROM THE AUTHOR. ################################################################### rocPlot <- function(group, #group label ordinal, #ordinal values nbreaks=15, #nbreaks used in histogram minAtom.plot=25, #parameter for superimposing points on plots xtxt="Ordinal Values",#x-axis label plots.one.page=T, #put all plots on one page? area.only=F, #compute area; skip everything else ...) { group.uniq <- sort(unique(group)) neg <- ordinal[group==group.uniq[1]] pos <- ordinal[group==group.uniq[2]] n.neg <- length(neg) n.pos <- length(pos) range.data <- range(neg, pos) ### Find all distinct atoms generated by pos and neg test values atoms <- unique(sort(c(neg,pos))) atoms.neg <- unique(sort(neg)) atoms.pos <- unique(sort(pos)) n.atoms <- length(atoms) tp <- rep(0,n.atoms) fp <- rep(0,n.atoms) cdf.neg <- rep(0,length(atoms.neg)) cdf.pos <- rep(0,length(atoms.pos)) ### (1) Compute tp and fp values for each distinct atom ### (2) Compute CDF's for (i in 1:n.atoms){ tp[i] <- sum(pos > atoms[i])/n.pos fp[i] <- sum(neg > atoms[i])/n.neg } for (i in 1:length(atoms.neg)){ cdf.neg[i] <- sum(neg <= atoms.neg[i])/n.neg } for (i in 1:length(atoms.pos)){ cdf.pos[i] <- sum(pos <= atoms.pos[i])/n.pos } ### Add end values for tp, fp values and CDF's tp <- c(1,tp) fp <- c(1,fp) delta <- (range.data[2]-range.data[1])/20 atoms.neg <- c(atoms[1]-delta,atoms.neg,atoms[n.atoms]+delta) atoms.pos <- c(atoms[1]-delta,atoms.pos,atoms[n.atoms]+delta) cdf.neg <- c(0,cdf.neg,1) cdf.pos <- c(0,cdf.pos,1) ### (1) Calculate area and partial area under curve using Wilcoxon U-statistic ### (2) Estimate standard error for ROC area tp.mean <- (tp[1:n.atoms] + tp[2:(n.atoms+1)])/2 fp.diff <- -diff(fp) area.U <- sum(fp.diff*tp.mean) PUC.U <- cumsum(fp.diff*tp.mean) area.S <- sqrt((area.U*(1 - area.U)+(n.pos - 1)*(area.U/(2 - area.U) - area.U^2) + (n.neg - 1)*((2*area.U^2)/(1 + area.U) - area.U^2))/(n.pos*n.neg)) if (!area.only) { ### (1) ROC curve ### (2) PUC curve versus test value ### (3) CDF for test values ### (4) Stacked histogram of positive and negative test values. if (plots.one.page) { par(pty="s",mfrow=c(2,2)) main <- c("ROC Curve","Partial ROC Area", "CDF Plot","Histogram of Ordinal Values") } else { par(mfrow=c(1,1)) main <- c("","","","") } if(n.atoms>=minAtom.plot){ plot(fp,tp,type="l",xlim=c(0.0,1.0), ylim=c(0.0,1.0),xlab="False Positive Ratio", ylab="Sensitivity",main=main[1]) text(0.7,0.1,paste('Area=',format(round(area.U,3)), '+/-',format(round(area.S,3)))) } else{ plot(fp,tp,type="b",xlim=c(0.0,1.0), ylim=c(0.0,1.0),xlab="False Positive Ratio", ylab="Sensitivity",main=main[1]) text(0.7,0.1,paste('Area=',format(round(area.U,3)), '+/-',format(round(area.S,3)))) } abline(0,1,lty=3) if(n.atoms>=minAtom.plot){ plot(atoms,PUC.U,type="l", ylim=c(0.0,1.0),xlab=xtxt, ylab="Partial Area",main=main[2]) } else{ plot(atoms,PUC.U,type="b", ylim=c(0.0,1.0),xlab=xtxt, ylab="Partial Area",main=main[2]) } abline(h=area.U,lty=3) if(n.atoms>=minAtom.plot){ plot(atoms.neg,cdf.neg,type="s",lty=1,col=4, xlim=range(atoms.neg,atoms.pos), ylim=range(cdf.neg,cdf.pos), xlab=xtxt, ylab="Cumulative Probabilities", main=main[3]) par(new=T) plot(atoms.pos,cdf.pos,type="s",lty=2,col=2, xlab="",ylab="") } else{ plot(atoms.neg,cdf.neg,type="s",lty=1,col=4, xlim=range(atoms.neg,atoms.pos), ylim=range(cdf.neg,cdf.pos), xlab=xtxt,ylab="Cumulative Probabilities", main=main[3]) rug(atoms) par(new=T) points(atoms.neg,cdf.neg,pch="x") par(new=T) plot(atoms.pos,cdf.pos,type="s",lty=2,col=2, xlab="",ylab="") par(new=T) points(atoms.pos,cdf.pos,type="p",pch="o") } ### Mimic a histogram using barplot nbreaks <- min(nbreaks,max(n.atoms,range.data[2]-range.data[1])) data.bks <- seq(range.data[1]-.001,range.data[2]+.001, length=nbreaks+2) temp.neg <- hist(neg,breaks=data.bks,plot=F) pct.neg <- temp.neg$count temp.pos <- hist(pos,breaks=data.bks,plot=F) pct.pos <- temp.pos$count barplot(rbind(pct.pos,pct.neg), beside=F, yaxt="n",srt=90, space=0,col=c(2,4), xlab=xtxt, ylab="Counts", main=main[4]) junk.fit <- lm(seq(0.5, length(data.bks), by=1) ~ data.bks) pretty.x <- coef(junk.fit) %*% t(cbind(1,pretty(data.bks,10))) nice.probs <- pretty(range(c(pct.pos+pct.neg))) axis(1,at=pretty.x,labels=pretty(data.bks,10)) axis(2,at=nice.probs,lab=abs(nice.probs)) ### Output results cat("-----------------------------------------------------------","\n") cat("ROC Analysis","\n") cat("------------","\n") cat("sample size :", n.neg+n.pos,"\n") cat("no. observations from group 1 :", n.neg,"\n") cat("no. observations from group 2 :", n.pos,"\n") cat("area under the curve :", paste(round(area.U,3)), '+/-',round(area.S,3)) cat("\n") } else { return(list( area=area.U, #area under curve tp=tp, #true positive rate fp=fp, #false positive rate unique.values=atoms #unique test values )) } }