###################### Rggobi for a given set of variables (only training data) ########################## ############################## three different sets of variables can be tried ############################ ################################## all data is used for training ######################################### #several SVM runs library(MASS) library(sma) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } hypNormal<-function(sv,coefs){ w<-length(sv[1,]) for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-rep(0,length(x[,1])) for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t2 <- read.table("74x822_12.txt") #elim<-c(3, 5, 12, 13, 20, 21, 24, 40, 46, 50, 56, 63, 67, 69, 71, 74, 1, 2, 9, 10, 16, 34, 37, 38, 45, 48, 51, 54, 55, 60, 61, 64, 66) #t1<- t[-elim,] #normalize data (attributes) y<-apply(t1[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t1[,823]) #select first 4 most important SVM from 40 most imp BW variables - pics 22,25,26 #x.imp <- cbind(x[,c(800,403,535,596)],x[,823]) #dimnames(x.imp)[[2]]<-c("V800","V403","V535","V596","cl") # 1 2 # 1 46 4 # 2 6 18 #select first 4 most important comb 2 svm and two common with BW and PDA - pic 21 x.imp <- cbind(x[,c(800,721,357,596)],x[,823]) dimnames(x.imp)[[2]]<-c("V800","V721","V357","V596","cl") # 1 2 # 1 47 3 # 2 9 15 #select first 4 most important SVM from all 822 - pic 19 #x.imp <- cbind(x[,c(390,389,4,596)],x[,823]) #dimnames(x.imp)[[2]]<-c("V390","V389","V4","V596","cl") # 1 2 # 1 46 4 # 2 8 16 x.imp.row<-nrow(x.imp) x.imp.col<-ncol(x.imp) x.imp<-data.frame(x.imp) dim<-length(x.imp[1,]) names(x.imp)[dim]<-"cl" #generate grid xr<-apply(x.imp[,1:length(x.imp)-1],2,range) grid.len<-10 xg1<-seq(xr[1,1],xr[2,1],len=grid.len) xg2<-seq(xr[1,2],xr[2,2],len=grid.len) xg3<-seq(xr[1,3],xr[2,3],len=grid.len) xg4<-seq(xr[1,4],xr[2,4],len=grid.len) xg<-NULL for (i in 1:length(xg1)) for (j in 1:length(xg2)) for (k in 1:length(xg3)) for (l in 1:length(xg4)) { xg<-rbind(xg,c(xg1[i],xg2[j],xg3[k],xg4[l])) } len<-length(xg1)*length(xg2)*length(xg3)*length(xg4) x.ggobi<-rbind(cbind(rep(1,x.imp.row),x.imp[,x.imp.col],cbind(rep(0,x.imp.row)), cbind(rep(0,x.imp.row)), x.imp[,1:4],cbind(rep(0,x.imp.row)),cbind(rep(0,x.imp.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[9]<-"SV" names(x.ggobi)[10]<-"Wgt" xg<-cbind(xg,rep(0,len)) grid.ggobi<-cbind(rep(2,len),xg[,x.imp.col],rep(0,len),rep(0,len),xg[,1:x.imp.col-1],rep(0,len),rep(0,len)) dimnames(grid.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,grid.ggobi) library(Rggobi) g<-ggobi(x.ggobi) #setMode.ggobi("2D Tour") #setPlotVariables.ggobi(1, 2, 3) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,1]==2] setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(1,length(indx2)),which=indx2) #change glyph and color for the positive (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) dimnames(xg)[[2]]<-dimnames(x.imp)[[2]] attach(x.imp) eps1<-0.1 eps2<-1e-07 #run svm with different C values for (i in 1:10) { #i<-1 setGlyphs.ggobi(types=rep(5,length(indx1)),sizes=rep(1,length(indx1)),which=indx1) setColors.ggobi(rep(1,length(indx1)),which=indx1) c1=1-(i-1)/10 tmp.svm<-svm(factor(cl)~.,data=x.imp,kernel="linear",cost=c1) table(x[,823],predict(tmp.svm,x.imp[,1:4])) setVariableValues.ggobi(rep(0,x.imp.row),"SV",1:x.imp.row) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,x.imp.row),"Wgt",1:x.imp.row) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) w<-hypNormal(tmp.svm$SV,tmp.svm$coefs) setVariableValues.ggobi(pred(x.imp[,1:4],w,tmp.svm$rho),"Pred Val",1:x.imp.row) setVariableValues.ggobi(pred(xg[,1:(x.imp.col-1)],w,tmp.svm$rho),"Pred Val",(x.imp.row+1):(x.imp.row+len)) predVals<-predict(tmp.svm,x.imp[,1:x.imp.col-1]) setVariableValues.ggobi(predVals,"Pred Cl",1:x.imp.row) xg[,x.imp.col]<-predict(tmp.svm,xg[,1:x.imp.col-1]) setVariableValues.ggobi(xg[,x.imp.col],"Pred Cl",(x.imp.row+1):(x.imp.row+len)) x.ggobi<-getData.ggobi() #change glyph and color for the positive grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive boundary grid points indx2<-indx1[x.ggobi[,1]==2&x.ggobi[,3]==1&abs(x.ggobi[,4])eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])>eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) readline() } detach(x.imp) ########### record errors ############# i<-1 1 2 1 47 3 2 9 15 i<-2 1 2 1 48 2 2 9 15 i<-6 1 2 1 48 2 2 12 12 i<-7 1 2 1 48 2 2 11 13 i<-8 1 2 1 48 2 2 13 11 i<-9 1 2 1 50 0 2 15 9 i<-10 1 2 1 50 0 2 17 7 ########################## SVM errors for 4 best genes ########################### library(MASS) library(classPP) library(e1071) library(Rggobi) library(sma) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } #read data from file t <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t[,823]) g1<-c(3:8,11:15,17:22,24:33,36,39:40,42,46:47,49:50,56:59,62:63,65,66:74) g2<-c(1:2,9:10,16,23,34:35,37:38,41,43:45,48,51:55,60:61,64,66) n1<-round(length(g1)*2/3) n2<-round(length(g2)*2/3) error.train<-NULL error.test<-NULL sv.real.num<-NULL sv.bounded.num<-NULL for(j in 1:100) { gg1<-sample(g1,n1,replace=F) gg2<-sample(g2,n2,replace=F) g<-c(gg1,gg2) #sel<-c(800,403,535,596) #sel<-c(390,389,4,596) sel<-c(800,721,357,596) temp.data<-x[,sel] t.data<-cbind(temp.data[g,],x[g,823]) proj<-data.frame(t.data) dim<-length(proj[1,]) names(proj)[dim]<-"cl" attach(proj) tmp.svm<-svm(factor(cl)~.,data=proj,kernel="linear") detach(proj) svm.proj<-proj error.train<-c(error.train,sum( predict(tmp.svm, temp.data[g,]) !=as.character(x[g,823]))) error.test<-c(error.test,sum(predict(tmp.svm,temp.data[-g,]) !=as.character(x[-g,823]))) sv.real.num<-c(sv.real.num,sum( abs(tmp.svm$coefs) != 1)) sv.bounded.num<-c(sv.bounded.num,sum( abs(tmp.svm$coefs) == 1)) print(j) } error.train<-error.train/(n1+n2) error.test<-error.test/(length(x[,1])-(n1+n2)) sv.real.num<-sv.real.num/(n1+n2) sv.bounded.num<-sv.bounded.num/(n1+n2) ##################### record ###################### error.train mean(error.train) sd(error.train) min(error.train) max(error.train) error.test mean(error.test) sd(error.test) min(error.test) max(error.test) sv.real.num mean( sv.real.num) sd( sv.real.num) min( sv.real.num) max( sv.real.num) sv.bounded.num mean( sv.bounded.num) sd( sv.bounded.num) min( sv.bounded.num) max( sv.bounded.num) #################### results for sel<-c(800,403,535,596) ################### > > error.train [1] 8 5 7 6 5 9 4 5 8 6 8 6 6 5 5 7 6 4 7 5 6 7 5 6 8 9 4 6 5 7 10 [32] 7 8 8 8 5 6 6 4 9 7 4 5 6 5 5 6 6 7 6 8 8 8 11 5 8 7 4 8 7 11 6 [63] 4 6 8 7 6 5 7 7 7 6 8 6 7 8 6 7 8 8 6 8 6 6 7 7 3 8 7 5 5 7 7 [94] 5 6 9 5 8 7 7 > mean(error.train) [1] 6.54 > sd(error.train) [1] 1.526996 > min(error.train) [1] 3 > max(error.train) [1] 11 > > error.test [1] 4 4 3 3 4 3 6 5 5 4 1 4 5 5 5 2 5 5 7 4 4 2 8 4 2 3 7 4 5 3 3 6 2 3 3 5 6 3 7 2 6 5 8 5 3 8 [47] 7 4 6 6 3 5 3 2 4 2 5 7 3 3 4 5 4 3 4 5 5 6 3 3 3 5 3 3 2 4 4 3 6 1 5 3 4 6 5 6 6 4 3 4 6 4 [93] 2 5 5 2 4 3 2 7 > mean(error.test) [1] 4.23 > sd(error.test) [1] 1.594530 > min(error.test) [1] 1 > max(error.test) [1] 8 > > sv.real.num [1] 5 4 4 3 5 5 5 5 5 5 5 4 4 4 2 5 4 5 5 5 6 5 4 5 5 4 4 3 3 5 3 5 5 5 3 4 5 4 4 4 4 5 6 5 5 5 [47] 5 5 4 3 6 5 5 4 5 3 4 3 4 4 4 5 5 5 3 5 6 5 4 4 4 4 4 5 6 4 4 4 3 4 4 5 3 5 5 5 4 4 3 5 3 4 [93] 5 5 3 4 5 5 4 4 > mean( sv.real.num) [1] 4.38 > sd( sv.real.num) [1] 0.8260897 > min( sv.real.num) [1] 2 > max( sv.real.num) [1] 6 > > sv.bounded.num [1] 22 18 22 22 21 20 17 19 23 20 24 22 19 20 21 24 20 21 18 20 20 22 18 19 25 21 15 19 23 20 23 [32] 21 24 23 22 18 19 21 18 25 19 19 17 21 22 19 19 18 21 19 23 24 21 25 21 26 22 17 21 23 24 19 [63] 18 23 23 20 19 20 23 23 24 22 22 22 25 23 20 22 24 25 20 21 17 18 21 20 20 24 25 18 19 25 25 [94] 20 21 24 20 23 24 24 > mean( sv.bounded.num) [1] 21.16 > sd( sv.bounded.num) [1] 2.372911 > min( sv.bounded.num) [1] 15 > max( sv.bounded.num) [1] 26 > #################### results for sel<-c(800,721,357,596) ################### > error.train [1] 10 8 9 7 8 8 4 11 8 4 6 7 6 9 8 10 8 8 9 8 7 7 7 6 8 10 8 8 9 7 10 [32] 7 11 9 8 9 10 5 10 9 8 7 10 8 11 8 6 9 9 6 10 10 8 11 9 8 9 10 10 9 6 8 [63] 6 7 6 6 10 6 8 6 10 8 9 11 7 6 7 10 8 9 10 7 11 7 8 9 9 7 5 6 9 9 9 [94] 10 8 5 8 5 9 9 > mean(error.train) [1] 8.11 > sd(error.train) [1] 1.656911 > min(error.train) [1] 4 > max(error.train) [1] 11 > > error.test [1] 6 6 6 7 6 5 8 4 6 7 6 5 6 7 4 4 6 7 7 5 6 6 8 8 3 3 5 7 4 5 4 5 4 6 5 5 4 7 5 5 5 5 5 4 4 5 [47] 6 5 4 8 3 5 3 4 6 5 5 4 3 4 6 6 9 7 7 7 4 6 6 6 5 4 3 3 6 5 6 2 7 5 3 7 5 5 3 6 4 6 6 6 5 3 [93] 2 2 5 8 5 9 5 4 > mean(error.test) [1] 5.25 > sd(error.test) [1] 1.513408 > min(error.test) [1] 2 > max(error.test) [1] 9 > > sv.real.num [1] 6 4 4 5 5 4 3 4 5 5 5 4 4 6 6 5 3 5 5 5 5 5 4 6 5 4 3 5 5 6 4 4 4 5 5 4 5 5 5 5 5 6 4 4 5 5 [47] 4 6 6 3 4 4 4 5 6 5 5 3 5 5 5 6 4 4 3 6 5 4 4 3 5 5 4 4 4 4 5 4 6 5 5 5 5 5 4 5 5 6 5 6 5 5 [93] 6 5 4 4 5 5 4 5 > mean( sv.real.num) [1] 4.7 > sd( sv.real.num) [1] 0.8102874 > min( sv.real.num) [1] 3 > max( sv.real.num) [1] 6 > > sv.bounded.num [1] 21 21 21 19 19 21 16 24 21 17 18 22 20 19 20 24 22 21 19 20 20 21 21 15 21 26 21 18 24 21 24 [32] 21 24 19 21 23 23 18 21 23 20 20 24 20 22 22 19 19 22 20 25 23 23 24 20 20 24 24 23 25 19 21 [63] 19 18 21 18 20 21 22 18 22 19 23 24 19 22 25 27 19 20 27 18 21 19 26 22 22 20 19 18 23 23 26 [94] 26 19 19 19 18 24 22 > mean( sv.bounded.num) [1] 21.17 > sd( sv.bounded.num) [1] 2.466155 > min( sv.bounded.num) [1] 15 > max( sv.bounded.num) [1] 27 > > > ############################## sel<-c(390,389,4,596) ############################################ > error.train [1] 8 9 7 7 9 8 9 11 7 7 7 10 8 13 8 7 9 9 9 11 7 7 10 9 6 9 8 10 7 4 9 [32] 9 6 4 7 8 7 7 11 7 8 7 12 8 11 9 8 11 6 7 8 10 8 11 4 6 8 9 11 6 9 8 [63] 6 4 9 7 6 7 10 7 7 5 7 8 9 14 10 6 9 5 6 10 13 8 6 3 5 11 4 7 9 7 8 [94] 7 6 10 11 7 9 5 > mean(error.train) [1] 7.98 > sd(error.train) [1] 2.112875 > min(error.train) [1] 3 > max(error.train) [1] 14 > > error.test [1] 5 5 6 6 4 6 4 1 4 6 6 3 5 4 6 5 4 2 1 3 5 5 5 4 4 3 3 4 4 5 6 4 6 5 6 5 1 6 5 3 5 5 6 3 3 4 [47] 5 3 5 6 5 6 2 5 8 7 6 1 3 3 4 5 5 5 7 2 6 4 4 3 6 4 4 5 5 2 3 3 5 6 4 4 7 5 6 5 5 3 5 6 5 4 [93] 4 5 3 3 3 9 7 6 > mean(error.test) [1] 4.53 > sd(error.test) [1] 1.513942 > min(error.test) [1] 1 > max(error.test) [1] 9 > > sv.real.num [1] 6 4 5 5 6 4 4 5 6 4 5 5 5 6 7 5 5 4 5 4 9 4 5 5 5 5 4 4 4 5 6 8 4 5 5 5 5 5 3 5 5 5 5 5 6 5 [47] 4 5 7 5 5 6 5 6 5 5 5 5 4 5 5 5 5 5 5 5 6 6 5 4 8 6 4 4 5 4 5 6 5 5 4 6 6 7 5 4 6 5 5 4 4 4 [93] 4 3 5 5 5 5 5 5 > mean( sv.real.num) [1] 5.04 > sd( sv.real.num) [1] 0.9631598 > min( sv.real.num) [1] 3 > max( sv.real.num) [1] 9 > > sv.bounded.num [1] 25 25 21 22 24 21 22 28 20 22 19 24 21 25 18 23 25 21 28 27 18 21 25 24 21 23 27 25 19 19 20 [32] 23 18 21 16 24 27 24 26 26 19 25 27 23 22 23 26 26 21 23 23 23 25 22 15 16 20 26 24 25 22 21 [63] 22 19 25 26 22 20 19 19 17 20 27 21 25 27 23 24 23 21 22 25 25 21 21 16 19 27 20 22 23 23 25 [94] 23 21 24 24 20 25 20 > mean( sv.bounded.num) [1] 22.51 > sd( sv.bounded.num) [1] 2.928422 > min( sv.bounded.num) [1] 15 > max( sv.bounded.num) [1] 28 > ########################## SVM errors for all genes ########################### library(MASS) library(classPP) library(e1071) library(Rggobi) library(sma) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } #read data from file t <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t[,823]) g1<-c(3:8,11:15,17:22,24:33,36,39:40,42,46:47,49:50,56:59,62:63,65,66:74) g2<-c(1:2,9:10,16,23,34:35,37:38,41,43:45,48,51:55,60:61,64,66) n1<-round(length(g1)*2/3) n2<-round(length(g2)*2/3) error.train<-NULL error.test<-NULL sv.real.num<-NULL sv.bounded.num<-NULL for(j in 1:100) { gg1<-sample(g1,n1,replace=F) gg2<-sample(g2,n2,replace=F) g<-c(gg1,gg2) #sel<-c(800,403,535,596) #sel<-c(800,721,357,596) #sel<-c(390,389,4,596) sel<-c(1:822) temp.data<-x[,sel] t.data<-cbind(temp.data[g,],x[g,823]) proj<-data.frame(t.data) dim<-length(proj[1,]) names(proj)[dim]<-"cl" attach(proj) tmp.svm<-svm(factor(cl)~.,data=proj,kernel="linear") detach(proj) svm.proj<-proj error.train<-c(error.train,sum( predict(tmp.svm, temp.data[g,]) !=as.character(x[g,823]))) error.test<-c(error.test,sum(predict(tmp.svm,temp.data[-g,]) !=as.character(x[-g,823]))) sv.real.num<-c(sv.real.num,sum( abs(tmp.svm$coefs) != 1)) sv.bounded.num<-c(sv.bounded.num,sum( abs(tmp.svm$coefs) == 1)) print(j) } ################################ > > error.train [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [47] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [93] 0 0 0 0 0 0 0 0 > mean(error.train) [1] 0 > sd(error.train) [1] 0 > min(error.train) [1] 0 > max(error.train) [1] 0 > > error.test [1] 6 5 5 4 3 8 6 4 7 5 7 4 6 8 8 6 6 10 5 6 5 6 6 4 5 6 5 9 4 3 5 [32] 5 5 6 6 3 5 7 3 3 5 7 9 7 5 7 6 11 8 7 5 8 5 9 5 4 8 4 6 5 8 3 [63] 11 5 4 5 5 6 6 6 5 7 4 6 5 10 5 5 4 5 6 6 7 1 8 9 8 8 7 8 6 9 5 [94] 7 6 6 6 2 3 4 > mean(error.test) [1] 5.88 > sd(error.test) [1] 1.892462 > min(error.test) [1] 1 > max(error.test) [1] 11 > > sv.real.num [1] 45 44 45 47 47 45 46 49 47 46 47 48 43 45 46 46 45 43 45 45 46 46 44 43 45 46 46 48 46 45 46 [32] 46 41 44 44 46 45 45 47 47 42 46 45 47 47 44 47 44 45 44 46 47 46 45 46 42 45 45 44 46 43 45 [63] 43 46 47 48 45 45 49 47 46 44 45 47 45 45 47 45 45 46 47 45 42 43 45 44 46 44 45 44 45 46 46 [94] 46 42 46 45 45 45 49 > mean( sv.real.num) [1] 45.36 > sd( sv.real.num) [1] 1.540825 > min( sv.real.num) [1] 41 > max( sv.real.num) [1] 49 > > sv.bounded.num [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [47] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 [93] 0 0 0 0 0 0 0 0 > mean( sv.bounded.num) [1] 0 > sd( sv.bounded.num) [1] 0 > min( sv.bounded.num) [1] 0 > max( sv.bounded.num) [1] 0 > ########################## calculate SVM errors using best 2-15 SVM genes (from all 822) ########################### library(MASS) library(classPP) library(e1071) library(Rggobi) library(sma) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } #read data from file t <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t[,823]) g1<-c(3:8,11:15,17:22,24:33,36,39:40,42,46:47,49:50,56:59,62:63,65,66:74) g2<-c(1:2,9:10,16,23,34:35,37:38,41,43:45,48,51:55,60:61,64,66) n1<-round(length(g1)*2/3) n2<-round(length(g2)*2/3) #selgenes<-c(390, 389, 4, 596) #selgenes<-c(390, 389, 4, 596, 54, 409, 741, 398, 725, 736,581, 263, 817, 701, 655) # [1] "V800" "V403" "V535" "V596" "V357" "V398" "V113" "V73" "V119" "V488" # [11] "V99" "V138" "V736" "V26" "V803" "V112" "V693" "V165" "V406" "V788" # [21] "V277" "V45" "V666" "V176" "V721" "V663" "V417" "V769" "V208" "V111" # [31] "V523" "V761" "V55" "V166" "V94" "V299" "V409" "V70" "V61" "V4" #best 15 svm from 40 best bw #selgenes<-c(800, 403, 535, 596, 357, 398, 113, 73, 119, 488, 99, 138, 736, 26, 803) #best 20 svm from 40 best bw selgenes<-c(800, 403, 535, 596, 357, 398, 113, 73, 119, 488, 99, 138, 736, 26, 803, 112, 693, 165, 406, 788) error.train.avg<-NULL error.train.st<-NULL error.train.min<-NULL error.train.max<-NULL error.test.avg<-NULL error.test.st<-NULL error.test.min<-NULL error.test.max<-NULL sv.real.num.avg<-NULL sv.real.num.st<-NULL sv.real.num.min<-NULL sv.real.num.max<-NULL sv.bounded.num.avg<-NULL sv.bounded.num.st<-NULL sv.bounded.num.min<-NULL sv.bounded.num.max<-NULL for (i in 2:20){ print(i) sel<-selgenes[1:i] error.train<-NULL error.test<-NULL sv.real.num<-NULL sv.bounded.num<-NULL for(j in 1:100) { print(j) gg1<-sample(g1,n1,replace=F) gg2<-sample(g2,n2,replace=F) g<-c(gg1,gg2) temp.data<-x[,sel] t.data<-cbind(temp.data[g,],x[g,823]) proj<-data.frame(t.data) dim<-length(proj[1,]) names(proj)[dim]<-"cl" attach(proj) tmp.svm<-svm(factor(cl)~.,data=proj,kernel="linear") detach(proj) svm.proj<-proj error.train<-c(error.train,sum( predict(tmp.svm, temp.data[g,]) !=as.character(x[g,823]))) error.test<-c(error.test,sum(predict(tmp.svm,temp.data[-g,]) !=as.character(x[-g,823]))) sv.real.num<-c(sv.real.num,sum( abs(tmp.svm$coefs) != 1)) sv.bounded.num<-c(sv.bounded.num,sum( abs(tmp.svm$coefs) == 1)) } error.train<-error.train/(n1+n2) error.test<-error.test/(length(x[,1])-(n1+n2)) sv.real.num<-sv.real.num/(n1+n2) sv.bounded.num<-sv.bounded.num/(n1+n2) error.train.avg<-c(error.train.avg,mean(error.train)) error.train.st<-c(error.train.st,sd(error.train)) error.train.min<-c(error.train.min,min(error.train)) error.train.max<-c(error.train.max,max(error.train)) error.test.avg<-c(error.test.avg,mean(error.test)) error.test.st<-c(error.test.st,sd(error.test)) error.test.min<-c(error.test.min,min(error.test)) error.test.max<-c(error.test.max,max(error.test)) sv.real.num.avg<-c(sv.real.num.avg,mean(sv.real.num)) sv.real.num.st<-c(sv.real.num.st,sd(sv.real.num)) sv.real.num.min<-c(sv.real.num.min,min(sv.real.num)) sv.real.num.max<-c(sv.real.num.max,max(sv.real.num)) sv.bounded.num.avg<-c(sv.bounded.num.avg,mean(sv.bounded.num)) sv.bounded.num.st<-c(sv.bounded.num.st,sd(sv.bounded.num)) sv.bounded.num.min<-c(sv.bounded.num.min,min(sv.bounded.num)) sv.bounded.num.max<-c(sv.bounded.num.max,max(sv.bounded.num)) } error.train.avg error.train.st error.train.min error.train.max error.test.avg error.test.st error.test.min error.test.max sv.real.num.avg sv.real.num.st sv.real.num.min sv.real.num.max sv.bounded.num.avg sv.bounded.num.st sv.bounded.num.min sv.bounded.num.max ################################ ################################ Plot statistics #################################### #vars<-c(2,3,4,5,6,7,8,9,10,11,12,13,14,15) vars<-c(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20) X11() par(mfrow=c(2,2)) plot(cbind(vars,error.train.avg),type='l',xlab="Number of Variables",ylab="Average Training Errors") plot(cbind(vars,error.test.avg),type='l',xlab="Number of Variables",ylab="Average Test Errors") plot(cbind(vars,sv.real.num.avg),type='l',xlab="Number of Variables",ylab="Average Real Support Vectors") plot(cbind(vars,sv.bounded.num.avg),type='l',xlab="Number of Variables",ylab="Average Bounded Support Vectors") ###################### Rggobi for a given set of variables (only training data) ########################## ############################## show 7 dimensions - no grid ############################ #several SVM runs library(MASS) library(sma) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } #function to normalize data mylog<-function(x){ if (x!=0){ return (log(x)) } else{ return (0) } } hypNormal<-function(sv,coefs){ w<-NULL for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-NULL for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t1 <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #select first 9 most important SVM from 40 most imp BW variables - #x.imp <- cbind(x[,sel<-c(800, 403, 535, 596, 357, 398, 113, 73,119)],x[,823]) #dimnames(x.imp)[[2]]<-c("V800","V403","V535","V596","V357","V398","V113","V73","V119","cl") #select first 7 most important SVM from 40 most imp BW variables - x.imp <- cbind(x[,sel<-c(800, 403, 535, 596, 357, 398, 113)],x[,823]) dimnames(x.imp)[[2]]<-c("V800","V403","V535","V596","V357","V398","V113","cl") #select first 9 most important SVM from all 822 variables - #x.imp <- cbind(x[,sel<-c(390, 389, 4, 596, 54, 409, 741, 398, 725,736)],x[,823]) #dimnames(x.imp)[[2]]<-c("V390","V389","V4","V596","V54","V409","V741","V398","V725","736","cl") x.imp.row<-nrow(x.imp) x.imp.col<-ncol(x.imp) x.imp<-data.frame(x.imp) dim<-length(x.imp[1,]) names(x.imp)[dim]<-"cl" x.ggobi<-rbind(cbind(rep(1,x.imp.row),x.imp[,x.imp.col],cbind(rep(0,x.imp.row)), cbind(rep(0,x.imp.row)), x.imp[,1:(x.imp.col-1)],cbind(rep(0,x.imp.row)),cbind(rep(0,x.imp.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[x.imp.col+4]<-"SV" names(x.ggobi)[x.imp.col+5]<-"Wgt" library(Rggobi) g<-ggobi(x.ggobi) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the positive (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) attach(x.imp) eps1<-0.1 eps2<-1e-07 #run svm with different C values #for (i in 1:10) { i<-1 setGlyphs.ggobi(types=rep(5,length(indx1)),sizes=rep(1,length(indx1)),which=indx1) setColors.ggobi(rep(1,length(indx1)),which=indx1) c1=1-(i-1)/10 tmp.svm<-svm(factor(cl)~.,data=x.imp,kernel="linear",cost=c1) table(x[,823],predict(tmp.svm,x.imp[,1:(x.imp.col-1)])) setVariableValues.ggobi(rep(0,74),"SV",1:74) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,74),"Wgt",1:74) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) w<-hypNormal(tmp.svm$SV,tmp.svm$coefs) setVariableValues.ggobi(pred(x.imp[,1:(x.imp.col-1)],w,tmp.svm$rho),"Pred Val",1:74) predVals<-predict(tmp.svm,x.imp[,1:x.imp.col-1]) setVariableValues.ggobi(predVals,"Pred Cl",1:74) x.ggobi<-getData.ggobi() #change glyph and color for the positive (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(1,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(1,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) outliers indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(4,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) outliers indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(4,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,x.imp.col+4]==1&(c1-abs(x.ggobi[,x.imp.col+5])>eps2)&abs(x.ggobi[,x.imp.col+5])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,x.imp.col+4]==1&(c1-abs(x.ggobi[,x.imp.col+5])>eps2)&abs(x.ggobi[,x.imp.col+5])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,x.imp.col+4]==1&(c1-abs(x.ggobi[,x.imp.col+5])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,x.imp.col+4]==1&(c1-abs(x.ggobi[,x.imp.col+5])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) # readline() #} detach(x.imp) ########################## ############################################################################# # SVM grids for non linear kernels - radial & ploynomial (commented) kernel ############################################################################### library(MASS) library(sma) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } #function to normalize data mylog<-function(x){ if (x!=0){ return (log(x)) } else{ return (0) } } #linear kernel #kernel<-function(x,y) #{ # val<-sum(x*y) # return(val) #} #poly kernel, default param - degree= 3, coef0 = 0 kernel<-function(x,y) { gamma<-0.25 val<-sum(x*y) val<-val*gamma return(val*val*val) } #kernel<-function(x,y) #{ # gamma<-0.25 # val<-sum((x-y)^2) # val<-val*gamma # return(exp(-val)) #} pred.kernel<-function(x,sv,coefs,b){ p2<-rep(0,length(x[,1])) for (j in 1:length(x[,1])){ print(j) for (i in 1:length(sv[,1])){ p2[j]<-p2[j]+sum(coefs[i]*kernel(x[j,],sv[i,])) } } p2<-p2-b return(p2) } #read data from file t1 <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #take log #y<-apply(y[,1:822],c(1,2),mylog) #add class column to the normalized data x<-cbind(y,t[,823]) #select first 4 most important SVM from 40 most imp BW variables #x.imp <- cbind(x[,c(800,403,535,596)],x[,823]) #dimnames(x.imp)[[2]]<-c("V800","V403","V535","V596","cl") #select first 4 most important comb 2 svm and two common with BW and PDA x.imp <- cbind(x[,c(800,721,357,596)],x[,823]) dimnames(x.imp)[[2]]<-c("V800","V721","V357","V596","cl") #select first 4 most important SVM from all 822 - pics rad1, rad2, rad3, rad4 #x.imp <- cbind(x[,c(390,389,4,596)],x[,823]) #dimnames(x.imp)[[2]]<-c("V390","V389","V4","V596","cl") #select first 4 most important comb 2 svm and two common with BW and PDA - radial kernel #x.imp <- cbind(x[,c(166,119,112,299)],x[,823]) #dimnames(x.imp)[[2]]<-c("V166","V119","V112","V299","cl") x.imp.row<-nrow(x.imp) x.imp.col<-ncol(x.imp) x.imp<-data.frame(x.imp) dim<-length(x.imp[1,]) names(x.imp)[dim]<-"cl" #generate grid xr<-apply(x.imp[,1:length(x.imp)-1],2,range) grid.len<-10 xg1<-seq(xr[1,1],xr[2,1],len=grid.len) xg2<-seq(xr[1,2],xr[2,2],len=grid.len) xg3<-seq(xr[1,3],xr[2,3],len=grid.len) xg4<-seq(xr[1,4],xr[2,4],len=grid.len) xg<-NULL for (i in 1:length(xg1)) for (j in 1:length(xg2)) for (k in 1:length(xg3)) for (l in 1:length(xg4)) { xg<-rbind(xg,c(xg1[i],xg2[j],xg3[k],xg4[l])) } len<-length(xg1)*length(xg2)*length(xg3)*length(xg4) x.ggobi<-rbind(cbind(rep(1,x.imp.row),x.imp[,x.imp.col],cbind(rep(0,x.imp.row)), cbind(rep(0,x.imp.row)), x.imp[,1:4],cbind(rep(0,x.imp.row)),cbind(rep(0,x.imp.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[9]<-"SV" names(x.ggobi)[10]<-"Wgt" xg<-cbind(xg,rep(0,len)) grid.ggobi<-cbind(rep(2,len),xg[,x.imp.col],rep(0,len),rep(0,len),xg[,1:x.imp.col-1],rep(0,len),rep(0,len)) dimnames(grid.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,grid.ggobi) library(Rggobi) g<-ggobi(x.ggobi) #setMode.ggobi("2D Tour") #setPlotVariables.ggobi(1, 2, 3) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,1]==2] setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(1,length(indx2)),which=indx2) #change glyph and color for the positive (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) dimnames(xg)[[2]]<-dimnames(x.imp)[[2]] attach(x.imp) eps1<-0.01 eps2<-1e-07 #run svm with different C values #for (i in 1:10) { i<-1 setGlyphs.ggobi(types=rep(5,length(indx1)),sizes=rep(1,length(indx1)),which=indx1) setColors.ggobi(rep(1,length(indx1)),which=indx1) c1=1-(i-1)/10 tmp.svm<-svm(factor(cl)~.,data=x.imp,kernel="polynomial",cost=c1) table(x[,823],predict(tmp.svm,x.imp[,1:4])) setVariableValues.ggobi(rep(0,74),"SV",1:74) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,74),"Wgt",1:74) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) setVariableValues.ggobi(pred.kernel(x.imp[,1:4],tmp.svm$SV,tmp.svm$coefs,tmp.svm$rho),"Pred Val",1:74) setVariableValues.ggobi(pred.kernel(xg[,1:(x.imp.col-1)],tmp.svm$SV,tmp.svm$coefs,tmp.svm$rho),"Pred Val",(74+1):10074) predVals<-predict(tmp.svm,x.imp[,1:x.imp.col-1]) setVariableValues.ggobi(predVals,"Pred Cl",1:74) xg[,x.imp.col]<-predict(tmp.svm,xg[,1:x.imp.col-1]) setVariableValues.ggobi(xg[,x.imp.col],"Pred Cl",(74+1):10074) x.ggobi<-getData.ggobi() #change glyph and color for the positive grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive boundary grid points indx2<-indx1[x.ggobi[,1]==2&x.ggobi[,3]==1&abs(x.ggobi[,4])eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])>eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) # readline() #} detach(x.imp) ############################## order 40 most imp BW genes according with importance by SVM radial ################### ############################################### ### Feature sel SVM -- 40 BW genes ######### ############################################### #--------------------------------------------- #gene expression small data set #select 40 genes based on BW index #order the 40 selected genes using SVM #select best (according to SVM) 4 genes #run SVM using these 4 genes #generate a grid over data (4 columns) #predict the grid #start Rggobi library(MASS) library(sma) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } f.ind<-function(x, n){ ind<-0 for (i in (1:length(x))){ if ((names(x)[i])==n){ ind <-i } } return(ind) } #read data from file t <- read.table("74x822_12.txt") #normalize data (attributes) y<-apply(t[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t[,823]) #compute bw index for all the genes (attributes) given the class corresponding to each sample x.bw<-stat.bwss(t(x[,1:822]),x[,823]) #select the best 40 genes and print their bw index value x.bw.40 <- stat.gnames(x.bw$bw, 1:length(x.bw$bw), crit=40) #x.bw.40 #find the index (attribute number/name) of the best 40 genes x.bw.40.gnames <- stat.gnames(x.bw$bw, 1:length(x.bw$bw), crit=40)$gnames #x.bw.40.gnames #x[,x.bw.40.gnames] y<-cbind(x[,x.bw.40.gnames],x[,823]) s<-c(1:length(x.bw.40.gnames)) names(s)=c("V721", "V113", "V61", "V299", "V176","V406", "V409", "V596", "V138", "V488", "V663", "V208", "V165", "V403", "V736", "V535", "V70", "V803", "V112", "V417", "V357", "V166", "V761", "V119", "V666", "V99", "V398", "V769", "V26", "V4", "V55", "V277", "V788", "V73", "V45", "V800", "V111", "V523", "V94", "V693") r<-NULL u<-y for (i in (1:length(x.bw.40.gnames))) { i u<-data.frame(u) dim<-length(u[1,]) names(u)[dim]<-"cl" attach(u) tmp.svm<-svm(factor(cl)~.,data=u,kernel="radial") detach(u) svm.u<-u w<-NULL #compute w vector for(i in (1:length(tmp.svm$SV[1,]))) { w[i]<-sum(tmp.svm$coefs*tmp.svm$SV[,i]) } w.square<-w*w names(w)<-names(s) min.val <-min(w.square) min.ind<-which.min(w.square) name<-names(w[min.ind]) index.in.s<-f.ind(s,name) s<-s[-index.in.s] r<-cbind(r,name) u<-y[,s] u<-cbind(u,x[,823]) } r.best<-rev(r) ############################## three different sets of variables can be tried ############################ ################################## separate training and test sets ######################################### #several SVM runs library(MASS) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } hypNormal<-function(sv,coefs){ w<-rep(0,length(sv[1,])) for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-rep(0,length(x[,1])) for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t1 <- read.table("74x822_12.txt") #select first 4 most important comb 2 svm and two common with BW and PDA d.all <- cbind(t1[,c(800,721,357,596)],t1[,823]) dimnames(d.all)[[2]]<-c("V800","V721","V357","V596","cl") col<-ncol(d.all) #normalize data (attributes) y<-apply(d.all[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.all<-cbind(y,d.all[,col]) #examples in class 1 c1<-c(3:8,11:15,17:22,24:33,36,39:40,42,46:47,49:50,56:59,62:63,65,66:74) #examples in class 2 c2<-c(1:2,9:10,16,23,34:35,37:38,41,43:45,48,51:55,60:61,64,66) n1<-round(length(c1)*2/3) n2<-round(length(c2)*2/3) col<-ncol(d.all) train.row<-n1+n2 test.row<-nrow(d.all)-train.row #generate grid d.all<-data.frame(d.all) xr<-apply(d.all[,1:(length(d.all)-1)],2,range) grid.len<-10 xg1<-seq(xr[1,1],xr[2,1],len=grid.len) xg2<-seq(xr[1,2],xr[2,2],len=grid.len) xg3<-seq(xr[1,3],xr[2,3],len=grid.len) xg4<-seq(xr[1,4],xr[2,4],len=grid.len) xg<-NULL for (i in 1:length(xg1)) for (j in 1:length(xg2)) for (k in 1:length(xg3)) for (l in 1:length(xg4)) { xg<-rbind(xg,c(xg1[i],xg2[j],xg3[k],xg4[l])) } len<-length(xg1)*length(xg2)*length(xg3)*length(xg4) xg<-cbind(xg,rep(0,len)) dimnames(xg)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-cbind(rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row)) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-data.frame(d.train) x.ggobi<-rbind(cbind(rep(1,train.row),d.train[,col],cbind(rep(0,train.row)), cbind(rep(0,train.row)), d.train[,1:(col-1)], cbind(rep(0,train.row)),cbind(rep(0,train.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[9]<-"SV" names(x.ggobi)[10]<-"Wgt" d.test<-cbind(rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row)) dimnames(d.test)[[2]]<-c("V800","V721","V337","V496","cl") d.test<-data.frame(d.test) test.ggobi<-cbind(rep(0,test.row),d.test[,col],rep(0,test.row), rep(0,test.row), d.test[,1:(col-1)], rep(0,test.row),rep(0,test.row)) names(test.ggobi)<-names(x.ggobi) #dimnames(test.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,test.ggobi) grid.ggobi<-cbind(rep(2,len),xg[,col],rep(0,len),rep(0,len),xg[,1:(col-1)],rep(0,len),rep(0,len)) dimnames(grid.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,grid.ggobi) library(Rggobi) gb<-ggobi(x.ggobi) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,1]==2] setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(1,length(indx2)),which=indx2) #change glyph and color for the positive train (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative train (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) eps1<-0.1 eps2<-1e-07 #for (i in 1:10){ #sample class 1 sc1<-sample(c1,n1,replace=F) #sample class 2 sc2<-sample(c2,n2,replace=F) #final sample g<-c(sc1,sc2) d.tr<-d.all[g,] dimnames(d.tr)[[2]]<-c("V800","V721","V337","V496","cl") #normalize train data (attributes) y<-apply(d.tr[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.train<-cbind(y,d.tr[,col]) d.ts<-d.all[-g,] dimnames(d.ts)[[2]]<-c("V800","V721","V337","V496","cl") #normalize test data (attributes) y<-apply(d.ts[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.test<-cbind(y,d.ts[,col]) setVariableValues.ggobi(data.train[,1],"V800",1:train.row) setVariableValues.ggobi(data.test[,1],"V800",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(data.train[,2],"V721",1:train.row) setVariableValues.ggobi(data.test[,2],"V721",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(data.train[,3],"V337",1:train.row) setVariableValues.ggobi(data.test[,3],"V337",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(data.train[,4],"V496",1:train.row) setVariableValues.ggobi(data.test[,4],"V496",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(data.train[,5],"Class",1:train.row) setVariableValues.ggobi(data.test[,5],"Class",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(rep(0,(train.row+test.row+len)),"Pred Cl",1:(train.row+test.row+len)) d.train<-data.frame(d.train) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") attach(d.train) #run svm with C=1 value i<-1 c1<-1-(i-1)/10 tmp.svm<-svm(factor(cl)~.,data=d.train,kernel="linear",cost=c1) table(d.train[,col],predict(tmp.svm,d.train[,1:(col-1)])) table(d.test[,col],predict(tmp.svm,d.test[,1:(col-1)])) setVariableValues.ggobi(rep(0,length(g)),"SV",1:length(g)) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,length(g)),"Wgt",1:length(g)) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) w<-hypNormal(tmp.svm$SV,tmp.svm$coefs) pred.val.train<-pred(d.train[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.train,"Pred Val",1:train.row) pred.val.test<-pred(d.test[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.test,"Pred Val",(train.row+1):(train.row+test.row)) pred.val.grid<-pred(xg[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.grid,"Pred Val",(train.row+test.row+1):(train.row+test.row+len)) pred.class.train<-predict(tmp.svm,d.train[,1:(col-1)]) setVariableValues.ggobi(pred.class.train,"Pred Cl",1:train.row) pred.class.test<-predict(tmp.svm,d.test[,1:(col-1)]) setVariableValues.ggobi(pred.class.test,"Pred Cl",(train.row+1):(train.row+test.row)) pred.class.grid<-predict(tmp.svm,xg[,1:(col-1)]) setVariableValues.ggobi(pred.class.grid,"Pred Cl",(train.row+test.row+1):(train.row+test.row+len)) x.ggobi<-getData.ggobi() #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) #change glyph and color for the positive grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive boundary grid points indx2<-indx1[x.ggobi[,1]==2&x.ggobi[,3]==1&abs(x.ggobi[,4])eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])>eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(c1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) # readline() #} # detach(x.imp) ########################################################## ###################### Rggobi for different subsets (9/10) of an (almost) separable data set ################### ############ bounded support vectors s of a run with all data are eliminated to obtain a sep data set ############## ####################### the projection is kept fixed and the separation hyperplane is observed ################ ############################## three different sets of variables can be tried ############################ ################################## the left out subset (1/10) is considered test set ##################### #several SVM runs library(MASS) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } hypNormal<-function(sv,coefs){ w<-rep(0,length(sv[1,])) for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-rep(0,length(x[,1])) for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t2 <- read.table("74x822_12.txt") elim<-c(3, 5, 12, 13, 20, 21, 24, 40, 46, 50, 56, 63, 67, 69, 71, 74, 1, 2, 9, 10, 16, 34, 37, 38, 45, 48, 51, 54, 55, 60, 61, 64, 66) t1<-t2[-elim,] #select first 4 most important comb 2 svm and two common with BW and PDA d.all <- cbind(t1[,c(800,721,357,596)],t1[,823]) dimnames(d.all)[[2]]<-c("V800","V721","V357","V596","cl") col<-ncol(d.all) #normalize data (attributes) y<-apply(d.all[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.all<-cbind(y,d.all[,col]) n<-round(nrow(d.all)*9/10) col<-ncol(d.all) train.row<-n test.row<-nrow(d.all)-train.row #generate grid d.all<-data.frame(d.all) xr<-apply(d.all[,1:(length(d.all)-1)],2,range) grid.len<-10 xg1<-seq(xr[1,1],xr[2,1],len=grid.len) xg2<-seq(xr[1,2],xr[2,2],len=grid.len) xg3<-seq(xr[1,3],xr[2,3],len=grid.len) xg4<-seq(xr[1,4],xr[2,4],len=grid.len) xg<-NULL for (i in 1:length(xg1)) for (j in 1:length(xg2)) for (k in 1:length(xg3)) for (l in 1:length(xg4)) { xg<-rbind(xg,c(xg1[i],xg2[j],xg3[k],xg4[l])) } len<-length(xg1)*length(xg2)*length(xg3)*length(xg4) xg<-cbind(xg,rep(0,len)) dimnames(xg)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-cbind(rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row)) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-data.frame(d.train) x.ggobi<-rbind(cbind(rep(1,train.row),d.train[,col],cbind(rep(0,train.row)), cbind(rep(0,train.row)), d.train[,1:(col-1)], cbind(rep(0,train.row)),cbind(rep(0,train.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[9]<-"SV" names(x.ggobi)[10]<-"Wgt" d.test<-cbind(rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row)) dimnames(d.test)[[2]]<-c("V800","V721","V337","V496","cl") d.test<-data.frame(d.test) test.ggobi<-cbind(rep(0,test.row),d.test[,col],rep(0,test.row), rep(0,test.row), d.test[,1:(col-1)], rep(0,test.row),rep(0,test.row)) names(test.ggobi)<-names(x.ggobi) #dimnames(test.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,test.ggobi) grid.ggobi<-cbind(rep(2,len),xg[,col],rep(0,len),rep(0,len),xg[,1:(col-1)],rep(0,len),rep(0,len)) dimnames(grid.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,grid.ggobi) library(Rggobi) gb<-ggobi(x.ggobi) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,1]==2] setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(1,length(indx2)),which=indx2) #change glyph and color for the positive train (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative train (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) eps1<-0.1 eps2<-1e-07 ##################### make sure to un-shadow all the points before each of the for runs ############## for (i in 1:10){ g<-sample(1:nrow(d.all),n,replace=F) d.tr<-d.all[g,] dimnames(d.tr)[[2]]<-c("V800","V721","V337","V496","cl") #normalize train data (attributes) y<-apply(d.tr[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.train<-cbind(y,d.tr[,col]) d.ts<-d.all[-g,] dimnames(d.ts)[[2]]<-c("V800","V721","V337","V496","cl") #normalize test data (attributes) y<-apply(d.ts[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.test<-cbind(y,d.ts[,col]) setVariableValues.ggobi(d.train[,1],"V800",1:train.row) setVariableValues.ggobi(d.test[,1],"V800",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,2],"V721",1:train.row) setVariableValues.ggobi(d.test[,2],"V721",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,3],"V337",1:train.row) setVariableValues.ggobi(d.test[,3],"V337",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,4],"V496",1:train.row) setVariableValues.ggobi(d.test[,4],"V496",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,5],"Class",1:train.row) setVariableValues.ggobi(d.test[,5],"Class",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(rep(0,(train.row+test.row+len)),"Pred Cl",1:(train.row+test.row+len)) d.train<-data.frame(d.train) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") attach(d.train) #run svm with C=1 value j<-1 cost1<-1-(j-1)/10 tmp.svm<-svm(factor(cl)~.,data=d.train,kernel="linear",cost=cost1) table(d.train[,col],predict(tmp.svm,d.train[,1:(col-1)])) table(d.test[,col],predict(tmp.svm,d.test[,1:(col-1)])) setVariableValues.ggobi(rep(0,length(g)),"SV",1:length(g)) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,length(g)),"Wgt",1:length(g)) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) w<-hypNormal(tmp.svm$SV,tmp.svm$coefs) pred.val.train<-pred(d.train[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.train,"Pred Val",1:train.row) pred.val.test<-pred(d.test[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.test,"Pred Val",(train.row+1):(train.row+test.row)) pred.val.grid<-pred(xg[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.grid,"Pred Val",(train.row+test.row+1):(train.row+test.row+len)) pred.class.train<-predict(tmp.svm,d.train[,1:(col-1)]) setVariableValues.ggobi(pred.class.train,"Pred Cl",1:train.row) pred.class.test<-predict(tmp.svm,d.test[,1:(col-1)]) setVariableValues.ggobi(pred.class.test,"Pred Cl",(train.row+1):(train.row+test.row)) pred.class.grid<-predict(tmp.svm,xg[,1:(col-1)]) setVariableValues.ggobi(pred.class.grid,"Pred Cl",(train.row+test.row+1):(train.row+test.row+len)) x.ggobi<-getData.ggobi() #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) #change glyph and color for the positive grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive boundary grid points indx2<-indx1[x.ggobi[,1]==2&x.ggobi[,3]==1&abs(x.ggobi[,4])eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])>eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) readline() } detach(x.imp) ########################################################## ###################### Rggobi for different subsets (2/3 from pos and 2/3 from neg) of a non-separable data set ############# ####################### the projection is kept fixed and the separation hyperplane is observed ################ ############################## three different sets of variables can be tried ############################ ################################## the left out subset (1/3 from pos and 1/3 from neg) is considered test set ##################### #several SVM runs library(MASS) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } hypNormal<-function(sv,coefs){ w<-rep(0,length(sv[1,])) for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-rep(0,length(x[,1])) for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t1 <- read.table("74x822_12.txt") #select first 4 most important comb 2 svm and two common with BW and PDA d.all <- cbind(t1[,c(800,721,357,596)],t1[,823]) dimnames(d.all)[[2]]<-c("V800","V721","V357","V596","cl") col<-ncol(d.all) #normalize data (attributes) y<-apply(d.all[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.all<-cbind(y,d.all[,col]) #examples in class 1 c1<-c(3:8,11:15,17:22,24:33,36,39:40,42,46:47,49:50,56:59,62:63,65,66:74) #examples in class 2 c2<-c(1:2,9:10,16,23,34:35,37:38,41,43:45,48,51:55,60:61,64,66) n1<-round(length(c1)*2/3) n2<-round(length(c2)*2/3) col<-ncol(d.all) train.row<-n1+n2 test.row<-nrow(d.all)-train.row #generate grid d.all<-data.frame(d.all) xr<-apply(d.all[,1:(length(d.all)-1)],2,range) grid.len<-10 xg1<-seq(xr[1,1],xr[2,1],len=grid.len) xg2<-seq(xr[1,2],xr[2,2],len=grid.len) xg3<-seq(xr[1,3],xr[2,3],len=grid.len) xg4<-seq(xr[1,4],xr[2,4],len=grid.len) xg<-NULL for (i in 1:length(xg1)) for (j in 1:length(xg2)) for (k in 1:length(xg3)) for (l in 1:length(xg4)) { xg<-rbind(xg,c(xg1[i],xg2[j],xg3[k],xg4[l])) } len<-length(xg1)*length(xg2)*length(xg3)*length(xg4) xg<-cbind(xg,rep(0,len)) dimnames(xg)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-cbind(rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row),rep(0,train.row)) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") d.train<-data.frame(d.train) x.ggobi<-rbind(cbind(rep(1,train.row),d.train[,col],cbind(rep(0,train.row)), cbind(rep(0,train.row)), d.train[,1:(col-1)], cbind(rep(0,train.row)),cbind(rep(0,train.row)))) names(x.ggobi)[1]<-"Data ind" names(x.ggobi)[2]<-"Class" names(x.ggobi)[3]<-"Pred Cl" names(x.ggobi)[4]<-"Pred Val" names(x.ggobi)[9]<-"SV" names(x.ggobi)[10]<-"Wgt" d.test<-cbind(rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row),rep(0,test.row)) dimnames(d.test)[[2]]<-c("V800","V721","V337","V496","cl") d.test<-data.frame(d.test) test.ggobi<-cbind(rep(0,test.row),d.test[,col],rep(0,test.row), rep(0,test.row), d.test[,1:(col-1)], rep(0,test.row),rep(0,test.row)) names(test.ggobi)<-names(x.ggobi) #dimnames(test.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,test.ggobi) grid.ggobi<-cbind(rep(2,len),xg[,col],rep(0,len),rep(0,len),xg[,1:(col-1)],rep(0,len),rep(0,len)) dimnames(grid.ggobi)<-list(NULL,names(x.ggobi)) x.ggobi<-rbind(x.ggobi,grid.ggobi) library(Rggobi) gb<-ggobi(x.ggobi) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,1]==2] setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(1,length(indx2)),which=indx2) #change glyph and color for the positive train (1) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1] setColors.ggobi(rep(3,length(indx2)),which=indx2) #change glyph and color for the negative train (2) points indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2] setColors.ggobi(rep(4,length(indx2)),which=indx2) #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) eps1<-0.1 eps2<-1e-07 ##################### make sure to un-shadow all the points before each of the for runs ############## #for (i in 1:10){ #sample class 1 sc1<-sample(c1,n1,replace=F) #sample class 2 sc2<-sample(c2,n2,replace=F) #final sample g<-c(sc1,sc2) d.tr<-d.all[g,] dimnames(d.tr)[[2]]<-c("V800","V721","V337","V496","cl") #normalize train data (attributes) y<-apply(d.tr[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.train<-cbind(y,d.tr[,col]) d.ts<-d.all[-g,] dimnames(d.ts)[[2]]<-c("V800","V721","V337","V496","cl") #normalize test data (attributes) y<-apply(d.ts[,1:(col-1)],2,f.std.data) #add class column to the normalized data d.test<-cbind(y,d.ts[,col]) setVariableValues.ggobi(d.train[,1],"V800",1:train.row) setVariableValues.ggobi(d.test[,1],"V800",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,2],"V721",1:train.row) setVariableValues.ggobi(d.test[,2],"V721",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,3],"V337",1:train.row) setVariableValues.ggobi(d.test[,3],"V337",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,4],"V496",1:train.row) setVariableValues.ggobi(d.test[,4],"V496",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(d.train[,5],"Class",1:train.row) setVariableValues.ggobi(d.test[,5],"Class",(train.row+1):(train.row+test.row)) setVariableValues.ggobi(rep(0,(train.row+test.row+len)),"Pred Cl",1:(train.row+test.row+len)) d.train<-data.frame(d.train) dimnames(d.train)[[2]]<-c("V800","V721","V337","V496","cl") attach(d.train) #run svm with C=1 value i<-1 cost1<-1-(i-1)/10 tmp.svm<-svm(factor(cl)~.,data=d.train,kernel="linear",cost=cost1) table(d.train[,col],predict(tmp.svm,d.train[,1:(col-1)])) table(d.test[,col],predict(tmp.svm,d.test[,1:(col-1)])) setVariableValues.ggobi(rep(0,length(g)),"SV",1:length(g)) setVariableValues.ggobi(rep(1,length(tmp.svm$index)),"SV",tmp.svm$index) setVariableValues.ggobi(rep(0,length(g)),"Wgt",1:length(g)) setVariableValues.ggobi(tmp.svm$coefs,"Wgt",tmp.svm$index) w<-hypNormal(tmp.svm$SV,tmp.svm$coefs) pred.val.train<-pred(d.train[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.train,"Pred Val",1:train.row) pred.val.test<-pred(d.test[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.test,"Pred Val",(train.row+1):(train.row+test.row)) pred.val.grid<-pred(xg[,1:(col-1)],w,tmp.svm$rho) setVariableValues.ggobi(pred.val.grid,"Pred Val",(train.row+test.row+1):(train.row+test.row+len)) pred.class.train<-predict(tmp.svm,d.train[,1:(col-1)]) setVariableValues.ggobi(pred.class.train,"Pred Cl",1:train.row) pred.class.test<-predict(tmp.svm,d.test[,1:(col-1)]) setVariableValues.ggobi(pred.class.test,"Pred Cl",(train.row+1):(train.row+test.row)) pred.class.grid<-predict(tmp.svm,xg[,1:(col-1)]) setVariableValues.ggobi(pred.class.grid,"Pred Cl",(train.row+test.row+1):(train.row+test.row+len)) x.ggobi<-getData.ggobi() #change glyph and color for the positive test (1) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==1] setColors.ggobi(rep(7,length(indx2)),which=indx2) #change glyph and color for the negative test (2) points indx2<-indx1[x.ggobi[,1]==0&x.ggobi[,2]==2] setColors.ggobi(rep(9,length(indx2)),which=indx2) #change glyph and color for the positive grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==1] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative grid points indx2<-indx1[x.ggobi[,1]==2 & x.ggobi[,3]==2] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(0,length(indx2)),sizes=rep(2,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive boundary grid points indx2<-indx1[x.ggobi[,1]==2&x.ggobi[,3]==1&abs(x.ggobi[,4])eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) real support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])>eps2)&abs(x.ggobi[,10])!=0] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } #change glyph and color for the positive (1) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==1&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) } #change glyph and color for the negative (2) bounded support vectors indx2<-indx1[x.ggobi[,1]==1&x.ggobi[,2]==2&x.ggobi[,9]==1&(cost1-abs(x.ggobi[,10])<=eps2)] if (length(indx2)!=0){ setGlyphs.ggobi(types=rep(3,length(indx2)),sizes=rep(3,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) } print(i) # readline() #} # detach(x.imp) ################################## illustrate tour method ######################## #several SVM runs library(MASS) library(sma) library(e1071) library(Rggobi) #function to normalize data f.std.data<-function(x){ xm<-mean(x) xs<-sd(x) return((x-xm)/xs) } hypNormal<-function(sv,coefs){ w<-length(sv[1,]) for (i in (1:length(sv[1,]))){ w[i]<-sum(coefs*sv[,i]) } return(w) } pred<-function(x,w,b){ p1<-rep(0,length(x[,1])) for (i in (1:length(x[,1]))){ p1[i]<-sum(x[i,]*w) } p1<-p1-b return(p1) } #read data from file t1 <- read.table("74x822_12.txt") #elim<-c(3, 5, 12, 13, 20, 21, 24, 40, 46, 50, 56, 63, 67, 69, 71, 74, 1, 2, 9, 10, 16, 34, 37, 38, 45, 48, 51, 54, 55, 60, 61, 64, 66) #t1<- t1[-elim,] #normalize data (attributes) y<-apply(t1[,1:822],2,f.std.data) #add class column to the normalized data x<-cbind(y,t1[,823]) #select first 4 most important SVM from 40 most imp BW variables - pics 22,25,26 #x.imp <- cbind(x[,c(800,403,535,596,357)],x[,823]) #dimnames(x.imp)[[2]]<-c("V800","V403","V535","V596","V357","cl") #select first 4 most important comb 2 svm and two common with BW and PDA - pic 21 x.imp <- cbind(x[,c(721,357,50,594,559)],x[,823]) dimnames(x.imp)[[2]]<-c("V721","V357","V50","V594","V559","cl") #select first 4 most important SVM from all 822 - pic 19 #x.imp <- cbind(x[,c(390,389,4,596,54)],x[,823]) #dimnames(x.imp)[[2]]<-c("V390","V389","V4","V596","V54","cl") x.imp.row<-nrow(x.imp) x.imp.col<-ncol(x.imp) x.imp<-data.frame(x.imp) dim<-length(x.imp[1,]) names(x.imp)[dim]<-"cl" x.ggobi<-rbind(cbind(x.imp[,1:5], x.imp[,x.imp.col])) names(x.ggobi)[6]<-"Class" library(Rggobi) g<-ggobi(x.ggobi) # initialize colors/glyphs indx1<-c(1:dim(x.ggobi)[1]) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,6]==1] setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(1,length(indx2)),which=indx2) setColors.ggobi(rep(5,length(indx2)),which=indx2) #change glyph and color for the grid points indx2<-indx1[x.ggobi[,6]==2] setGlyphs.ggobi(types=rep(5,length(indx2)),sizes=rep(1,length(indx2)),which=indx2) setColors.ggobi(rep(3,length(indx2)),which=indx2) ##############################################################