[R] new data
arun
smartpink111 at yahoo.com
Thu Jun 13 00:37:57 CEST 2013
Hi,
Try this:
# Load the two input data sets: one tab-separated text file and one CSV file.
final3New <- read.table("real_data_cecilia.txt", sep = "\t")
final3New1 <- read.csv(file = "real_data_cecilia_new.csv")
# fun2: remove surplus rows around exactly duplicated rows.
#
# Rows of `dat` that have an exact copy elsewhere in `dat` are handled in two
# passes, one for duplicated rows with dummy == 1 and one for duplicated rows
# with dummy == 0. In each pass the duplicated rows are sorted by `dimension`,
# grouped two by two, and put together with their neighbouring rows (the row
# after each dummy == 1 row, the row before each dummy == 0 row). Inside every
# group one dummy == 1 row and one other row are kept; the remaining rows of
# the group are removed from `dat`.
#
# Args:
#   dat: data frame with the columns `dummy` and `dimension`.
#        Row names are converted with as.numeric() and used as row positions,
#        so this assumes row names are 1..nrow(dat) -- TODO confirm for the
#        output of fun1(). It also presumably expects dummy == 1 and
#        dummy == 0 rows to alternate; verify against the caller.
#
# Returns:
#   `dat` without the rows selected for removal. When no row is selected,
#   `dat` is returned unchanged.
fun2 <- function(dat) {
  # TRUE for every row that has an exact copy somewhere else in `dat`.
  indx <- duplicated(dat) | duplicated(dat, fromLast = TRUE)

  # Remove the rows at positions `pos`. `dat[-numeric(0), ]` would select zero
  # rows, so an empty `pos` has to return `dat` untouched.
  drop_rows <- function(pos) {
    if (length(pos) == 0) {
      dat
    } else {
      dat[-pos, ]
    }
  }

  # Sorted, de-duplicated row numbers of all rows held in a list of data
  # frames. The row names are read from each piece directly: rbind()-ing a
  # named list rewrites row names (a single-row piece gets the list name
  # instead of its own row name), and rbind() of an empty list is NULL.
  collect_rows <- function(pieces) {
    rn <- unlist(lapply(pieces, row.names), use.names = FALSE)
    sort(as.numeric(unique(rn)))
  }

  # ---- Pass 1: duplicated rows with dummy == 1 -----------------------------
  dat1 <- subset(dat[indx, ], dummy == 1)
  if (nrow(dat1) > 0) {
    dat2 <- dat1[order(dat1$dimension), ]
    indx1 <- as.numeric(row.names(dat2))
    # Consecutive rows (after sorting by dimension) share one group id.
    names(indx1) <- (seq_along(indx1) - 1) %/% 2 + 1
    # Each duplicated row together with the row that follows it in `dat`.
    dat3 <- dat[c(indx1, indx1 + 1), ]
    dat3$id <- names(c(indx1, indx1 + 1))
    lst1 <- lapply(split(dat3, dat3$id), function(x) {
      x1 <- x[-1, ]
      # Among the rows after the first of `x1`, the one whose dimension is
      # closest to that first row.
      x2 <- x1[which.min(abs(x1$dimension[1] - x1$dimension[-1])) + 1, ]
      x3 <- subset(x, dummy == 1)
      rowNx2 <- as.numeric(row.names(x2))
      rowNx3 <- as.numeric(row.names(x3))
      # The dummy == 1 row whose row number is nearest to the row chosen above.
      x4 <- x3[which.min(abs(rowNx2 - rowNx3)), ]
      x5 <- rbind(x4, x2)
      # Everything in the group except the kept pair is marked for removal.
      x[is.na(match(row.names(x), row.names(x5))), ]
    })
    indxNew1 <- collect_rows(lst1)
  } else {
    indxNew1 <- numeric(0)
  }

  # ---- Pass 2: duplicated rows with dummy == 0 -----------------------------
  dat0 <- subset(dat[indx, ], dummy == 0)
  if (nrow(dat0) == 0) {
    return(drop_rows(indxNew1))
  }

  dat20 <- dat0[order(dat0$dimension), ]
  indx0 <- as.numeric(row.names(dat20))
  names(indx0) <- (seq_along(indx0) - 1) %/% 2 + 1
  # Each duplicated row together with the row that precedes it in `dat`.
  dat30 <- dat[c(indx0 - 1, indx0), ]
  dat30$id <- names(c(indx0 - 1, indx0))
  lst0 <- lapply(split(dat30, dat30$id), function(x) {
    x1 <- subset(x, dummy == 1)
    x2 <- subset(x, dummy == 0)
    # The dummy == 1 row whose dimension is closest to the duplicated rows'.
    x3 <- x1[which.min(abs(x1$dimension - unique(x2$dimension))), ]
    rowNx2 <- as.numeric(row.names(x2))
    rowNx3 <- as.numeric(row.names(x3))
    # The dummy == 0 row whose row number is nearest to that dummy == 1 row.
    x4 <- x2[which.min(abs(rowNx2 - rowNx3)), ]
    x5 <- rbind(x3, x4)
    x[is.na(match(row.names(x), row.names(x5))), ]
  })
  indxNew0 <- collect_rows(lst0)

  # ---- Combine the removals of both passes ---------------------------------
  res1Del <- dat[indxNew1, ]
  res0Del <- dat[indxNew0, ]
  indx10 <- sort(as.numeric(union(row.names(res0Del), row.names(res1Del))))
  if (length(indx10) %% 2 == 1) {
    # NOTE(review): with an odd number of rows to remove, the removal set is
    # reduced to the rows of rbind(res1Del, res0Del) that are unique by
    # content. Kept exactly as in the original; presumably meant to keep the
    # dummy 1 / dummy 0 counts balanced -- confirm intent.
    res10Del <- unique(rbind(res1Del, res0Del))
    indx10 <- sort(as.numeric(row.names(res10Del)))
  }
  drop_rows(indx10)
}
###Old Function
# fun3: drop every repeated row together with its neighbouring row.
#
# A row counts as repeated when duplicated() flags it, i.e. an identical row
# occurs earlier in `dat`. For a repeated row with dummy == 1 the row directly
# after it is dropped as well; for a repeated row with dummy == 0 the row
# directly before it is dropped as well.
#
# Args:
#   dat: data frame with a `dummy` column.
#
# Returns:
#   `dat` without the dropped rows; all rows when nothing is repeated.
fun3 <- function(dat) {
  stopifnot(is.data.frame(dat), "dummy" %in% names(dat))

  n_rows <- nrow(dat)
  is_dup <- duplicated(dat)

  # Work with row positions rather than as.numeric(row.names()): the two only
  # agree when the row names happen to be 1..n. which() also skips NA dummies.
  pos1 <- which(is_dup & dat$dummy == 1)
  pos0 <- which(is_dup & dat$dummy == 0)

  # Repeated rows plus their partner rows, limited to positions that exist.
  to_drop <- c(pos1, pos1 + 1, pos0, pos0 - 1)
  to_drop <- to_drop[to_drop >= 1 & to_drop <= n_rows]

  # A logical mask instead of dat[-to_drop, ]: a negative index of length zero
  # selects no rows at all, which would empty `dat` when nothing is repeated.
  keep <- rep(TRUE, n_rows)
  keep[to_drop] <- FALSE
  dat[keep, ]
}
## Run fun1() (from the previous post, not defined here) on both data sets
## with three parameter settings.
res5Percent <- fun1(final3New, 0.05, 50)
res5Percent1 <- fun1(final3New1, 0.05, 50)
res10Percent <- fun1(final3New, 0.10, 200)
res10Percent1 <- fun1(final3New1, 0.10, 200)
res20Percent <- fun1(final3New, 0.20, 100)
res20Percent1 <- fun1(final3New1, 0.20, 100)

## Post-process every fun1() result with fun2() ...
res5F2 <- fun2(res5Percent)
res5F2_1 <- fun2(res5Percent1)
res10F2 <- fun2(res10Percent)
res10F2_1 <- fun2(res10Percent1)
res20F2 <- fun2(res20Percent)
res20F2_1 <- fun2(res20Percent1)

## ... and with the old function fun3().
res5F3 <- fun3(res5Percent)
res5F3_1 <- fun3(res5Percent1)
res10F3 <- fun3(res10Percent)
res10F3_1 <- fun3(res10Percent1)
res20F3 <- fun3(res20Percent)
res20F3_1 <- fun3(res20Percent1)

## Names of the twelve result objects: the final3New results first, then the
## final3New1 results, which carry the "_1" suffix.
vec1 <- paste0(
  rep(c("res5F2", "res10F2", "res20F2"), 2),
  rep(c("", "_1"), each = 3)
)
vec2 <- paste0(
  rep(c("res5F3", "res10F3", "res20F3"), 2),
  rep(c("", "_1"), each = 3)
)

## One summary row per result object: total rows, rows per dummy value and
## number of distinct rows.
resTbl <- data.frame(
  Dataset = rep(rep(c("final3New", "final3New1"), each = 3), 2),
  Funct = rep(c("fun2", "fun3"), each = 6),
  do.call(
    rbind,
    lapply(c(vec1, vec2), function(objName) {
      res <- get(objName)
      c(
        N_row = nrow(res),
        Sub0_Nrow = nrow(subset(res, dummy == 0)),
        Sub1_Nrow = nrow(subset(res, dummy == 1)),
        Uniq_Nrow = nrow(unique(res))
      )
    })
  ),
  stringsAsFactors = FALSE
)
row.names(resTbl) <- c(vec1, vec2)
resTbl
# Dataset Funct N_row Sub0_Nrow Sub1_Nrow Uniq_Nrow
#res5F2 final3New fun2 276 138 138 276
#res10F2 final3New fun2 454 227 227 454
#res20F2 final3New fun2 284 142 142 284
#res5F2_1 final3New1 fun2 288 144 144 288
#res10F2_1 final3New1 fun2 488 244 244 488
#res20F2_1 final3New1 fun2 310 155 155 310
#res5F3 final3New fun3 276 138 138 276
#res10F3 final3New fun3 452 226 226 452
#res20F3 final3New fun3 284 142 142 284
#res5F3_1 final3New1 fun3 288 144 144 288
#res10F3_1 final3New1 fun3 488 244 244 488
#res20F3_1 final3New1 fun3 310 155 155 310
# Show the first four rows of the fun2() result for final3New1 at 5%.
head(res5F2_1, n = 4)
# firm year industry dummy dimension
#1 500622043 2004 1 1 1172
#2 501611886 2004 1 0 1183
#3 500778787 2004 1 1 5680
#4 500047006 2004 1 0 5692
A.K.
________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Sent: Tuesday, June 11, 2013 4:36 PM
Subject: new data
Here it is.
Cecília
More information about the R-help
mailing list