# [R] matched samples, dataframe, panel data

arun smartpink111 at yahoo.com
Mon Jun 17 03:33:28 CEST 2013

```Hi,
This gives me more combinations than you got with SAS.  Also, this selects the one with minimum dimension between duplicates.

# Remove fully-duplicated matched pairs from a matched-sample data frame.
#
# `dat` is assumed to hold matched rows in adjacent (dummy == 1, dummy == 0)
# pairs, as produced by fun1New() -- TODO confirm against the caller's data.
# For each duplicated row, the whole pair it belongs to is dropped: a
# duplicated dummy == 1 row also removes the row below it (its control), a
# duplicated dummy == 0 row also removes the row above it (its treatment).
# When no rows are duplicated, `dat` is returned unchanged; this guard avoids
# `dat[-integer(0), ]`, which in R selects zero rows and would wipe the frame.
fun3<- function(dat){
if(any(duplicated(dat))){
indx<- which(duplicated(dat))    # row positions of the duplicated rows
row.names(dat)<-1:nrow(dat)      # sequential row names double as row positions
dat1<- subset(dat[indx,],dummy==1)   # duplicated treatment rows
dat0<- subset(dat[indx,],dummy==0)   # duplicated control rows
indx1<- as.numeric(row.names(dat1))
indx11<- sort(c(indx1,indx1+1))  # each treatment row plus the control just below it
indx0<- as.numeric(row.names(dat0))
indx00<- sort(c(indx0,indx0-1))  # each control row plus the treatment just above it
indx10<- sort(c(indx11,indx00))  # positions to drop (repeats are fine for negative indexing)
res <- dat[-indx10,]
res
}
else {
dat
}
}

# Build a matched sample: within each year x industry cell, pair each
# dummy == 1 (treatment) firm with a dummy == 0 (control) firm whose
# `dimension` is (a) less than `number` apart in absolute terms and
# (b) within +/- `percent` of the other firm's dimension (tested in both
# directions), then keep, per treatment firm, the control with the closest
# dimension and strip duplicated pairs via fun3().
#
# dat:     data frame; assumed columns year, industry, dummy, dimension
#          -- TODO confirm against the poster's dataset.
# percent: relative tolerance, e.g. 0.10 for +/- 10%.
# number:  absolute cap on the dimension difference.
# Returns a data frame of stacked (treatment, control) row pairs.
fun1New<-function(dat,percent,number){
lst1<- split(dat,list(dat\$year,dat\$industry))   # one data frame per year x industry cell
lst2<- lst1[lapply(lst1,nrow)>1]                # only cells with >= 2 firms can form a pair
lst3<- lapply(lst2,function(x) {
CombN1<-combn(seq_len(nrow(x)),2)               # all 2-row combinations within the cell
lapply(split(CombN1,col(CombN1)),function(y){
x1<-x[y,]
x1[sum(x1\$dummy)==1,]                           # keep only mixed pairs: one treatment, one control
})
})

lst4<- lapply(lst3,function(x) x[lapply(x,nrow)>0])   # drop same-dummy (now empty) pairs
lst5<- lst4[lapply(lst4,length)>0]                    # drop cells left with no pairs
lst6<- lapply(lst5,function(x){
lapply(x,function(y){
x1<- abs(diff(y\$dimension))< number             # absolute-closeness test
x2<- y\$dimension[2]+ (y\$dimension[2]*percent)
x3<- y\$dimension[2]- (y\$dimension[2]*percent)
x4<- y\$dimension[1]+ (y\$dimension[1]*percent)
x5<- y\$dimension[1]- (y\$dimension[1]*percent)
x6<- (y\$dimension[1] < x2) & (y\$dimension[1] > x3)   # row 1 inside the band around row 2
x7<- (y\$dimension[2]< x4) & (y\$dimension[2]> x5)     # row 2 inside the band around row 1
y[((x6 & x1)| (x7 & x1)),]                      # keep the pair if close in either direction
})
}
)
lst7<- lapply(lst6,function(x) x[lapply(x,nrow)>0])
lst8<- lst7[lapply(lst7,length)>0]
lst9<- lapply(lst8,function(x) do.call(rbind,x))      # stack surviving pairs per cell
lst10<-lapply(lst9,function(x) {
row.names(x)<- 1:nrow(x)                        # sequential row names double as row positions
x1<-x[x\$dummy==1,]
do.call(rbind,lapply(split(x1,x1\$dimension),function(y){
# rows are stacked as (treatment, control) pairs, so each control sits one row below
indx1<-sort(c(as.numeric(row.names(y)),as.numeric(row.names(y))+1))
x2<-x[indx1,]
x3<- subset(x2,dummy==0)
x4<-x3[which.min(abs(x2\$dimension[1]-x3\$dimension)),]   # control closest to the treatment's dimension
rbind(x2[1,],x4)
}))
})
res<- do.call(rbind,lapply(lst10,fun3))         # fun3() removes duplicated pairs
row.names(res)<- 1:nrow(res)
res
}

####1st dataset
# NOTE(review): `final3New` is the poster's dataset (attached to the original
# email, not defined in this file).  The `#[1] ...` comments record the output
# observed on that data; equal dummy == 0 and dummy == 1 counts confirm the
# result consists of complete matched pairs.

res10PercentHigh<- fun1New(final3New,0.10,500000000)
dim(res10PercentHigh)
#[1] 764   5
dim(unique(res10PercentHigh))
#[1] 764   5
nrow(subset(res10PercentHigh,dummy==0))
#[1] 382
nrow(subset(res10PercentHigh,dummy==1))
#[1] 382
res10PercentLow<- fun1New(final3New,0.10,50)
dim(res10PercentLow)
#[1] 294   5
dim(unique(res10PercentLow))
#[1] 294   5
nrow(subset(res10PercentLow,dummy==0))
#[1] 147
nrow(subset(res10PercentLow,dummy==1))
#[1] 147

res5PercentHigh<- fun1New(final3New,0.05,500000000)
dim(res5PercentHigh)
#[1] 630   5
dim(unique(res5PercentHigh))
#[1] 630   5
nrow(subset(res5PercentHigh,dummy==0))
#[1] 315
nrow(subset(res5PercentHigh,dummy==1))
#[1] 315

res5PercentLow<- fun1New(final3New,0.05,50)
dim(res5PercentLow)
#[1] 294   5

dim(unique(res5PercentLow))
#[1] 294   5
nrow(subset(res5PercentLow,dummy==0))
#[1] 147
nrow(subset(res5PercentLow,dummy==1))
#[1] 147

#######2nd dataset
# NOTE(review): same checks as the 1st dataset, run on `final3New1` (also from
# the poster's attachment, not defined here).  Output comments record observed
# results; equal dummy counts again indicate complete matched pairs.
res10PercentHigh<- fun1New(final3New1,0.10,500000000)
dim(res10PercentHigh)
#[1] 760   5
dim(unique(res10PercentHigh))
#[1] 760   5

nrow(subset(res10PercentHigh,dummy==0))
#[1] 380
nrow(subset(res10PercentHigh,dummy==1))
#[1] 380
res10PercentLow<- fun1New(final3New1,0.10,100)
dim(res10PercentLow)
#[1] 418   5

dim(unique(res10PercentLow))
#[1] 418   5
nrow(subset(res10PercentLow,dummy==0))
#[1] 209
nrow(subset(res10PercentLow,dummy==1))
#[1] 209

res5PercentHigh<- fun1New(final3New1,0.05,500000000)
dim(res5PercentHigh)
#[1] 640   5
dim(unique(res5PercentHigh))
#[1] 640   5

nrow(subset(res5PercentHigh,dummy==0))
#[1] 320
nrow(subset(res5PercentHigh,dummy==1))
#[1] 320
res5PercentLow<- fun1New(final3New1,0.05,50)
dim(res5PercentLow)
#[1] 310   5

dim(unique(res5PercentLow))
#[1] 310   5
nrow(subset(res5PercentLow,dummy==0))
#[1] 155
nrow(subset(res5PercentLow,dummy==1))
#[1] 155

res20PercentHigh<- fun1New(final3New1,0.20,500000000)
dim(res20PercentHigh)
#[1] 846   5

dim(unique(res20PercentHigh))
#[1] 846   5

nrow(subset(res20PercentHigh,dummy==0))
#[1] 423
nrow(subset(res20PercentHigh,dummy==1))
#[1] 423

A.K.

----- Original Message -----
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Cc:
Sent: Sunday, June 16, 2013 5:57 AM
Subject: RE: matched samples, dataframe, panel data

In the script I send you and with the file that  I sent with it and with the old function 1 and 2
it got 350 combinations and it was possible to have more

Now with the new fun1 and fun3 I have fewer, so it is not ok, is it?

> res10Percent<- fun1New(final3New2,0.10,500000000)
> res10F3<- fun3(res10Percent)
> dim(res10F3)
[1] 600   5
> nrow(subset(res10F3,dummy==0))
[1] 300
> nrow(subset(res10F3,dummy==1))
[1] 300

Sorry for making you spend so much time. I thought it would be easier.

Cecília

________________________________________
De: arun [smartpink111 at yahoo.com]
Enviado: sexta-feira, 14 de Junho de 2013 23:09
Para: Cecilia Carmo
Assunto: Re: matched samples, dataframe, panel data

One thing I forgot to mention.  I used fun3() because I found fun2() still has some problems with getting the correct dimensions.  You can check the results of fun1() and fun3() and see whether all the combinations are obtained.  Then, if I get a chance, I will correct fun2().
"""""
And you conclude that they are the same!
"""""""
Here, also I am not concluding anything.
A.K.

----- Original Message -----
From: arun <smartpink111 at yahoo.com>
To: Cecilia Carmo <cecilia.carmo at ua.pt>
Cc: R help <r-help at r-project.org>
Sent: Friday, June 14, 2013 6:05 PM
Subject: Re: matched samples, dataframe, panel data

Hi,
I changed the fun1().  Now, it should be possible to get all the possible combinations within each group.

# Earlier revision of fun1New() (this email chronologically precedes the reply
# quoted at the top of the file).  Same matching idea: within each
# year x industry cell, keep mixed (dummy == 1, dummy == 0) pairs whose
# `dimension` values differ by less than `number` -- but here the
# +/- `percent` band is tested in ONE direction only (row 1 against the band
# around row 2's dimension), and there is no closest-control selection or
# duplicate-pair removal inside this function.
fun1New<- function(dat,percent,number){
lst1<- split(dat,list(dat\$year,dat\$industry))   # one data frame per year x industry cell
lst2<- lst1[lapply(lst1,nrow)>1]                # only cells with >= 2 firms can form a pair
lst3<- lapply(lst2,function(x) {
CombN1<-combn(seq_len(nrow(x)),2)               # all 2-row combinations within the cell
lapply(split(CombN1,col(CombN1)),function(y){
x1<-x[y,]
x1[sum(x1\$dummy)==1,]                           # keep only mixed pairs: one treatment, one control
})
})

lst4<- lapply(lst3,function(x) x[lapply(x,nrow)>0])   # drop same-dummy (now empty) pairs
lst5<- lst4[lapply(lst4,length)>0]                    # drop cells left with no pairs
lst6<- lapply(lst5,function(x){
lapply(x,function(y){
x1<- abs(diff(y\$dimension))< number             # absolute-closeness test
x2<- y\$dimension[2]+ (y\$dimension[2]*percent)
x3<- y\$dimension[2]- (y\$dimension[2]*percent)
x4<- (y\$dimension[1] < x2) & (y\$dimension[1] > x3)   # one-directional percent band
y[x4 & x1,]
})
}
)
lst7<- lapply(lst6,function(x) x[lapply(x,nrow)>0])
lst8<- lst7[lapply(lst7,length)>0]
res<- do.call(rbind,lapply(lst8,function(x){
do.call(rbind,x)
}))
row.names(res)<- 1:nrow(res)                    # reset row names so fun3() can use them as positions
res
}

##Applying fun1New
# NOTE(review): raw fun1New() output from this revision still contains
# duplicated pairs; fun3() is applied to these results further on to remove
# them.  `final3New` / `final3New1` come from the poster's attachment.
res5Percent<- fun1New(final3New,0.05,50)
dim(res5Percent)
#[1] 718   5
res5PercentHigh<- fun1New(final3New,0.05,500000)
dim(res5PercentHigh)
#[1] 2788    5

res5Percent1<- fun1New(final3New1,0.05,50)
dim(res5Percent1)
#[1] 870   5
res5Percent1High<- fun1New(final3New1,0.05,500000)
dim(res5Percent1High)
#[1] 2902    5

res10Percent<- fun1New(final3New,0.10,200)
dim(res10Percent)
#[1] 2928    5
res10Percent1<- fun1New(final3New1,0.10,200)
dim(res10Percent1)
#[1] 3092    5

# Remove fully-duplicated matched pairs from a matched-sample data frame.
#
# Assumes rows come in adjacent (dummy == 1, dummy == 0) matched pairs, as
# produced by fun1New() (which resets row names to 1:nrow).  For each
# duplicated row the whole pair is dropped: a duplicated dummy == 1 row also
# removes the row below it (its control); a duplicated dummy == 0 row also
# removes the row above it (its treatment).
#
# FIX: guard the no-duplicates case.  The original unconditionally did
# `dat[-indx10, ]`; with no duplicates `indx10` is empty and
# `dat[-numeric(0), ]` selects ZERO rows, silently wiping the whole data
# frame.  The later fun3() revision in this thread applies the same guard.
fun3<- function(dat){
if(any(duplicated(dat))){
indx<- which(duplicated(dat))      # row positions of the duplicated rows
row.names(dat)<- 1:nrow(dat)       # sequential row names double as row positions
dat1<- subset(dat[indx,],dummy==1) # duplicated treatment rows
dat0<- subset(dat[indx,],dummy==0) # duplicated control rows
indx1<- as.numeric(row.names(dat1))
indx11<- sort(c(indx1,indx1+1))    # treatment plus the control just below it
indx0<- as.numeric(row.names(dat0))
indx00<- sort(c(indx0,indx0-1))    # control plus the treatment just above it
indx10<- sort(c(indx11,indx00))    # positions to drop (repeats are fine when negated)
dat[-indx10,]
} else {
dat                                # nothing duplicated: return input unchanged
}
}

#Applying fun3()
# NOTE(review): fun3() strips duplicated matched pairs from the fun1New()
# results computed above; the `#[1] ...` comments record the observed output
# sizes, and the dim(unique(...)) checks at the end confirm no duplicates
# remain.
res5F3<- fun3(res5Percent)
dim(res5F3)
#[1] 278   5

res5F3High<- fun3(res5PercentHigh)
dim(res5F3High)
#[1] 546   5

res5F3_1<- fun3(res5Percent1)
# (the dim(res5F3_1) call that produced the next line appears to have been
# lost from the transcript)
#[1] 302   5
res5F3High_1<- fun3(res5Percent1High)
dim(res5F3High_1)
#[1] 570   5

res10F3<- fun3(res10Percent)
dim(res10F3)
#[1] 462   5
res10F3_1<- fun3(res10Percent1)
# (likewise, the dim(res10F3_1) call seems to be missing here)
#[1] 474   5
nrow(subset(res5F3,dummy==0))
#[1] 139
nrow(subset(res5F3,dummy==1))
#[1] 139

nrow(subset(res5F3High,dummy==1))
#[1] 273
nrow(subset(res5F3High,dummy==0))
#[1] 273

nrow(subset(res10F3,dummy==0))
#[1] 231
nrow(subset(res10F3,dummy==1))
#[1] 231
nrow(subset(res10F3_1,dummy==1))
#[1] 237
nrow(subset(res10F3_1,dummy==0))
#[1] 237
dim(unique(res5F3))
#[1] 278   5
dim(unique(res5F3High))
#[1] 546   5

dim(unique(res10F3_1))
#[1] 474   5
dim(unique(res10F3))
#[1] 462   5
A.K.

________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Sent: Friday, June 14, 2013 10:44 AM
Subject: me again

There are some matches missing. That is, it is possible to have more matches.
I'm sending you a script and the data.

Thank you.
Cecília
```