[R] data - matched samples, dataframes, panel data

Cecilia Carmo cecilia.carmo at ua.pt
Sat Jun 8 12:06:25 CEST 2013

```It seems to do all I want:
1. matchs the greatest  number of firms within an year and industry, by dimension
2. diferences between dimensions should not be more than 10% or/and 200 (if I want just the 10%
restrition I think I could substitute 200 for a big number that is impossible to exist)
3. if there are many cases that verif y 2. then it chooses the closest.

Thank you very much.

Cecília Carmo

________________________________________
De: arun [smartpink111 at yahoo.com]
Para: Cecilia Carmo
Cc: R help
Assunto: Re: data

Hi,
Try this:
dim(final3New)
#[1] 5369    5

#Inside the split within split, dummy==1 for the first row.  For lists that have many rows, I selected the row with dummy==0 (from the rest) using the #condition that the absolute difference between the dimensions of those rows and the first row dimension was minimum (after I applied the first #constraint).  I guess you wanted to select only a pair of rows (dummy=0 and dummy=1) for each lists.

fun1<- function(dat,percent,number){
lst1<- split(dat,list(dat\$year,dat\$industry))
lst1New<- lapply(lst1,function(x) x[!(all(x\$dummy==0)|all(x\$dummy==1)),])
lst2<- lst1New[lapply(lst1New,nrow)>0]
lst3<- lapply(lst2,function(x){
lapply(x\$dimension,function(y){
x1<- x[(y < (x\$dimension+(x\$dimension*percent))) & (y > (x\$dimension-(x\$dimension*percent))),]
})
})
lst4<- lapply(lst3,function(x){
lst<- lapply(x,function(y){
y[!all(y\$dummy==0),]
})
lstNew<- lst[lapply(lst,nrow)>1]
lstNew1<- unique(lstNew)
})
lst5<- lst4[lapply(lst4,length)>0]
lst6<- lapply(lst5,function(x) {
lst<- lapply(x,function(y){
y[!all(y\$dummy==1),]
})
lst[lapply(lst,nrow)>0]
})
lst7<- lapply(lst6,function(x){
lst<- lapply(x,function(y) {
x1<- y[1,]
x2<- y[-1,]
x3<- subset(x2,dummy==0)
x4<- x3[which.min(abs(x1\$dimension-x3\$dimension)),]
rbind(x1,x4)
})
lstNew<-unique(lst)
lstNew1<- lapply(lstNew,function(x){
x[abs(diff(x\$dimension)) < number,]
})
lstNew1[lapply(lstNew1,nrow)>0]
})
lst8<- lst7[lapply(lst7,length)>0]
res<- do.call(rbind,lapply(lst8,function(x){
do.call(rbind,x)
})
)
row.names(res)<- 1:nrow(res)
res}

res10Percent<- fun1(final3New,0.1,200)
dim(res10Percent)
#[1] 508   5
nrow(subset(res10Percent,dummy==0))
#[1] 254
nrow(subset(res10Percent,dummy==1))
#[1] 254
#       firm year industry dummy dimension
#1 500622043 2004        1     1      1198
#2 501611886 2004        1     0      1208
#3 501164600 2005        1     1      1332
#4 504243349 2005        1     0      1455
#5 500862893 2006        1     1      5324
#6 501744860 2006        1     0      5453

res5Percent<- fun1(final3New,0.05,200)
dim(res5Percent)
#[1] 548   5

nrow(subset(res5Percent,dummy==0))
#[1] 274
nrow(subset(res5Percent,dummy==1))
#[1] 274

res5percent1<-fun1(final3New,0.05,50)
dim(res5percent1)
#[1] 302   5
nrow(subset(res5percent1,dummy==0))
#[1] 151
nrow(subset(res5percent1,dummy==1))
#[1] 151

Hope it helps.

A.K.

________________________________
From: Cecilia Carmo <cecilia.carmo at ua.pt>
To: arun <smartpink111 at yahoo.com>
Sent: Friday, June 7, 2013 7:30 PM
Subject: data

I'm sending the data.
Thank you very much.
Cecília

The code

lst1<-split(final3,list(final3\$year,final3\$industry))
lst2<-lst1[lapply(lst1,nrow)>0]
lst3<-lapply(lst2,function(x) lapply(x\$dimension,function(y) x[(y< (x\$dimension+x\$dimension*0.10)) & (y> (x\$dimension-x\$dimension*0.10)),]))
lst4<-lapply(lst3,function(x) x[lapply(x,nrow)==2])
lst5<-lapply(lst4,function(x)x[!duplicated(x)])
lst6<-lst5[lapply(lst5,length)>0]
lst7<-lapply(lst6,function(x) {lst<-lapply(x,function(y) y[sum(y\$dummy)==1,]);lst[lapply(lst,nrow)>0]})
res<-do.call(rbind,lapply(lst7,function(x) do.call(rbind,x)))
row.names(res)<-1:nrow(res)
nrow(subset(res,res\$dummy==1))
nrow(subset(res,res\$dummy==0))
```