[R] merging two dataframes

Petr PIKAL petr.pikal at precheza.cz
Wed Oct 26 15:37:58 CEST 2011


> 
> Hello.
> 
> Now i tried to do what you told me.
> I used the str(fuction), and data$date1 and data3$date1 where both 
listed

You have no date1 only date. Therefore

result<- merge(data, data3, by=c("date", "name"), all=T)

takes all values from both data frames 

> dim(data)
[1] 20 15
> dim(data3)
[1] 20  9

alltogether 24 columns from which 4 are date and name columns therefore 20 
columns contain data.

> dim(result)
[1] 40 22

So the result has all 20 columns from both data frames plus one name and 
one date column and all rows from both data frames = 40. Those two sets 
are disjoint. If you had some common date and name in both data frames 
these rows would be merged on the same row in result.

Let us try this.

> data3$name[1:5] <- data$name[1:5]
> data3$date[3:5] <- data$date[3:5]

result<- merge(data, data3, by=c("date", "name"), all=T)
dim(result)
[1] 37 22

Regards
Petr


> "character". I changed "name" to character but it did not work either.
> I also changed all variables to character, with no positive result.
> 
> str(data)
> 'data.frame':   14446 obs. of  15 variables:
>  $ id     : chr  "1" "1" "1" "1" ...
>  $ compid : chr  "2514" "2514" "2514" "2514" ...
>  $ secid  : chr  "15856" "15856" "15856" "15856" ...
>  $ name   : chr  "A-pressen" "A-pressen" "A-pressen" "A-pressen" ...
>  $ period : chr  "1" "2" "3" "4" ...
>  $ date   : chr  "17.05.1980" "17.05.1981" "17.05.1982" "17.05.1983" ...
>  $ enddate: chr  "17.05.1981" "17.05.1982" "17.05.1983" "17.05.1984" ...
>  $ div    : chr  NA NA NA NA ...
>  $ ndivs  : chr  NA NA NA NA ...
>  $ posdiv : chr  NA NA NA NA ...
>  $ ddiv2  : chr  NA NA NA NA ...
>  $ ddiv3  : chr  NA NA NA NA ...
>  $ ddiv4  : chr  NA NA NA NA ...
>  $ ddiv5  : chr  NA NA NA NA ...
>  $ ddiv6  : chr  NA NA NA NA ...
> 
> str(data3)
> 'data.frame':   812354 obs. of  9 variables:
>  $ date                  : chr  "02.01.1996" "03.01.1996" "04.01.1996"
> "05.01.1996" ...
>  $ Securityid            : chr  "6001" "6001" "6001" "6001" ...
>  $ Symbol                : chr  "AAV" "AAV" "AAV" "AAV" ...
>  $ name                  : chr  "Adresseavisen" "Adresseavisen"
> "Adresseavisen" "Adresseavisen" ...
>  $ Securitytype          : chr  "Ordinary Shares" "Ordinary Shares"
> "Ordinary Shares" "Ordinary Shares" ...
>  $ Unadjusted            : chr  "200" "200" "200" "200" ...
>  $ Event.adjusted        : chr  "200" "200" "200" "200" ...
>  $ Div.and.Event.adjusted: chr  "109,7595375" "109,7595375" 
"109,7595375"
> "109,7595375" ...
>  $ Sharesissued          : chr  "1901646" "1901646" "1901646" "1901646" 
...
> 
> Here is some suitable data for "data"
> 
> > dput(data[1:20,])
> 
> structure(list(id = c("1", "1", "1", "1", "1", "1", "1", "1", 
> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"), 
>     compid = c("2514", "2514", "2514", "2514", "2514", "2514", 
>     "2514", "2514", "2514", "2514", "2514", "2514", "2514", "2514", 
>     "2514", "2514", "2514", "2514", "2514", "2514"), secid = c("15856", 
>     "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
>     "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
>     "15856", "15856", "15856", "15856", "15856"), name = c("A-pressen", 
>     "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
>     "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
>     "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
>     "A-pressen", "A-pressen", "A-pressen", "A-pressen"), period = c("1", 

>     "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", 
>     "13", "14", "15", "16", "17", "18", "19", "20"), date = 
c("17.05.1980", 
>     "17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984", 
"17.05.1985", 
>     "17.05.1986", "17.05.1987", "17.05.1988", "17.05.1989", 
"17.05.1990", 
>     "17.05.1991", "17.05.1992", "17.05.1993", "17.05.1994", 
"17.05.1995", 
>     "17.05.1996", "17.05.1997", "17.05.1998", "17.05.1999"), 
>     enddate = c("17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984", 
>     "17.05.1985", "17.05.1986", "17.05.1987", "17.05.1988", 
"17.05.1989", 
>     "17.05.1990", "17.05.1991", "17.05.1992", "17.05.1993", 
"17.05.1994", 
>     "17.05.1995", "17.05.1996", "17.05.1997", "17.05.1998", 
"17.05.1999", 
>     "17.05.2000"), div = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 
>     "0", "0", "0", "0", "0", "5", "0", "1.1", "1.2", "1", "0"
>     ), ndivs = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0", 
>     "0", "0", "0", "1", "0", "1", "1", "1", "0"), posdiv = c(NA, 
>     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "1", 
>     NA, "1", "1", "1", NA), ddiv2 = c(NA, NA, NA, NA, NA, NA, 
>     NA, NA, NA, NA, "0", "0", "0", "0", "0", NA, "0", "1", NA, 
>     NA), ddiv3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
>     "0", "0", "0", "0", "0", "0", "0", "0", "-1"), ddiv4 = c(NA, 
>     NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0", "0", 
>     "0", "0", "0", "0", "0"), ddiv5 = c(NA, NA, NA, NA, NA, NA, 
>     NA, NA, NA, NA, NA, NA, NA, "0", "0", "0", "0", "0", "0", 
>     "0"), ddiv6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
>     NA, NA, NA, "0", "0", "0", "0", "0", "0")), .Names = c("id", 
> "compid", "secid", "name", "period", "date", "enddate", "div", 
> "ndivs", "posdiv", "ddiv2", "ddiv3", "ddiv4", "ddiv5", "ddiv6"
> ), row.names = c(NA, 20L), class = "data.frame")
> 
> 
> 
> 
> Here is some suitable data for "data3":
> 
> > dput(data3[1:20,])
> 
> structure(list(date = c("02.01.1996", "03.01.1996", "04.01.1996", 
> "05.01.1996", "08.01.1996", "09.01.1996", "10.01.1996", "11.01.1996", 
> "12.01.1996", "15.01.1996", "16.01.1996", "17.01.1996", "18.01.1996", 
> "19.01.1996", "22.01.1996", "23.01.1996", "24.01.1996", "25.01.1996", 
> "26.01.1996", "29.01.1996"), Securityid = c("6001", "6001", "6001", 
> "6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001", 
> "6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001", 
> "6001"), Symbol = c("AAV", "AAV", "AAV", "AAV", "AAV", "AAV", 
> "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", 
> "AAV", "AAV", "AAV", "AAV", "AAV"), name = c("Adresseavisen", 
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen", 
> "Adresseavisen", "Adresseavisen", "Adresseavisen"), Securitytype =
> c("Ordinary Shares", 
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary 
Shares", 
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary 
Shares", 
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary 
Shares", 
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary 
Shares", 
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares"), Unadjusted =
> c("200", 
> "200", "200", "200", "200", "200", "200", "200", "200", "200", 
> "200", "200", "200", "200", "200", "200", "200", "200", "200", 
> "200"), Event.adjusted = c("200", "200", "200", "200", "200", 
> "200", "200", "200", "200", "200", "200", "200", "200", "200", 
> "200", "200", "200", "200", "200", "200"), Div.and.Event.adjusted =
> c("109,7595375", 
> "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", 
> "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", 
> "109,7595375", "109,7595375", "109,7595375", "109,7595375", 
"109,7595375", 
> "109,7595375", "109,7595375", "109,7595375", "109,7595375"), 
>     Sharesissued = c("1901646", "1901646", "1901646", "1901646", 
>     "1901646", "1901646", "1901646", "1901646", "1901646", "1901646", 
>     "1901646", "1901646", "1901646", "1901646", "1901646", "1901646", 
>     "1901646", "1901646", "1901646", "1901646")), .Names = c("date", 
> "Securityid", "Symbol", "name", "Securitytype", "Unadjusted", 
> "Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"), row.names =
> c(NA, 
> 20L), class = "data.frame")
> 
> 
> 
> When I run the function:
> 
> data4<-merge(data,data3, by=c("name","date1"), all=T)
> 
> > dput(data4[1:20,])
> 
> structure(list(name = c("A-pressen", "A-pressen", "A-pressen", 
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen", 
> "A-pressen", "A-pressen"), date = c("01.02.1999", "01.02.2000", 
> "01.02.2001", "01.02.2002", "01.03.1999", "01.03.2000", "01.03.2001", 
> "01.03.2002", "01.04.2003", "01.06.1999", "01.06.2001", "01.07.1999", 
> "01.07.2002", "01.07.2003", "01.08.2000", "01.08.2001", "01.08.2002", 
> "01.08.2003", "01.09.1999", "01.09.2000"), id = c(NA_character_, 
> NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, 
> NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, 
> NA_character_, NA_character_, NA_character_, NA_character_, 
NA_character_, 
> NA_character_, NA_character_, NA_character_, NA_character_), 
>     compid = c(NA_character_, NA_character_, NA_character_, 
NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_
>     ), secid = c(NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_), period = c(NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_), enddate = c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), div = c(NA_character_, 

>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ndivs = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), posdiv = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ddiv2 = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ddiv3 = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ddiv4 = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ddiv5 = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), ddiv6 = 
c(NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_, NA_character_, 
>     NA_character_, NA_character_, NA_character_), Securityid = 
c("15856", 
>     "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
>     "15856", "15856", "15856", "15856", "15856", "15856", "15856", 
>     "15856", "15856", "15856", "15856", "15856"), Symbol = c("APR", 
>     "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", 
>     "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", 
>     "APR"), Securitytype = c("Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", 
>     "Ordinary Shares", "Ordinary Shares", "Ordinary Shares"), 
>     Unadjusted = c("120", "140", "160", "105", "110", "190", 
>     "160", "112", "115", "120", "150", "127", "106,5", "154", 
>     "155", "160", "111", "155", "127", "205"), Event.adjusted = c("120", 

>     "140", "160", "105", "110", "190", "160", "112", "115", "120", 
>     "150", "127", "106,5", "154", "155", "160", "111", "155", 
>     "127", "205"), Div.and.Event.adjusted = c("111,4092308", 
>     "129,9774359", "148,545641", "100,1538462", "102,1251282", 
>     "176,3979487", "148,545641", "106,8307692", "109,6923077", 
>     "111,4092308", "143,0769231", "117,9081026", "101,5846154", 
>     "154", "143,9035897", "152,6153846", "105,8769231", "155", 
>     "117,9081026", "190,3241026"), Sharesissued = c("8839643", 
>     "8839643", "8854307", "8866191", "8839643", "8839643", "8854307", 
>     "8867791", "8885537", "8839643", "8866191", "8839643", "8885537", 
>     "8903842", "8854307", "8866191", "8885537", "8903842", "8839643", 
>     "8854307")), .Names = c("name", "date", "id", "compid", "secid", 
> "period", "enddate", "div", "ndivs", "posdiv", "ddiv2", "ddiv3", 
> "ddiv4", "ddiv5", "ddiv6", "Securityid", "Symbol", "Securitytype", 
> "Unadjusted", "Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"
> ), row.names = c(NA, 20L), class = "data.frame")
> 
> 
> If I want to just keep the observations from "data" (14446 observations)
> merged with "data3". Is it correct to use:
> data4<-merge(data,data3, by=c("name","date1"), all=T)
> or should I use:
> data4<-merge(data,data3, by=c("name","date1"), all.x=F)? 
> (Then I get ca 14000 obs, but "NA" in all variables from "data3".
> 
> In advance, thank you.
> 
> 
> 
> --
> View this message in context: http://r.789695.n4.nabble.com/merging-two-
> dataframes-tp3932869p3940157.html
> Sent from the R help mailing list archive at Nabble.com.
> 
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide 
http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.



More information about the R-help mailing list