[R] merging two dataframes
Petr PIKAL
petr.pikal at precheza.cz
Wed Oct 26 15:37:58 CEST 2011
>
> Hello.
>
> Now i tried to do what you told me.
> I used the str(fuction), and data$date1 and data3$date1 where both
listed
You have no date1 only date. Therefore
result<- merge(data, data3, by=c("date", "name"), all=T)
takes all values from both data frames
> dim(data)
[1] 20 15
> dim(data3)
[1] 20 9
alltogether 24 columns from which 4 are date and name columns therefore 20
columns contain data.
> dim(result)
[1] 40 22
So the result has all 20 columns from both data frames plus one name and
one date column and all rows from both data frames = 40. Those two sets
are disjoint. If you had some common date and name in both data frames
these rows would be merged on the same row in result.
Let us try this.
> data3$name[1:5] <- data$name[1:5]
> data3$date[3:5] <- data$date[3:5]
result<- merge(data, data3, by=c("date", "name"), all=T)
dim(result)
[1] 37 22
Regards
Petr
> "character". I changed "name" to character but it did not work either.
> I also changed all variables to character, with no positive result.
>
> str(data)
> 'data.frame': 14446 obs. of 15 variables:
> $ id : chr "1" "1" "1" "1" ...
> $ compid : chr "2514" "2514" "2514" "2514" ...
> $ secid : chr "15856" "15856" "15856" "15856" ...
> $ name : chr "A-pressen" "A-pressen" "A-pressen" "A-pressen" ...
> $ period : chr "1" "2" "3" "4" ...
> $ date : chr "17.05.1980" "17.05.1981" "17.05.1982" "17.05.1983" ...
> $ enddate: chr "17.05.1981" "17.05.1982" "17.05.1983" "17.05.1984" ...
> $ div : chr NA NA NA NA ...
> $ ndivs : chr NA NA NA NA ...
> $ posdiv : chr NA NA NA NA ...
> $ ddiv2 : chr NA NA NA NA ...
> $ ddiv3 : chr NA NA NA NA ...
> $ ddiv4 : chr NA NA NA NA ...
> $ ddiv5 : chr NA NA NA NA ...
> $ ddiv6 : chr NA NA NA NA ...
>
> str(data3)
> 'data.frame': 812354 obs. of 9 variables:
> $ date : chr "02.01.1996" "03.01.1996" "04.01.1996"
> "05.01.1996" ...
> $ Securityid : chr "6001" "6001" "6001" "6001" ...
> $ Symbol : chr "AAV" "AAV" "AAV" "AAV" ...
> $ name : chr "Adresseavisen" "Adresseavisen"
> "Adresseavisen" "Adresseavisen" ...
> $ Securitytype : chr "Ordinary Shares" "Ordinary Shares"
> "Ordinary Shares" "Ordinary Shares" ...
> $ Unadjusted : chr "200" "200" "200" "200" ...
> $ Event.adjusted : chr "200" "200" "200" "200" ...
> $ Div.and.Event.adjusted: chr "109,7595375" "109,7595375"
"109,7595375"
> "109,7595375" ...
> $ Sharesissued : chr "1901646" "1901646" "1901646" "1901646"
...
>
> Here is some suitable data for "data"
>
> > dput(data[1:20,])
>
> structure(list(id = c("1", "1", "1", "1", "1", "1", "1", "1",
> "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1", "1"),
> compid = c("2514", "2514", "2514", "2514", "2514", "2514",
> "2514", "2514", "2514", "2514", "2514", "2514", "2514", "2514",
> "2514", "2514", "2514", "2514", "2514", "2514"), secid = c("15856",
> "15856", "15856", "15856", "15856", "15856", "15856", "15856",
> "15856", "15856", "15856", "15856", "15856", "15856", "15856",
> "15856", "15856", "15856", "15856", "15856"), name = c("A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen"), period = c("1",
> "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12",
> "13", "14", "15", "16", "17", "18", "19", "20"), date =
c("17.05.1980",
> "17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984",
"17.05.1985",
> "17.05.1986", "17.05.1987", "17.05.1988", "17.05.1989",
"17.05.1990",
> "17.05.1991", "17.05.1992", "17.05.1993", "17.05.1994",
"17.05.1995",
> "17.05.1996", "17.05.1997", "17.05.1998", "17.05.1999"),
> enddate = c("17.05.1981", "17.05.1982", "17.05.1983", "17.05.1984",
> "17.05.1985", "17.05.1986", "17.05.1987", "17.05.1988",
"17.05.1989",
> "17.05.1990", "17.05.1991", "17.05.1992", "17.05.1993",
"17.05.1994",
> "17.05.1995", "17.05.1996", "17.05.1997", "17.05.1998",
"17.05.1999",
> "17.05.2000"), div = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
> "0", "0", "0", "0", "0", "5", "0", "1.1", "1.2", "1", "0"
> ), ndivs = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0",
> "0", "0", "0", "1", "0", "1", "1", "1", "0"), posdiv = c(NA,
> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "1",
> NA, "1", "1", "1", NA), ddiv2 = c(NA, NA, NA, NA, NA, NA,
> NA, NA, NA, NA, "0", "0", "0", "0", "0", NA, "0", "1", NA,
> NA), ddiv3 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
> "0", "0", "0", "0", "0", "0", "0", "0", "-1"), ddiv4 = c(NA,
> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "0", "0", "0",
> "0", "0", "0", "0", "0"), ddiv5 = c(NA, NA, NA, NA, NA, NA,
> NA, NA, NA, NA, NA, NA, NA, "0", "0", "0", "0", "0", "0",
> "0"), ddiv6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
> NA, NA, NA, "0", "0", "0", "0", "0", "0")), .Names = c("id",
> "compid", "secid", "name", "period", "date", "enddate", "div",
> "ndivs", "posdiv", "ddiv2", "ddiv3", "ddiv4", "ddiv5", "ddiv6"
> ), row.names = c(NA, 20L), class = "data.frame")
>
>
>
>
> Here is some suitable data for "data3":
>
> > dput(data3[1:20,])
>
> structure(list(date = c("02.01.1996", "03.01.1996", "04.01.1996",
> "05.01.1996", "08.01.1996", "09.01.1996", "10.01.1996", "11.01.1996",
> "12.01.1996", "15.01.1996", "16.01.1996", "17.01.1996", "18.01.1996",
> "19.01.1996", "22.01.1996", "23.01.1996", "24.01.1996", "25.01.1996",
> "26.01.1996", "29.01.1996"), Securityid = c("6001", "6001", "6001",
> "6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001",
> "6001", "6001", "6001", "6001", "6001", "6001", "6001", "6001",
> "6001"), Symbol = c("AAV", "AAV", "AAV", "AAV", "AAV", "AAV",
> "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV", "AAV",
> "AAV", "AAV", "AAV", "AAV", "AAV"), name = c("Adresseavisen",
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen",
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen",
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen",
> "Adresseavisen", "Adresseavisen", "Adresseavisen", "Adresseavisen",
> "Adresseavisen", "Adresseavisen", "Adresseavisen"), Securitytype =
> c("Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary
Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary
Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary
Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares", "Ordinary
Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares"), Unadjusted =
> c("200",
> "200", "200", "200", "200", "200", "200", "200", "200", "200",
> "200", "200", "200", "200", "200", "200", "200", "200", "200",
> "200"), Event.adjusted = c("200", "200", "200", "200", "200",
> "200", "200", "200", "200", "200", "200", "200", "200", "200",
> "200", "200", "200", "200", "200", "200"), Div.and.Event.adjusted =
> c("109,7595375",
> "109,7595375", "109,7595375", "109,7595375", "109,7595375",
"109,7595375",
> "109,7595375", "109,7595375", "109,7595375", "109,7595375",
"109,7595375",
> "109,7595375", "109,7595375", "109,7595375", "109,7595375",
"109,7595375",
> "109,7595375", "109,7595375", "109,7595375", "109,7595375"),
> Sharesissued = c("1901646", "1901646", "1901646", "1901646",
> "1901646", "1901646", "1901646", "1901646", "1901646", "1901646",
> "1901646", "1901646", "1901646", "1901646", "1901646", "1901646",
> "1901646", "1901646", "1901646", "1901646")), .Names = c("date",
> "Securityid", "Symbol", "name", "Securitytype", "Unadjusted",
> "Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"), row.names =
> c(NA,
> 20L), class = "data.frame")
>
>
>
> When I run the function:
>
> data4<-merge(data,data3, by=c("name","date1"), all=T)
>
> > dput(data4[1:20,])
>
> structure(list(name = c("A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen", "A-pressen", "A-pressen", "A-pressen",
> "A-pressen", "A-pressen"), date = c("01.02.1999", "01.02.2000",
> "01.02.2001", "01.02.2002", "01.03.1999", "01.03.2000", "01.03.2001",
> "01.03.2002", "01.04.2003", "01.06.1999", "01.06.2001", "01.07.1999",
> "01.07.2002", "01.07.2003", "01.08.2000", "01.08.2001", "01.08.2002",
> "01.08.2003", "01.09.1999", "01.09.2000"), id = c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_),
> compid = c(NA_character_, NA_character_, NA_character_,
NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_
> ), secid = c(NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_), period = c(NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_), enddate = c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), div = c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ndivs =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), posdiv =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ddiv2 =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ddiv3 =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ddiv4 =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ddiv5 =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), ddiv6 =
c(NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_, NA_character_,
> NA_character_, NA_character_, NA_character_), Securityid =
c("15856",
> "15856", "15856", "15856", "15856", "15856", "15856", "15856",
> "15856", "15856", "15856", "15856", "15856", "15856", "15856",
> "15856", "15856", "15856", "15856", "15856"), Symbol = c("APR",
> "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR",
> "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR", "APR",
> "APR"), Securitytype = c("Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares",
> "Ordinary Shares", "Ordinary Shares", "Ordinary Shares"),
> Unadjusted = c("120", "140", "160", "105", "110", "190",
> "160", "112", "115", "120", "150", "127", "106,5", "154",
> "155", "160", "111", "155", "127", "205"), Event.adjusted = c("120",
> "140", "160", "105", "110", "190", "160", "112", "115", "120",
> "150", "127", "106,5", "154", "155", "160", "111", "155",
> "127", "205"), Div.and.Event.adjusted = c("111,4092308",
> "129,9774359", "148,545641", "100,1538462", "102,1251282",
> "176,3979487", "148,545641", "106,8307692", "109,6923077",
> "111,4092308", "143,0769231", "117,9081026", "101,5846154",
> "154", "143,9035897", "152,6153846", "105,8769231", "155",
> "117,9081026", "190,3241026"), Sharesissued = c("8839643",
> "8839643", "8854307", "8866191", "8839643", "8839643", "8854307",
> "8867791", "8885537", "8839643", "8866191", "8839643", "8885537",
> "8903842", "8854307", "8866191", "8885537", "8903842", "8839643",
> "8854307")), .Names = c("name", "date", "id", "compid", "secid",
> "period", "enddate", "div", "ndivs", "posdiv", "ddiv2", "ddiv3",
> "ddiv4", "ddiv5", "ddiv6", "Securityid", "Symbol", "Securitytype",
> "Unadjusted", "Event.adjusted", "Div.and.Event.adjusted", "Sharesissued"
> ), row.names = c(NA, 20L), class = "data.frame")
>
>
> If I want to just keep the observations from "data" (14446 observations)
> merged with "data3". Is it correct to use:
> data4<-merge(data,data3, by=c("name","date1"), all=T)
> or should I use:
> data4<-merge(data,data3, by=c("name","date1"), all.x=F)?
> (Then I get ca 14000 obs, but "NA" in all variables from "data3".
>
> In advance, thank you.
>
>
>
> --
> View this message in context: http://r.789695.n4.nabble.com/merging-two-
> dataframes-tp3932869p3940157.html
> Sent from the R help mailing list archive at Nabble.com.
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide
http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
More information about the R-help
mailing list