[R] Help on reducing multiple loops
John Kane
jrkrideau at yahoo.ca
Thu May 18 11:55:39 CEST 2017
Data? It's difficult to do anything without some test data.See How to make a great R reproducible example? or http://adv-r.had.co.nz/Reproducibility.html
with particular reference to the use of dput() as the best way to provide sample data.
|
|
|
| | |
|
|
|
| |
How to make a great R reproducible example?
When discussing performance with colleagues, teaching, sending a bug report or searching for guidance on mailing... | |
|
|
On Wednesday, May 17, 2017 6:10 PM, Sumanta Basak <sumanta24 at gmail.com> wrote:
Hi All,
I've a data-set on product sub-product matrix on which I'm doing multiple
calculation, but unfortunately using nested loops, the programme is taking
long time to execute. Can anyone help me how to get rid of the following
jungle? Any direction would be helpful.
GA <- "India"
verticle <- "Prod1"
prod_data <- readRDS(paste0("/Prod_ladder_",GA,"_",verticle,".rds"))
setDF(prod_data)
Final_data <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm")],!duplicated(prod_data[,c("P_KEY","Active_Prod_Id")]))
proximity_prod_mapping <- readRDS("Proximity_prod_mapping.rds")
dst_prod <- subset(prod_data[,c("P_KEY")],!duplicated(prod_data$P_KEY))
output_data <- c()
data_merge_final <- c()
system.time({
for(i in 1 : length(dst_prod)){
prod_data <- subset(prod_data,prod_data$P_KEY == dst_prod[i]) #
Subsetting data at prod level
dst_prod <-
subset(prod_data[,c("Active_Prod_Id")],!duplicated(prod_data$Active_Prod_Id))
# Finding distinct prods of active prodloyee
for(j in 1 : length(dst_prod)){
# Subsetting data at prod level for active prod
# Fetiching data for Anchor prod
prod_data1 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
== dst_prod[j])
prod_data1$Anchor_prod <- 1
anc_max_End_Date_1 <- as.Date(max(prod_data1$End_Date_1),origin =
"1970-01-01")
anc_prod_count <- sum(prod_data1$Anchor_prod)
# Fetiching data for Proximate prod
prox_prod_data <-
subset(proximity_prod_mapping[,c("Proximate_prod_ID")],proximity_prod_mapping$Anchor_prod_ID
== dst_prod[j])
prod_data2 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
%in% c(prox_prod_data))
prox_sill_count <- 0
if(nrow(prod_data2) > 0){
prod_data2$Proximity_prod <- 1
prox_max_End_Date_1 <- as.Date(max(prod_data2$End_Date_1),origin =
"1970-01-01")
prox_sill_count <- sum(prod_data2$Proximity_prod)
}
# library(plyr)
prod_data <-rbind.fill(prod_data1,prod_data2)
prod_data$exclude <- 0
prod_data$Anchor_Active_Prod_Id <- dst_prod[j]
prod_data$Start_Date_1 <- as.Date(prod_data$Start_Date_1,origin =
"1970-01-01")
prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")
if(prox_sill_count > 0){
if(nrow(prod_data) > 1){
# Trimming end date of proximity prods where end data of
proximity prod is greater that Anchor prod
if((prox_max_End_Date_1 - anc_max_End_Date_1) > 0){
prod_data$End_Date_1 <- ifelse(prod_data$Proximity_prod == 1 &
(prod_data$End_Date_1 - anc_max_End_Date_1) > 0,
anc_max_End_Date_1,prod_data$End_Date_1)
prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")
}
prod_data$exclude <- ifelse(prod_data$Proximity_prod == 1 &
(as.Date(prod_data$Start_Date_1,origin = "1970-01-01") -
anc_max_End_Date_1) > 0,1,0)
prod_data <- subset(prod_data,prod_data$exclude == 0)
prod_data <-
arrange(prod_data,prod_data$Anchor_prod,desc(prod_data$End_Date_1),prod_data$Start_Date_1)
prod_data$Anchor_prod <- ifelse(is.na
(prod_data$Anchor_prod),0,prod_data$Anchor_prod)
prod_data$Proximity_prod <- ifelse(is.na
(prod_data$Proximity_prod),0,prod_data$Proximity_prod)
prod_data$new_rec <- 0
tot_loop <- nrow(prod_data)
k=1
# Looping to map start date and end date of each row with other
rows
while(k <= tot_loop){
excl_flag <- prod_data[k,c("exclude")]
if(excl_flag == 0){
st_dt1 <- as.Date(prod_data[k,c("Start_Date_1")])
end_dt1 <- as.Date(prod_data[k,c("End_Date_1")])
prod_flag1 <- prod_data[k,c("Anchor_prod")]
if(k != nrow(prod_data)){
tot_row <- nrow(prod_data)
for(m in 1 : (tot_row -k)){
l = k+m
if(l != k){
st_dt2 <- as.Date(prod_data[l,c("Start_Date_1")])
end_dt2 <- as.Date(prod_data[l,c("End_Date_1")])
prod_flag2 <- prod_data[l,c("Anchor_prod")]
flag_excl <- prod_data[l,c("exclude")]
if(flag_excl ==0){
rec_check <- prod_data[l,c("new_rec")]
# if(rec_check == 0){
prod_data$Start_date2 <- NA
prod_data$End_date2 <- NA
new_start_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
ifelse(prod_flag1 ==
1 & prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt1,
ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt2,NA))),origin = "1970-01-01")
message(paste0("new_start_date = ",new_start_date))
new_start_date <- as.Date(new_start_date,origin =
"1970-01-01")
message(paste0("new_start_date = ",new_start_date))
new_end_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
ifelse(prod_flag1 == 1
& prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt2,
ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt1,NA))),origin = "1970-01-01")
message(paste0("new_end_date = ",new_end_date))
new_end_date <- as.Date(new_end_date,origin =
"1970-01-01")
message(paste0("new_end_date = ",new_end_date))
prod_data[l,c("Start_date2")] <-
as.Date(new_start_date,origin = "1970-01-01")
prod_data[l,c("End_date2")] <-
as.Date(new_end_date,origin = "1970-01-01")
tmp_data <- subset(prod_data,!is.na
(prod_data$Start_date2))
tmp_data$Start_Date_1 <-
as.Date(tmp_data$Start_date2,origin = "1970-01-01")
tmp_data$End_Date_1 <-
as.Date(tmp_data$End_date2,origin = "1970-01-01")
if(nrow(tmp_data)){
tmp_data$new_rec <- 1
prod_data[l,c("End_Date_1")] <-
as.Date(end_dt1,origin = "1970-01-01")
}
prod_data <- rbind(prod_data,tmp_data)
tot_row <- tot_row + nrow(tmp_data)
tot_loop <- tot_loop + nrow(tmp_data)
prod_data$Start_date2 <- NULL
prod_data$End_date2 <- NULL
# }
}
# Condition to identify true subset
# overlap <- ifelse((st_dt1 >= st_dt2 & st_dt1 <=
end_dt2) & (end_dt1 >= st_dt2 & end_dt1 <= end_dt2),1,
# ifelse((st_dt2 >= st_dt1 & st_dt2 <=
end_dt1) & (end_dt2 >= st_dt1 & end_dt2 <= end_dt1),1,0))
if((end_dt1 - st_dt2) >= 0){
if((end_dt2 - st_dt1) >= 0){
if((st_dt2 - st_dt1) >=0){
prod_data[k,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999, #if Anchor prods have overlapping
ifelse(prod_flag1 == 1 & prod_flag2 == 0,0,
ifelse(prod_flag1 == 0 & prod_flag2 == 1,1,
ifelse(prod_flag1 == 0 & prod_flag2 == 0,0,1))))
prod_data[l,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999,
ifelse(prod_flag1 == 0 & prod_flag2 == 1,0,
ifelse(prod_flag1 == 1 & prod_flag2 == 0,1,
ifelse(prod_flag1 == 0 & prod_flag2 == 0,1,0))))
}
}
}
# Condition to trim the dates as to make dates in each
observation mutually exclusive to exch other
flag_excl <- prod_data[l,c("exclude")]
if(flag_excl == 0){
if(end_dt1 > st_dt2){
if(st_dt1 >= st_dt2){
new_date <- ifelse(end_dt2 >
st_dt1,as.Date(st_dt1,origin = "1970-01-01"),as.Date(end_dt2,origin =
"1970-01-01"))
new_date <- as.Date(new_date,origin =
"1970-01-01")
old_date <-
as.Date(prod_data[l,c("End_Date_1")],origin = "1970-01-01")
old_date <- as.Date(old_date,origin =
"1970-01-01")
# prod_data[j,c("End_Date_1")] <-
ifelse(prod_flag1 == 1 & prod_flag2 == 1,as.Date(old_date,origin =
"1970-01-01"),
#
ifelse(prod_flag1 == 0 & prod_flag2 == 1,as.date(old_date, origin =
"1970-01-01"),as.Date(new_date,origin = "1970-01-01")))
prod_data[l,c("End_Date_1")] <-
as.Date(ifelse(prod_flag1 == 1 & prod_flag2 == 1,old_date,ifelse(prod_flag1
== 0 & prod_flag2 == 1,old_date,new_date)),origin = "1970-01-01")
}
}
}
}
}
}
}
k=k+1
}
}
}
# excluding non required observations
prod_data <- subset(prod_data,prod_data$exclude == 0)
prod_data$multiply_factor <- ifelse(prod_data$Anchor_prod == 1,1,
ifelse(prod_data$Proximity_prod
== 1,0.5,9999))
prod_data$recency_in_months <- (as.Date("2017-01-31") -
prod_data$End_Date_1)/30
prod_data$recency_factor <- ifelse(prod_data$recency_in_months <=
12,1,
ifelse(prod_data$recency_in_months > 12 & prod_data$recency_in_months <=
24,0.9,
ifelse(prod_data$recency_in_months > 24 & prod_data$recency_in_months <=
36,0.8,
ifelse(prod_data$recency_in_months > 36 & prod_data$recency_in_months <=
48,0.7,
ifelse(prod_data$recency_in_months > 48,0.6,9999)))))
prod_data$duration_in_months <- (prod_data$End_Date_1 -
prod_data$Start_Date_1)/30
prod_data$weight <-
prod_data$duration_in_months*prod_data$multiply_factor*prod_data$recency_factor
prod <- prod_data[1,c("Anchor_Active_Prod_Id")]
if(nrow(prod_data) > 1){
data_merge <-with(prod_data,aggregate(weight ~ P_KEY, FUN =
function(x) c(Proficiency_Score = sum(x))))
}else{
data_merge <- prod_data[1,c("P_KEY","weight")]
}
data_merge$prod <- prod_data[1,c("Anchor_Active_Prod_Id")]
data_merge_final <- rbind(data_merge_final,data_merge)
# Recency and Duration calculation goes here and final score will be
added in final data
output_data <- rbind.fill(output_data,prod_data)
}
}
Final_data <- merge(Final_data,data_merge_final,by.x= c("P_KEY",
"Active_Prod_Id"),by.y = c("P_KEY", "prod"),all.x=TRUE)
names(Final_data)[names(Final_data) == "weight"] <- "Proficiency_Score"
emerging_prod_mapping <- readRDS("5.Emerging_prod_Lookup.rds")
emerging_prod_list <-
subset(emerging_prod_mapping[,c("prod_ID")],!duplicated(emerging_prod_mapping$prod_ID))
Final_data$Emerging_Traditional <- ifelse(Final_data$Active_Prod_Id %in%
c(emerging_prod_list),"Emerging","Traditional")
Final_data$Final_Proficiency <- ifelse(Final_data$Emerging_Traditional ==
"Traditional",
ifelse(Final_data$Proficiency_Score < 12, "P0",
ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P1",
ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
48,"P2",
ifelse(Final_data$Proficiency_Score >=48 & Final_data$Proficiency_Score <
60,"P3",
ifelse(Final_data$Proficiency_Score >=60,"P4",NA))))),
ifelse(Final_data$Emerging_Traditional == "Emerging",
ifelse(Final_data$Proficiency_Score < 6, "P0",
ifelse(Final_data$Proficiency_Score >=6 & Final_data$Proficiency_Score <
12,"P1",
ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P2",
ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
30,"P3",
ifelse(Final_data$Proficiency_Score >=30,"P4",NA))))),NA))
tst <- prod_data[,c("P_KEY", "Id")]
tst <- subset(tst,!duplicated(tst))
Final_data <-
merge(Final_data,tst[,c("P_KEY","Id")],by="P_KEY",all.x=TRUE)
})
*SUMANTA BASAK*
[[alternative HTML version deleted]]
______________________________________________
R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
https://stat.ethz.ch/mailman/listinfo/r-help
PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
and provide commented, minimal, self-contained, reproducible code.
[[alternative HTML version deleted]]
More information about the R-help
mailing list