[R] Help on reducing multiple loops
Sumanta Basak
sumanta24 at gmail.com
Wed May 17 08:22:23 CEST 2017
Hi All,
I've a data-set on product sub-product matrix on which I'm doing multiple
calculation, but unfortunately using nested loops, the programme is taking
long time to execute. Can anyone help me how to get rid of the following
jungle? Any direction would be helpful.
GA <- "India"
verticle <- "Prod1"
prod_data <- readRDS(paste0("/Prod_ladder_",GA,"_",verticle,".rds"))
setDF(prod_data)
Final_data <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm")],!duplicated(prod_data[,c("P_KEY","Active_Prod_Id")]))
proximity_prod_mapping <- readRDS("Proximity_prod_mapping.rds")
dst_prod <- subset(prod_data[,c("P_KEY")],!duplicated(prod_data$P_KEY))
output_data <- c()
data_merge_final <- c()
system.time({
for(i in 1 : length(dst_prod)){
prod_data <- subset(prod_data,prod_data$P_KEY == dst_prod[i]) #
Subsetting data at prod level
dst_prod <-
subset(prod_data[,c("Active_Prod_Id")],!duplicated(prod_data$Active_Prod_Id))
# Finding distinct prods of active prodloyee
for(j in 1 : length(dst_prod)){
# Subsetting data at prod level for active prod
# Fetiching data for Anchor prod
prod_data1 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
== dst_prod[j])
prod_data1$Anchor_prod <- 1
anc_max_End_Date_1 <- as.Date(max(prod_data1$End_Date_1),origin =
"1970-01-01")
anc_prod_count <- sum(prod_data1$Anchor_prod)
# Fetiching data for Proximate prod
prox_prod_data <-
subset(proximity_prod_mapping[,c("Proximate_prod_ID")],proximity_prod_mapping$Anchor_prod_ID
== dst_prod[j])
prod_data2 <-
subset(prod_data[,c("P_KEY","Active_Prod_Id","Active_Prod_Nm","Start_Date_1","End_Date_1")],prod_data$Active_Prod_Id
%in% c(prox_prod_data))
prox_sill_count <- 0
if(nrow(prod_data2) > 0){
prod_data2$Proximity_prod <- 1
prox_max_End_Date_1 <- as.Date(max(prod_data2$End_Date_1),origin =
"1970-01-01")
prox_sill_count <- sum(prod_data2$Proximity_prod)
}
# library(plyr)
prod_data <-rbind.fill(prod_data1,prod_data2)
prod_data$exclude <- 0
prod_data$Anchor_Active_Prod_Id <- dst_prod[j]
prod_data$Start_Date_1 <- as.Date(prod_data$Start_Date_1,origin =
"1970-01-01")
prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")
if(prox_sill_count > 0){
if(nrow(prod_data) > 1){
# Trimming end date of proximity prods where end data of
proximity prod is greater that Anchor prod
if((prox_max_End_Date_1 - anc_max_End_Date_1) > 0){
prod_data$End_Date_1 <- ifelse(prod_data$Proximity_prod == 1 &
(prod_data$End_Date_1 - anc_max_End_Date_1) > 0,
anc_max_End_Date_1,prod_data$End_Date_1)
prod_data$End_Date_1 <- as.Date(prod_data$End_Date_1,origin =
"1970-01-01")
}
prod_data$exclude <- ifelse(prod_data$Proximity_prod == 1 &
(as.Date(prod_data$Start_Date_1,origin = "1970-01-01") -
anc_max_End_Date_1) > 0,1,0)
prod_data <- subset(prod_data,prod_data$exclude == 0)
prod_data <-
arrange(prod_data,prod_data$Anchor_prod,desc(prod_data$End_Date_1),prod_data$Start_Date_1)
prod_data$Anchor_prod <- ifelse(is.na
(prod_data$Anchor_prod),0,prod_data$Anchor_prod)
prod_data$Proximity_prod <- ifelse(is.na
(prod_data$Proximity_prod),0,prod_data$Proximity_prod)
prod_data$new_rec <- 0
tot_loop <- nrow(prod_data)
k=1
# Looping to map start date and end date of each row with other
rows
while(k <= tot_loop){
excl_flag <- prod_data[k,c("exclude")]
if(excl_flag == 0){
st_dt1 <- as.Date(prod_data[k,c("Start_Date_1")])
end_dt1 <- as.Date(prod_data[k,c("End_Date_1")])
prod_flag1 <- prod_data[k,c("Anchor_prod")]
if(k != nrow(prod_data)){
tot_row <- nrow(prod_data)
for(m in 1 : (tot_row -k)){
l = k+m
if(l != k){
st_dt2 <- as.Date(prod_data[l,c("Start_Date_1")])
end_dt2 <- as.Date(prod_data[l,c("End_Date_1")])
prod_flag2 <- prod_data[l,c("Anchor_prod")]
flag_excl <- prod_data[l,c("exclude")]
if(flag_excl ==0){
rec_check <- prod_data[l,c("new_rec")]
# if(rec_check == 0){
prod_data$Start_date2 <- NA
prod_data$End_date2 <- NA
new_start_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
ifelse(prod_flag1 ==
1 & prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt1,
ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt2,NA))),origin = "1970-01-01")
message(paste0("new_start_date = ",new_start_date))
new_start_date <- as.Date(new_start_date,origin =
"1970-01-01")
message(paste0("new_start_date = ",new_start_date))
new_end_date <- as.Date(ifelse(prod_flag1 == 1 &
prod_flag2 == 1,NA,
ifelse(prod_flag1 == 1
& prod_flag2 == 0 & end_dt2 > end_dt1 & st_dt2 < end_dt1,end_dt2,
ifelse(prod_flag1 == 0 & prod_flag2 == 1 & end_dt1 > end_dt2 & st_dt1 <
end_dt2,end_dt1,NA))),origin = "1970-01-01")
message(paste0("new_end_date = ",new_end_date))
new_end_date <- as.Date(new_end_date,origin =
"1970-01-01")
message(paste0("new_end_date = ",new_end_date))
prod_data[l,c("Start_date2")] <-
as.Date(new_start_date,origin = "1970-01-01")
prod_data[l,c("End_date2")] <-
as.Date(new_end_date,origin = "1970-01-01")
tmp_data <- subset(prod_data,!is.na
(prod_data$Start_date2))
tmp_data$Start_Date_1 <-
as.Date(tmp_data$Start_date2,origin = "1970-01-01")
tmp_data$End_Date_1 <-
as.Date(tmp_data$End_date2,origin = "1970-01-01")
if(nrow(tmp_data)){
tmp_data$new_rec <- 1
prod_data[l,c("End_Date_1")] <-
as.Date(end_dt1,origin = "1970-01-01")
}
prod_data <- rbind(prod_data,tmp_data)
tot_row <- tot_row + nrow(tmp_data)
tot_loop <- tot_loop + nrow(tmp_data)
prod_data$Start_date2 <- NULL
prod_data$End_date2 <- NULL
# }
}
# Condition to identify true subset
# overlap <- ifelse((st_dt1 >= st_dt2 & st_dt1 <=
end_dt2) & (end_dt1 >= st_dt2 & end_dt1 <= end_dt2),1,
# ifelse((st_dt2 >= st_dt1 & st_dt2 <=
end_dt1) & (end_dt2 >= st_dt1 & end_dt2 <= end_dt1),1,0))
if((end_dt1 - st_dt2) >= 0){
if((end_dt2 - st_dt1) >= 0){
if((st_dt2 - st_dt1) >=0){
prod_data[k,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999, #if Anchor prods have overlapping
ifelse(prod_flag1 == 1 & prod_flag2 == 0,0,
ifelse(prod_flag1 == 0 & prod_flag2 == 1,1,
ifelse(prod_flag1 == 0 & prod_flag2 == 0,0,1))))
prod_data[l,c("exclude")] <- ifelse(prod_flag1 ==
1 & prod_flag2 == 1,9999,
ifelse(prod_flag1 == 0 & prod_flag2 == 1,0,
ifelse(prod_flag1 == 1 & prod_flag2 == 0,1,
ifelse(prod_flag1 == 0 & prod_flag2 == 0,1,0))))
}
}
}
# Condition to trim the dates as to make dates in each
observation mutually exclusive to exch other
flag_excl <- prod_data[l,c("exclude")]
if(flag_excl == 0){
if(end_dt1 > st_dt2){
if(st_dt1 >= st_dt2){
new_date <- ifelse(end_dt2 >
st_dt1,as.Date(st_dt1,origin = "1970-01-01"),as.Date(end_dt2,origin =
"1970-01-01"))
new_date <- as.Date(new_date,origin =
"1970-01-01")
old_date <-
as.Date(prod_data[l,c("End_Date_1")],origin = "1970-01-01")
old_date <- as.Date(old_date,origin =
"1970-01-01")
# prod_data[j,c("End_Date_1")] <-
ifelse(prod_flag1 == 1 & prod_flag2 == 1,as.Date(old_date,origin =
"1970-01-01"),
#
ifelse(prod_flag1 == 0 & prod_flag2 == 1,as.date(old_date, origin =
"1970-01-01"),as.Date(new_date,origin = "1970-01-01")))
prod_data[l,c("End_Date_1")] <-
as.Date(ifelse(prod_flag1 == 1 & prod_flag2 == 1,old_date,ifelse(prod_flag1
== 0 & prod_flag2 == 1,old_date,new_date)),origin = "1970-01-01")
}
}
}
}
}
}
}
k=k+1
}
}
}
# excluding non required observations
prod_data <- subset(prod_data,prod_data$exclude == 0)
prod_data$multiply_factor <- ifelse(prod_data$Anchor_prod == 1,1,
ifelse(prod_data$Proximity_prod
== 1,0.5,9999))
prod_data$recency_in_months <- (as.Date("2017-01-31") -
prod_data$End_Date_1)/30
prod_data$recency_factor <- ifelse(prod_data$recency_in_months <=
12,1,
ifelse(prod_data$recency_in_months > 12 & prod_data$recency_in_months <=
24,0.9,
ifelse(prod_data$recency_in_months > 24 & prod_data$recency_in_months <=
36,0.8,
ifelse(prod_data$recency_in_months > 36 & prod_data$recency_in_months <=
48,0.7,
ifelse(prod_data$recency_in_months > 48,0.6,9999)))))
prod_data$duration_in_months <- (prod_data$End_Date_1 -
prod_data$Start_Date_1)/30
prod_data$weight <-
prod_data$duration_in_months*prod_data$multiply_factor*prod_data$recency_factor
prod <- prod_data[1,c("Anchor_Active_Prod_Id")]
if(nrow(prod_data) > 1){
data_merge <-with(prod_data,aggregate(weight ~ P_KEY, FUN =
function(x) c(Proficiency_Score = sum(x))))
}else{
data_merge <- prod_data[1,c("P_KEY","weight")]
}
data_merge$prod <- prod_data[1,c("Anchor_Active_Prod_Id")]
data_merge_final <- rbind(data_merge_final,data_merge)
# Recency and Duration calculation goes here and final score will be
added in final data
output_data <- rbind.fill(output_data,prod_data)
}
}
Final_data <- merge(Final_data,data_merge_final,by.x= c("P_KEY",
"Active_Prod_Id"),by.y = c("P_KEY", "prod"),all.x=TRUE)
names(Final_data)[names(Final_data) == "weight"] <- "Proficiency_Score"
emerging_prod_mapping <- readRDS("5.Emerging_prod_Lookup.rds")
emerging_prod_list <-
subset(emerging_prod_mapping[,c("prod_ID")],!duplicated(emerging_prod_mapping$prod_ID))
Final_data$Emerging_Traditional <- ifelse(Final_data$Active_Prod_Id %in%
c(emerging_prod_list),"Emerging","Traditional")
Final_data$Final_Proficiency <- ifelse(Final_data$Emerging_Traditional ==
"Traditional",
ifelse(Final_data$Proficiency_Score < 12, "P0",
ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P1",
ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
48,"P2",
ifelse(Final_data$Proficiency_Score >=48 & Final_data$Proficiency_Score <
60,"P3",
ifelse(Final_data$Proficiency_Score >=60,"P4",NA))))),
ifelse(Final_data$Emerging_Traditional == "Emerging",
ifelse(Final_data$Proficiency_Score < 6, "P0",
ifelse(Final_data$Proficiency_Score >=6 & Final_data$Proficiency_Score <
12,"P1",
ifelse(Final_data$Proficiency_Score >=12 & Final_data$Proficiency_Score <
24,"P2",
ifelse(Final_data$Proficiency_Score >=24 & Final_data$Proficiency_Score <
30,"P3",
ifelse(Final_data$Proficiency_Score >=30,"P4",NA))))),NA))
tst <- prod_data[,c("P_KEY", "Id")]
tst <- subset(tst,!duplicated(tst))
Final_data <-
merge(Final_data,tst[,c("P_KEY","Id")],by="P_KEY",all.x=TRUE)
})
*SUMANTA BASAK*
[[alternative HTML version deleted]]
More information about the R-help
mailing list