# First, let's load our libraries.
library(ggplot2)
library(caret)
library(xgboost)

# Import GSS_SM and do all necessary data cleaning.
# stringsAsFactors = TRUE keeps the categorical columns as factors on
# R >= 4.0, which the levels() checks below rely on.
gss <- read.csv("gss_sm.csv", stringsAsFactors = TRUE)
head(gss)

# Check the variables
ncol(gss)     # 33
summary(gss)
nrow(gss)     # 2867 - we will see that only about 300 are usable for our purposes
gss <- gss[, 4:33]  # Remove identification numbers and the year (2016)

# Check for outliers
boxplot(gss$age)     # No outliers
boxplot(gss$childs)  # One outlier, 8 children - leave it in
boxplot(gss$sibs)    # Many outliers, 9 and over - most likely errors
gss <- subset(gss, sibs < 9)  # Remove these errors

levels(gss$marital)   # Will collapse to one binary: ever married vs. never married
levels(gss$padeg)     # Should transform this to numbers 1-5
levels(gss$madeg)     # Same
levels(gss$partyid)   # Should transform this to numbers 1-8
levels(gss$polviews)  # Same, 1-7
levels(gss$happy)     # Examine the influence of the different categories
levels(gss$partners)  # Same, 1-9
table(gss$partners)
# Remove "1 or More, # Unknown": there are only 9 such rows, and assigning
# them a count could bias results
gss <- subset(gss, partners != "1 or More, # Unknown")

boxplot(gss$wtssall)  # The 11 outliers have a score of over 3
# These ended up biasing results, so I removed them
gss <- subset(gss, wtssall < 3)

# I ran checks on the other variables; no changes were needed, so I deleted
# that code.

# Convert some variables to numeric. Names follow the original data's naming
# scheme, adding "num" where necessary. Levels that don't match (including
# NAs) come back from switch() as NULL; they are imputed below.
gss$degnum <- sapply(as.character(gss$degree), switch,
                     "Lt High School" = 1, "High School" = 2,
                     "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                     USE.NAMES = FALSE)
gss$padegnum <- sapply(as.character(gss$padeg), switch,
                       "Lt High School" = 1, "High School" = 2,
                       "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                       USE.NAMES = FALSE)
gss$madegnum <- sapply(as.character(gss$madeg), switch,
                       "Lt High School" = 1, "High School" = 2,
                       "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                       USE.NAMES = FALSE)
gss$partynum <- sapply(as.character(gss$partyid), switch,
                       "Strong Republican" = 1, "Not Str Republican" = 2,
                       "Ind,near Rep" = 3, "Independent" = 4,
                       "Other Party" = 5, "Ind,near Dem" = 6,
                       "Not Str Democrat" = 7, "Strong Democrat" = 8,
                       USE.NAMES = FALSE)
gss$polnum <- sapply(as.character(gss$polviews), switch,
                     "Extremely Conservative" = 1, "Conservative" = 2,
                     "Slightly Conservative" = 3, "Moderate" = 4,
                     "Slightly Liberal" = 5, "Liberal" = 6,
                     "Extremely Liberal" = 7, USE.NAMES = FALSE)
gss$partrnum <- sapply(as.character(gss$partners), switch,
                       "No Partners" = 1, "1 Partner" = 2, "2 Partners" = 3,
                       "3 Partners" = 4, "4 Partners" = 5, "5-10 Partners" = 6,
                       "11-20 Partners" = 7, "21-100 Partners" = 8,
                       USE.NAMES = FALSE)

# Convert the lower end of each income range to a number
gss$incnum <- as.integer(substr(as.character(gss$income_rc), 5,
                                nchar(as.character(gss$income_rc))))
boxplot(gss$incnum)  # Only one respondent is at 170K or over; leave that one in

# Remove the original variables, plus redundant variables
gss[, c("degree", "padeg", "madeg", "partners", "partyid", "polviews",
        "income_rc", "pres12", "kids", "siblings", "ageq")] <- NULL
gss$evermar <- ifelse(gss$marital != "Never Married", 1, 0)
gss$marital <- NULL

# Impute the NULLs in education and politics with assumed middle categories:
# High School (2) for the degrees, Independent (4) and Moderate (4) for the
# political variables
gss$degnum <- as.integer(gsub("NULL", 2, gss$degnum))
gss$padegnum <- as.integer(gsub("NULL", 2, gss$padegnum))
gss$madegnum <- as.integer(gsub("NULL", 2, gss$madegnum))
gss$partynum <- as.integer(gsub("NULL", 4, gss$partynum))
gss$polnum <- as.integer(gsub("NULL", 4, gss$polnum))
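
# The six sapply()/switch() recodings above all follow the same pattern. As
# a sketch of how they could be collapsed, here is a small helper; the names
# recode.ordinal and deg.codes are illustrative, not part of the analysis.
recode.ordinal <- function(x, codes, default = NA_integer_) {
  # Look up each value's integer code by name; unmatched values (e.g. NA)
  # fall back to `default`
  out <- unname(codes[as.character(x)])
  out[is.na(out)] <- default
  as.integer(out)
}
deg.codes <- c("Lt High School" = 1, "High School" = 2, "Junior College" = 3,
               "Bachelor" = 4, "Graduate" = 5)
# One call would then replace both the sapply/switch step and the later
# gsub("NULL", 2, ...) imputation:
# gss$degnum <- recode.ordinal(gss$degree, deg.codes, default = 2)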

# Let's look at the number of children by parent's age
ggplot(data = gss, mapping = aes(x = age, y = childs)) +
  geom_count() +
  geom_smooth(method = "loess") +
  facet_wrap(~ sex)
# Children increase steadily until the early 40s, drop, then rise again after
# 60. Why would this be?
# First, people usually have children in their twenties and thirties.
# Second, people over 60 (born in the 1950s and before) had more children;
# cultural change has led people born since then to have fewer.
# To look at completed families after this change, take a subset containing
# only people ages 40 to 60, inclusive.
mid.age <- subset(gss, age >= 40 & age <= 60)

# Let's use a gradient boosting machine.
# xgboost only accepts numeric variables, so make as many as possible numeric.
# Some decisions about where to make each split came from previous iterations.
mid.age$male <- ifelse(mid.age$sex == "Male", 1, 0)
mid.age$white <- ifelse(mid.age$race == "White", 1, 0)
mid.age$happy <- ifelse(mid.age$happy == "Not Too Happy", 0, 1)
mid.age$legal <- ifelse(mid.age$grass == "Legal", 1, 0)
mid.age$central <- ifelse(as.character(mid.age$region) %in%
                            c("E. Nor. Central", "E. Sou. Central",
                              "W. Nor. Central", "W. Sou. Central"), 1, 0)
mid.age$coastal <- ifelse(as.character(mid.age$region) %in%
                            c("Pacific", "Middle Atlantic"), 1, 0)
mid.age[, c("partrnum", "race", "sex", "region", "relig", "grass", "zodiac",
            "income16", "agegrp", "religion", "bigregion", "partners_rc",
            "polnum")] <- NULL

# Now for the boosting
set.seed(15131)  # A number another person would be unlikely to repeat
mid.complete <- na.omit(mid.age)
mid.train.rows <- createDataPartition(mid.complete$childs, p = 0.8,
                                      list = FALSE)
mid.train <- mid.complete[mid.train.rows, ]
mid.test <- mid.complete[-mid.train.rows, ]

# Convert the data to a format usable by XGBoost.
# A model frame contains a formula and the data frame columns it references.
data.training.mf <- model.frame(childs ~ ., data = mid.train)
# A model (or design) matrix contains only numeric values; factors are dummy
# coded by default.
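
# A quick illustration of that dummy coding on toy data (illustrative only):
# a three-level factor becomes an intercept plus two 0/1 columns.
toy <- data.frame(y = c(1, 2, 3), grp = factor(c("a", "b", "c")))
model.matrix(y ~ ., data = toy)
#   (Intercept) grpb grpc
# 1           1    0    0
# 2           1    1    0
# 3           1    0    1
# (attributes omitted)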
data.training.mm <- model.matrix(attr(data.training.mf, "terms"),
                                 data = mid.train)
# An XGBoost dense matrix wraps an R matrix plus optional metadata
data.training.dm <- xgb.DMatrix(data.training.mm, label = mid.train$childs,
                                missing = -1)

# Testing set
data.testing.mf <- model.frame(childs ~ ., data = mid.test)
data.testing.mm <- model.matrix(attr(data.testing.mf, "terms"),
                                data = mid.test)
data.testing.dm <- xgb.DMatrix(data.testing.mm, label = mid.test$childs,
                               missing = -1)

# Cross-validation (nrounds is passed to each call directly, not as a
# parameter)
par.base <- list(max_depth = 3, eta = 0.1, gamma = 0, colsample_bytree = 0.6,
                 min_child_weight = 0.1, subsample = 0.6)
xgbGrid <- expand.grid(nrounds = 500, max_depth = 1:5, eta = c(0.01, 0.1),
                       gamma = 0, colsample_bytree = c(0.6, 0.9),
                       min_child_weight = 0.1, subsample = 0.6)
# The default objective is regression, evaluated by RMSE; since lower RMSE is
# better, early stopping needs maximize = FALSE
model.xgb.cv <- xgb.cv(params = par.base, data = data.training.dm,
                       nrounds = 500, prediction = FALSE, print_every_n = 25,
                       early_stopping_rounds = 50, maximize = FALSE,
                       nfold = 5)
model.xgb <- xgb.train(params = par.base, data = data.training.dm,
                       nrounds = model.xgb.cv$best_iteration)
ctrl <- trainControl(method = "cv", number = 2)  # 2 folds in the interest of computation speed
model.xgb.tuned <- train(childs ~ ., data = mid.train,
                         method = "xgbTree",  # so caret uses xgboost
                         trControl = ctrl, tuneGrid = xgbGrid)
ggplot(model.xgb.tuned)
# To minimize RMSE, choose eta = 0.01 and colsample_bytree = 0.9.
# At eta = 0.01, RMSE is minimized at max_depth = 1, a degenerate one-split
# tree, so use max_depth = 2 instead.
par.tuned <- list(max_depth = 2, eta = 0.01, gamma = 0,
                  colsample_bytree = 0.9, min_child_weight = 0.1,
                  subsample = 0.6)
model.xgb.final <- xgboost(params = par.tuned, data = data.training.dm,
                           nrounds = model.xgb.cv$best_iteration)
importance <- xgb.importance(feature_names = colnames(data.training.mm),
                             model = model.xgb.final)
xgb.plot.importance(importance)
prediction <- predict(model.xgb.final, newdata = data.testing.dm)
hist(prediction)
# Predictions range only from 0.504 to 0.518, even though the target is the
# number of children. Values this close to xgboost's default base_score of
# 0.5 mean the model barely boosted at all; that is the signature of
# best_iteration collapsing to 1, which happens when xgb.cv is run with
# maximize = TRUE on an error metric like RMSE (the metric never "improves"
# upward, so early stopping fires immediately), and is why maximize = FALSE
# matters above.
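
# A natural follow-up check (a sketch): compare the predictions with the
# held-out labels using RMSE, the same error metric xgb.cv minimizes by
# default for regression.
sqrt(mean((prediction - mid.test$childs)^2))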