# First, let's load our libraries.
library(ggplot2)
library(caret)
library(xgboost)

# Import GSS_SM and do all necessary data cleaning.
# stringsAsFactors = TRUE keeps the categorical columns as factors on
# R >= 4.0, which the levels() checks below rely on.
gss <- read.csv("gss_sm.csv", stringsAsFactors = TRUE)
head(gss)

# Check the variables
ncol(gss)     # 33
summary(gss)
nrow(gss)     # 2867 - we will see that only about 300 are usable for our purposes
gss <- gss[, 4:33]  # Remove identification numbers and the year (2016)

# Check for outliers
boxplot(gss$age)     # No outliers
boxplot(gss$childs)  # One outlier, 8 children - leave it in
boxplot(gss$sibs)    # Many outliers, 9 and over - most likely errors
gss <- subset(gss, sibs < 9)  # Remove these errors

levels(gss$marital)   # Will collapse to one binary: ever married vs. never married
levels(gss$padeg)     # Should transform this to numbers 1-5
levels(gss$madeg)     # Same
levels(gss$partyid)   # Should transform this to numbers 1-8
levels(gss$polviews)  # Same, 1-7
levels(gss$happy)     # Examine the influence of the different categories
levels(gss$partners)  # Same, 1-9
table(gss$partners)
# Remove "1 or More, # Unknown": there are only 9 such rows, and assigning
# them a count could bias results
gss <- subset(gss, partners != "1 or More, # Unknown")

boxplot(gss$wtssall)  # The 11 outliers have a score of over 3
# These ended up biasing results, so I removed them
gss <- subset(gss, wtssall < 3)

# I ran checks on the other variables; no changes were needed, so I deleted
# that code.

# Convert some variables to numeric. Names follow the original data's naming
# scheme, adding "num" where necessary. Levels that don't match (including
# NAs) come back from switch() as NULL; they are imputed below.
gss$degnum <- sapply(as.character(gss$degree), switch,
                     "Lt High School" = 1, "High School" = 2,
                     "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                     USE.NAMES = FALSE)
gss$padegnum <- sapply(as.character(gss$padeg), switch,
                       "Lt High School" = 1, "High School" = 2,
                       "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                       USE.NAMES = FALSE)
gss$madegnum <- sapply(as.character(gss$madeg), switch,
                       "Lt High School" = 1, "High School" = 2,
                       "Junior College" = 3, "Bachelor" = 4, "Graduate" = 5,
                       USE.NAMES = FALSE)
gss$partynum <- sapply(as.character(gss$partyid), switch,
                       "Strong Republican" = 1, "Not Str Republican" = 2,
                       "Ind,near Rep" = 3, "Independent" = 4,
                       "Other Party" = 5, "Ind,near Dem" = 6,
                       "Not Str Democrat" = 7, "Strong Democrat" = 8,
                       USE.NAMES = FALSE)
gss$polnum <- sapply(as.character(gss$polviews), switch,
                     "Extremely Conservative" = 1, "Conservative" = 2,
                     "Slightly Conservative" = 3, "Moderate" = 4,
                     "Slightly Liberal" = 5, "Liberal" = 6,
                     "Extremely Liberal" = 7, USE.NAMES = FALSE)
gss$partrnum <- sapply(as.character(gss$partners), switch,
                       "No Partners" = 1, "1 Partner" = 2, "2 Partners" = 3,
                       "3 Partners" = 4, "4 Partners" = 5, "5-10 Partners" = 6,
                       "11-20 Partners" = 7, "21-100 Partners" = 8,
                       USE.NAMES = FALSE)

# Convert the lower end of each income range to a number
gss$incnum <- as.integer(substr(as.character(gss$income_rc), 5,
                                nchar(as.character(gss$income_rc))))
boxplot(gss$incnum)  # Only one respondent is at 170K or over; leave that one in

# Remove the original variables, plus redundant variables
gss[, c("degree", "padeg", "madeg", "partners", "partyid", "polviews",
        "income_rc", "pres12", "kids", "siblings", "ageq")] <- NULL
gss$evermar <- ifelse(gss$marital != "Never Married", 1, 0)
gss$marital <- NULL

# Impute the NULLs in education and politics with assumed middle categories:
# High School (2) for the degrees, Independent (4) and Moderate (4) for the
# political variables
gss$degnum <- as.integer(gsub("NULL", 2, gss$degnum))
gss$padegnum <- as.integer(gsub("NULL", 2, gss$padegnum))
gss$madegnum <- as.integer(gsub("NULL", 2, gss$madegnum))
gss$partynum <- as.integer(gsub("NULL", 4, gss$partynum))
gss$polnum <- as.integer(gsub("NULL", 4, gss$polnum))
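
# The six sapply()/switch() recodings above all follow the same pattern. As
# a sketch of how they could be collapsed, here is a small helper; the names
# recode.ordinal and deg.codes are illustrative, not part of the analysis.
recode.ordinal <- function(x, codes, default = NA_integer_) {
  # Look up each value's integer code by name; unmatched values (e.g. NA)
  # fall back to `default`
  out <- unname(codes[as.character(x)])
  out[is.na(out)] <- default
  as.integer(out)
}
deg.codes <- c("Lt High School" = 1, "High School" = 2, "Junior College" = 3,
               "Bachelor" = 4, "Graduate" = 5)
# One call would then replace both the sapply/switch step and the later
# gsub("NULL", 2, ...) imputation:
# gss$degnum <- recode.ordinal(gss$degree, deg.codes, default = 2)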

# Let's look at the number of children by parent's age
ggplot(data = gss, mapping = aes(x = age, y = childs)) +
  geom_count() +
  geom_smooth(method = "loess") +
  facet_wrap(~ sex)
# Children increase steadily until the early 40s, drop, then rise again after
# 60. Why would this be?
# First, people usually have children in their twenties and thirties.
# Second, people over 60 (born in the 1950s and before) had more children;
# cultural change has led people born since then to have fewer.
# To look at completed families after this change, take a subset containing
# only people ages 40 to 60, inclusive.
mid.age <- subset(gss, age >= 40 & age <= 60)

# Let's use a gradient boosting machine.
# xgboost only accepts numeric variables, so make as many as possible numeric.
# Some decisions about where to make each split came from previous iterations.
mid.age$male <- ifelse(mid.age$sex == "Male", 1, 0)
mid.age$white <- ifelse(mid.age$race == "White", 1, 0)
mid.age$happy <- ifelse(mid.age$happy == "Not Too Happy", 0, 1)
mid.age$legal <- ifelse(mid.age$grass == "Legal", 1, 0)
mid.age$central <- ifelse(as.character(mid.age$region) %in%
                            c("E. Nor. Central", "E. Sou. Central",
                              "W. Nor. Central", "W. Sou. Central"), 1, 0)
mid.age$coastal <- ifelse(as.character(mid.age$region) %in%
                            c("Pacific", "Middle Atlantic"), 1, 0)
mid.age[, c("partrnum", "race", "sex", "region", "relig", "grass", "zodiac",
            "income16", "agegrp", "religion", "bigregion", "partners_rc",
            "polnum")] <- NULL

# Now for the boosting
set.seed(15131)  # A number another person would be unlikely to repeat
mid.complete <- na.omit(mid.age)
mid.train.rows <- createDataPartition(mid.complete$childs, p = 0.8,
                                      list = FALSE)
mid.train <- mid.complete[mid.train.rows, ]
mid.test <- mid.complete[-mid.train.rows, ]

# Convert the data to a format usable by XGBoost.
# A model frame contains a formula and the data frame columns it references.
data.training.mf <- model.frame(childs ~ ., data = mid.train)
# A model (or design) matrix contains only numeric values; factors are dummy
# coded by default.
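
# A quick illustration of that dummy coding on toy data (illustrative only):
# a three-level factor becomes an intercept plus two 0/1 columns.
toy <- data.frame(y = c(1, 2, 3), grp = factor(c("a", "b", "c")))
model.matrix(y ~ ., data = toy)
#   (Intercept) grpb grpc
# 1           1    0    0
# 2           1    1    0
# 3           1    0    1
# (attributes omitted)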
data.training.mm <- model.matrix(attr(data.training.mf, "terms"),
                                 data = mid.train)
# An XGBoost dense matrix wraps an R matrix plus optional metadata
data.training.dm <- xgb.DMatrix(data.training.mm, label = mid.train$childs,
                                missing = -1)

# Testing set
data.testing.mf <- model.frame(childs ~ ., data = mid.test)
data.testing.mm <- model.matrix(attr(data.testing.mf, "terms"),
                                data = mid.test)
data.testing.dm <- xgb.DMatrix(data.testing.mm, label = mid.test$childs,
                               missing = -1)

# Cross-validation (nrounds is passed to each call directly, not as a
# parameter)
par.base <- list(max_depth = 3, eta = 0.1, gamma = 0, colsample_bytree = 0.6,
                 min_child_weight = 0.1, subsample = 0.6)
xgbGrid <- expand.grid(nrounds = 500, max_depth = 1:5, eta = c(0.01, 0.1),
                       gamma = 0, colsample_bytree = c(0.6, 0.9),
                       min_child_weight = 0.1, subsample = 0.6)
# The default objective is regression, evaluated by RMSE; since lower RMSE is
# better, early stopping needs maximize = FALSE
model.xgb.cv <- xgb.cv(params = par.base, data = data.training.dm,
                       nrounds = 500, prediction = FALSE, print_every_n = 25,
                       early_stopping_rounds = 50, maximize = FALSE,
                       nfold = 5)
model.xgb <- xgb.train(params = par.base, data = data.training.dm,
                       nrounds = model.xgb.cv$best_iteration)
ctrl <- trainControl(method = "cv", number = 2)  # 2 folds in the interest of computation speed
model.xgb.tuned <- train(childs ~ ., data = mid.train,
                         method = "xgbTree",  # so caret uses xgboost
                         trControl = ctrl, tuneGrid = xgbGrid)
ggplot(model.xgb.tuned)
# To minimize RMSE, choose eta = 0.01 and colsample_bytree = 0.9.
# At eta = 0.01, RMSE is minimized at max_depth = 1, a degenerate one-split
# tree, so use max_depth = 2 instead.
par.tuned <- list(max_depth = 2, eta = 0.01, gamma = 0,
                  colsample_bytree = 0.9, min_child_weight = 0.1,
                  subsample = 0.6)
model.xgb.final <- xgboost(params = par.tuned, data = data.training.dm,
                           nrounds = model.xgb.cv$best_iteration)
importance <- xgb.importance(feature_names = colnames(data.training.mm),
                             model = model.xgb.final)
xgb.plot.importance(importance)
prediction <- predict(model.xgb.final, newdata = data.testing.dm)
hist(prediction)
# Predictions range only from 0.504 to 0.518, even though the target is the
# number of children. Values this close to xgboost's default base_score of
# 0.5 mean the model barely boosted at all; that is the signature of
# best_iteration collapsing to 1, which happens when xgb.cv is run with
# maximize = TRUE on an error metric like RMSE (the metric never "improves"
# upward, so early stopping fires immediately), and is why maximize = FALSE
# matters above.
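
# A natural follow-up check (a sketch): compare the predictions with the
# held-out labels using RMSE, the same error metric xgb.cv minimizes by
# default for regression.
sqrt(mean((prediction - mid.test$childs)^2))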