[R] Trouble with FAILDATE when using the aggregate() and ts() functions

Sat Sep 30 18:45:07 CEST 2023

Dear friends,

First of all, the following is the R version I am working with:
[64-bit] C:\Program Files\R\R-4.3.1

The packages I am using are the following:
library("dplyr")
library("lubridate")
library("tidyverse")
library("readxl")
library("stats")

#reading data for distribution fitting

failuredf <- as.data.frame(read_excel("FailureData.xlsx"))
failuredf2 <- subset(failuredf, !is.na(FAILDATE))
failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){
   failuredf2$Fails = 1
 }else{
   failuredf2$Fails = 0
 }
 #reading data for distribution fitting

failuredf <- as.data.frame(read_excel("FailureData.xlsx"))

failuredf2 <- subset(failuredf, !is.na(FAILDATE))
dput(head(failuredf2))
structure(list(WONUM = c("946936", "944757", "946194", "897595",
"897714", "898221"), ASSET = c("REMLOSSANTOS", "REMCAMPANA",
"REMBAYANO", "REMCHIRIQUI", "REMDARIEN", "REMTUIRA"), INSTALLDATE =
structure(c(1039910400,
NA, 1223424000, 1200355200, 1175385600, 1220918400), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), LOCATION = c("DISTRITO-NOR", "DISTRITO-NOR",
"DISTRITO-NOR", "DISTRITO-SUR", "DISTRITO-NOR", "DISTRITO-NOR"
), DESCRIPTION = c("APOYO DE ATF- LIMPIEZA DE TANQUES DE COMBUSTIBLE
REMOLCADOR LOS SANTOS",
"APOYO DE ATF- LIMPIEZA DE LOS TANQUES DE COMBUSTIBLE  4B Y 4C (PORT Y STB
DAY TANKS) CERRO CAMPANA",
"APOYO DE ATF- LIMPIEZA Y PINTURA CUARTO DE MÁQUINAS REMOLCADOR BAYANO",
"Para FDS - Unidad de propulsión de babor con movimientos erraticos",
"Para FDS - Bunkering", "Para FDS - Problemas intermitentes en el sistema
de embrague de la unidad MCD en el SME"
), WORKODER_REPORTDATE = structure(c(1517910018, 1517211639,
1517563662, 1510655175, 1510733776, 1510909387), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), WORKTYPE = c("MC", "MC", "MC", "MC",
"MC", "MC"), COMPONENTE = c("FSE", "FSE", "FSE", "MPS", "MME",
"CSE"), COMP_DESCRIPTION = c("fuel system", "fuel system", "fuel system",
"miscellaneous propulsion & steering", "miscellaneous main engine",
"control system"), FAILURECODE = c("REM", "REM", "REM", "REM",
"REM", "REM"), FAILDATE = structure(c(1520438053, 1520438271,
1520438369, 1492283280, 1497858240, 1476959280), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), ORGID = c("ACP", "ACP", "ACP", "ACP",
"ACP", "ACP"), TICKETCLASS = c(NA, NA, NA, NA, NA, NA), PROBLEMA =
c("MENG",
"MENG", "MENG", "PROSY", "MENG", "MENG"), PROBLEMA_CODE_DESC = c("MAIN
ENGINES",
"MAIN ENGINES", "MAIN ENGINES", "Propulsion system", "MAIN ENGINES",
"MAIN ENGINES"), CAUSA = c(NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_), CAUSA_CODE_DESC =
c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_
), REMEDIO = c(NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), REMEDIO_CODE_DESC = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c(NA, 6L), class = "data.frame")

I basically want to group the dataset by FAILDATE and then just count the
number of failures. It would be equivalent to creating a pivot table where
you specify FAILDATE as rows and then count any field (in this case WONUM).
#Here I  format FAILDATE as YYYY-MM
failuredf2["faildateproc"] = format(failuredf2$FAILDATE, format='%Y-%m')

#I just select the fields I am interested in
failuredf3 <- select(failuredf2, faildateproc, Fails)

head(failuredf3)

#I do the aggregation here
failuredf3.sum <- aggregate(x = failuredf3[c("Fails")],
                     FUN = sum,
                     by = list(FailDate = failuredf3$faildateproc))

This actually works, but the problem comes when I try to extract the year
and month from FailDate when I apply function ts() (to transform the data
to a ts object (it is worth mentioning that the aggregate() function
doesn't work if I do as.Date(failuredf$FAILDATE, format='%Y-%m'). By the
way, it is worth mentioning that this is the aggregate function from the
stats package.

Look at the output when I do not apply the as.Date() funciton to
failuredf$FAILDATE:
head(failuredf3.sum)
  FailDate Fails
1  2016-09     1
2  2016-10     1
3  2016-11     3
4  2016-12     4
5  2017-01     5
6  2017-02     6
this is doing the aggregation properly, but if I apply the as.Date()
function first, look what happens when I try to aggregate:
#reading data for distribution fitting

failuredf <- as.data.frame(read_excel("FailureData.xlsx"))

failuredf2 <- subset(failuredf, !is.na(FAILDATE))

dput(head(failuredf2))

failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){
  failuredf2$Fails = 1
}else{
  failuredf2$Fails = 0
}

failuredf2["faildateproc"] = as.Date(failuredf2$FAILDATE, format='%Y-%m'))

failuredf3 <- select(failuredf2, faildateproc, Fails)

head(failuredf3)

failuredf3.sum <- aggregate(x = failuredf3[c("Fails")],
                     FUN = sum,
                     by = list(FailDate = failuredf3$faildateproc))
head(failuredf3.sum)
> head(failuredf3.sum)
    FailDate Fails
1 2016-09-27     1
2 2016-10-20     1
3 2016-11-09     1
4 2016-11-18     1
5 2016-11-22     1
6 2016-12-01     1
(this is not aggregating at all)
It seems that the aggregate() function doesn't work with actual dates.

I want to do the following:
failure_ts <- ts(failuredf3.sum$Fails, start=c(year(failuredf3.sum[1,1]),
month(failuredf3.sum[1,1)),
end=c(year(failuredf3.sum[nrow(failuredf3.sum),1],month(failuredf3.sum[nrow(failuredf3),1])),
frequency=12)
I basically want to arrange my data so that I can have the count of
failures grouped by YYYY-MM period and then be able to extract the year and
month automatically when using the ts object, so I do not have to do it
manually.

Any help and/or guidance will be greatly appreciated.

Below the complete code with the issue I am experiencing:
library("vcd")
library("stats")
library("fitdistrplus")
library("actuar")
library("betafunctions")
library("extraDistr")
library("gamlss.dist")
library("mbbefd")
library("VGAM")
library("LaplacesDemon")
library("GB2")
library("BMT")
library("bridgedist")
library("ExtDist")
library("Pareto")
library("CircStats")
library("circular")
library("lmomco")
library("sgt")
library("ChernoffDist")
library("Runuran")
library("emdbook")
library("extraDistr")
library("sadists")
library("rmutil")
library("cbinom")
library("Davies")
library("nCDunnett")
library("AEP")
library("tolerance")
library("mixSPE")
library("poweRlaw")
library("ReIns")
library("normalp")
library("SMR")
library("fpow")
library("flexsurv")
library("RTDE")
library("distributionsrd")
library("evd")
library("SuppDists")
library("EnvStats")
library("ghyp")
library("CaDENCE")
library("VarianceGamma")
library("MCMCpack")
library("coga")
library("ggamma")
library("distTails")
library("sglg")
library("ollggamma")
library("truncnorm")
library("greybox")
library("csn")
library("CompQuadForm")
library("NormalLaplace")
library("evir")
library("FAdist")
library("TLMoments")
library("QRM")
library("ROOPSD")
library("fExtremes")
library("revdbayes")
library("dgumbel")
library("bgumbel")
library("ugomquantreg")
library("smoothmest")
library("marg")
library("fBasics")
library("GeneralizedHyperbolic")
library("HyperbolicDist")
library("SkewHyperbolic")
library("FatTailsR")
library("statmod")
library("SuppDists")
library("GIGrvg")
library("ForestFit")
library("elfDistr")
library("ecd")
library("gld")
library("ald")
library("distr")
library("gb")
library("LambertW")
library("L1pack")
library("gambin")
library("sld")
library("SCI")
library("glogis")
library("GTDL")
library("logitnorm")
library("loglognorm")
library("minimax")
library("MittagLeffleR")
library("extremefit")
library("qrmtools")
library("Renext")
library("ParetoPosStable")
library("PhaseTypeR")
library("mapfit")
library("matrixdist")
library("stabledist")
library("stable")
library("PearsonDS")
library("rtdists")
library("MixedTS")
library("crch")
library("skewt")
library("FMStable")
library("SymTS")
library("sn")
library("triangle")
library("mc2d")
library("trapezoid")
library("tsallisqexp")
library("tweedie")
library("KScorrect")
library("vasicek")
library("vasicekreg")
library("rvMF")
library("WienR")
library("fBasics")
library("gamlss")
library("gamlss.add")
library("univariateML")
library("readxl")
library("xlsx")
library("MASS")
library("invgamma")
library("dplyr")
library("lubridate")
library("tidyverse")
library("forecast")
library("glmnet")
library("neuralnet")
library("nnfor")
library("smooth")
library("tseries")
library("greybox")
library("caret")
library("lattice")
library("doParallel")
library("pdp")
library("readxl")
library("mixtools")
library("pivottabler")
library("arsenal")
library("kableExtra")
library("janitor")
library("forecastHybrid")

#getting current working directory

getwd()

#setting up desired working directory

setwd("C:/Users/PaulBernal")

#reading data for distribution fitting

failuredf <- as.data.frame(read_excel("FailureData.xlsx"))

failuredf2 <- subset(failuredf, !is.na(FAILDATE))

failuredf2["Fails"] <- if(is.null(failuredf$FAILDATE)==FALSE){
  failuredf2$Fails = 1
}else{
  failuredf2$Fails = 0
}

#mapply(format, failuredf2$FAILDATE, tz = "America/Bogota")
failuredf2["faildateproc"] = format(failuredf2$FAILDATE, format='%Y-%m')

failuredf3 <- select(failuredf2, faildateproc, Fails)

head(failuredf3)

failuredf3.sum <- aggregate(x = failuredf3[c("Fails")],
                     FUN = sum,
                     by = list(FailDate = failuredf3$faildateproc))
head(failuredf3.sum)

failuredf3.sum %>% arrange(ym(failuredf3.sum$FailDate))

failuredf3.sum

fail_ts_t <- ts(failuredf3.sum$Fails, start=c(year(failuredf3.sum[1,1]),
month(failuredf3.sum[1,1])), end = c(2023,9), frequency = 12)
Error in as.POSIXlt.character(x, tz = tz(x)) :
  character string is not in a standard unambiguous format

Kind regards,
Paul

	[[alternative HTML version deleted]]