[R] How to identify outliers with values five times 99th percentile

Rui Barradas ruipbarradas at sapo.pt
Tue Sep 9 11:35:17 CEST 2014


Hello,

Try the following.

# For each column of df, collect the row indices of the values that exceed
# five times that column's 99th percentile. The result is a list with one
# integer vector of indices per column, in column order.
out <- lapply(df, function(x) {
  # na.rm = TRUE: quantile() stops with an error on missing values otherwise.
  # names = FALSE: drop the "99%" label so it cannot leak into the result.
  cutoff <- 5 * quantile(x, probs = 0.99, na.rm = TRUE, names = FALSE)
  # which() ignores NA comparisons, so missing values are never flagged.
  which(x > cutoff)
})

out

The list 'out' contains, for each column, the row indices of the outliers.
You can now extract those outliers as follows

df[[1]][out[[1]]]  # outlier values found in the first column

Etc.

Hope this helps,

Rui Barradas


Em 09-09-2014 10:21, Kuma Raj escreveu:
> I have a data frame with some extreme values which I wish to identify
> and repeat an analysis without these extreme values. How could I
> identify several columns with values which are 5 times higher than the
> 99th percentile?
>
> Sample data is pasted below.
>
>> dput(df)
>
> structure(list(ad1 = c(98, 6.9, 8.1, 56, 3.9, 6.9, 6.9, 5.8,
>
> 7.2, 20.5, 9.4, 7.6, 5.3, 7.9, 62.2, 9.2, 11.9, 8.8, 23.1, 5.4,
>
> 9.4, 56, 8.6, 20.7, 21, 10.5, 5.5, 4.3, 15.8, 6.8, 10.4, 5.1),
>
>      ad2 = c(14.9, 19.7, 1, 17.7, 14.9, 13.6, 18.8, 20.9, 46,
>
>      16.5, 11.7, 1, 9.2, 23.6, 19.7, 1, 11.4, 11, 23.1, 1, 1,
>
>      8.9, 11.3, 6.4, 15.2, 1, 17.3, 10.1, 13.3, 21.3, 12.3, 15.4
>
>      ), ad3 = c(0.91, 0.95, 10.7, 4.4, 0.43, 0.8, 3.1, 1.9, 2.3,
>
>      5.6, 3.9, 7.3, 0.37, 4.1, 15.1, 21.8, 3, 0.79, 1, 4.6, 0.61,
>
>      0.46, 0.87, 23.5, 3.8, 3.1, 0.33, 1.9, 3.2, 1.7, 0.53, 62.5
>
>      ), ad4 = c(225.5, 269.7, 326, 485.4, 193.2, 274.1, 553.2,
>
>      166.8, 435.9, 433.2, 187.1, 660.4, 235.4, 356.5, 378.8, 500.5,
>
>      323.5, 327.1, 289.5, 301.2, 291.7, 333.5, 351.7, 384.1, 347,
>
>      1354, 440.4, 189.2, 381, 252.7, 391.1, 255.1), ad5 = c(337.9,
>
>      355.6, 419.5, 798.5, 225, 355.9, 394.4, 340.6, 463.9, 291.9,
>
>      312.3, 491, 290.5, 231.9, 358, 386.4, 306.7, 440.6, 297.9,
>
>      339.3, 341.1, 366.2, 325.4, 357, 412.2, 370.2, 421.3, 346.3,
>
>      289.1, 257.4, 368, 322.6), ad6 = c(64.5, 130.6, 76, 167.8,
>
>      47.3, 117, 60.7, 91.9, 221.9, 91.1, 105.1, 110.8, 64.5, 184.5,
>
>      191.6, 259.4, 879.5, 142.1, 55.3, 123.1, 62.2, 75.2, 154.6,
>
>      100.7, 93.1, 136.7, 74.3, 41.8, 110.1, 109.1, 172.5, 87.7
>
>      ), ad7 = c(128L, 987L, 158L, 124L, 137L, 215L, 141L, 98L,
>
>      291L, 261L, 106L, 137L, 141L, 159L, 221L, 108L, 123L, 107L,
>
>      137L, 175L, 257L, 97L, 168L, 145L, 147L, 188L, 145L, 128L,
>
>      153L, 187L, 123L, 354L), ad8 = c(3.26, 3.98, 2.88, 2.85,
>
>      4.17, 3.16, 3.09, 4.35, 3.46, 3.81, 3.78, 3.81, 4.17, 4.27,
>
>      4.27, 2.97, 3.43, 3.48, 3.78, 3.86, 3.11, 3.12, 3.16, 4.24,
>
>      3.81, 3.11, 5.31, 3.75, 3.78, 3.55, 4.08, 3.5), ad9 = c(433L,
>
>      211L, 66L, 173L, 224L, 466L, 224L, 273L, 94L, 321L, 160L,
>
>      107L, 121L, 186L, 455L, 80L, 897L, 186L, 285L, 134L, 107L,
>
>      355L, 261L, 249L, 332L, 107L, 273L, 107L, 160L, 535L, 160L,
>
>      121L)), .Names = c("ad1", "ad2", "ad3", "ad4", "ad5", "ad6",
>
> "ad7", "ad8", "ad9"), class = "data.frame", row.names = c(NA,
>
> -32L))
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>



More information about the R-help mailing list