[R] how to manage missing values correctly when importing a data frame

Sarah Goslee sarah.goslee at gmail.com
Wed Sep 7 15:11:06 CEST 2016


R is refusing to make unwarranted assumptions about your data.

See inline.


# it's nicer to use dput() instead of pasting raw data

Storia_RM_RT <- structure(list(Station_RM = c(1400L, 1460L, 1500L,
1520L), Sensor_RM = 2701:2704,
    Place_RM = c("Novafeltria", "Carpegna", "Pesaro", "Fano"),
    Y_init_RM = c(1959L, 1963L, 1957L, 1957L), M_init_RM = c(1L,
    1L, 1L, 1L), D_init_RM = c(1L, 1L, 1L, 1L), Long_cent_RM = c(12.289552,
    12.332614, 12.909822, 13.017591), Lat_cent_RM = c(43.890057,
    43.778107, 43.910889, 43.840054), Height_RM = c(293L, 748L,
    11L, 4L), Continues = c("NO", "SI", "SI", "SI"), Station_RT = c(NA,
    702L, 112L, 152L), Sensor_RT = c(NA, 2954L, 1229L, 2671L),
    Place_RT = c(NA, "Carpegna", "Pesaro", "Fano"), Name1_RT = c(NA,
    "Carpegna", "Villa_Fastiggi", "Foce_Metauro"), Name2_RT = c(NA,
    "Carpegna", "Villa_Fastiggi", "Metaurilia"), Long_cent_RT = c(NA,
    12.340618, 12.86939, 13.053796), Lat_cent_RT = c(NA, 43.780575,
    43.89061, 43.826328), Height_RT = c(NA, 715, 22, 7.12), Actual_net
= c("CAE",
    "RT", "RT", "RT"), Notes = c(NA, NA, NA, NA), Test_20141231 = c("NO",
    "NO", "YES", "YES"), Test_20151231 = c("NO", "NO", "YES",
    "YES")), .Names = c("Station_RM", "Sensor_RM", "Place_RM",
"Y_init_RM", "M_init_RM", "D_init_RM", "Long_cent_RM", "Lat_cent_RM",
"Height_RM", "Continues", "Station_RT", "Sensor_RT", "Place_RT",
"Name1_RT", "Name2_RT", "Long_cent_RT", "Lat_cent_RT", "Height_RT",
"Actual_net", "Notes", "Test_20141231", "Test_20151231"), class =
"data.frame", row.names = c(NA,
-4L))


> Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RM == 1500]
[1] "YES"

# Storia_RM_RT$Omogenea_20151231[Storia_RM_RT$Station_RT == 112]
# there's no such column; you probably mean Test_20151231

> Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RT == 112]
[1] NA    "YES"

# What do you expect to happen when Station_RT is NA? R has no idea
# whether it is 112 or not, so the comparison itself evaluates to NA, and
# indexing with NA returns an "I don't know" value (NA) that lets the user
# decide how to handle the missing data, rather than making assumptions.

# But you probably want one of these constructions:

Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RT == 112 &
!is.na(Storia_RM_RT$Station_RT)]

# subset() treats an NA condition as FALSE and drops those rows, which is
# the behaviour I'm assuming you want.
subset(Storia_RM_RT, Station_RT == 112 )$Test_20151231

# This is the first form, written somewhat more elegantly
with(Storia_RM_RT, Test_20151231[Station_RT == 112 & !is.na(Station_RT)])

On Wed, Sep 7, 2016 at 7:09 AM, Stefano Sofia
<stefano.sofia at regione.marche.it> wrote:
> Dear R users,
> I have a data frame with 22 columns, called Storia_RM_RT. Here the first 4 rows:
>
> Station_RM Sensor_RM Place_RM Y_init_RM M_init_RM D_init_RM Long_cent_RM Lat_cent_RM Height_RM Continues Station_RT Sensor_RT Place_RT Name1_RT Name2_RT Long_cent_RT Lat_cent_RT Height_RT Actual_net Notes Test_20141231 Test_20151231
> 1400 2701 Novafeltria 1959 1 1 12.289552 43.890057 293 NO NA NA NA NA NA NA NA NA CAE NA NO NO
> 1460 2702 Carpegna 1963 1 1 12.332614 43.778107 748 SI 702 2954 Carpegna Carpegna Carpegna 12.340618 43.780575 715 RT NA NO NO
> 1500 2703 Pesaro 1957 1 1 12.909822 43.910889 11 SI 112 1229 Pesaro Villa_Fastiggi Villa_Fastiggi 12.86939 43.890610 22 RT NA YES YES
> 1520 2704 Fano 1957 1 1 13.017591 43.840054 4 SI 152 2671 Fano Foce_Metauro Metaurilia 13.053796 43.826328 7.12 RT NA YES YES
>
> I load it with
> Storia_RM_RT <- read.table(file="Storia_RM_RT.txt", header = TRUE, sep=" ", dec = ".", stringsAsFactors = FALSE)
>
> print(Storia_RM_RT$Test_20151231[Storia_RM_RT$Station_RM == 1500]) gives
> [1] "YES"
>
> while
> print(Storia_RM_RT$Omogenea_20151231[Storia_RM_RT$Station_RT == 112]) gives
> [1] NA   "YES"
>
>
> print(lapply(Storia_RM_RT, class)) gives
>
> $Station_RM
> [1] "integer"
>
> $Sensor_RM
> [1] "integer"
>
> $Place_RM
> [1] "character"
>
> $Y_init_RM
> [1] "integer"
>
> $M_init_RM
> [1] "integer"
>
> $D_init_RM
> [1] "integer"
>
> $Long_cent_RM
> [1] "numeric"
>
> $Lat_cent_RM
> [1] "numeric"
>
> $Height_RM
> [1] "integer"
>
> $Continues
> [1] "character"
>
> $Station_RT
> [1] "integer"
>
> $Sensor_RT
> [1] "integer"
>
> $Place_RT
> [1] "character"
>
> $Name1_RT
> [1] "character"
>
> $Name2_RT
> [1] "character"
>
> $Long_cent_RT
> [1] "numeric"
>
> $Lat_cent_RT
> [1] "numeric"
>
> $Quota_RT
> [1] "numeric"
>
> $Actual_net
> [1] "character"
>
> $Notes
> [1] "logical"
>
> $Test_20141231
> [1] "character"
>
> $Test_20151231
> [1] "character"
>
> I am struggling to understand why the query through the field Station_RT does not work.
> Could please somebody help me to manage correctly the missing values? Is the mistake somewhere else?
>
> Thank you
> Stefano Sofia
>
>

-- 
Sarah Goslee
http://www.functionaldiversity.org



More information about the R-help mailing list