[R] Filtering String Variables

G.Maubach at weinwolf.de G.Maubach at weinwolf.de
Mon May 23 15:28:11 CEST 2016


# Hi All,
# 
# I have the following data frame (example):

# Debitor IDs for the example: the first seven IDs occur two or three times
# in a row, the remaining eight occur exactly once. rep() on a character
# vector already returns a character vector, so no conversion is needed.
Debitor <- rep(
  c("968691", "A04046", "L0006", "L0023", "L0056", "L0094", "L0124",
    "L0143", "L0170", "13459", "473908", "394704", "4711", "4712", "4713"),
  times = c(3L, 2L, 3L, 2L, 2L, 3L, 2L, rep(1L, 8L))
)
# Numeric example values, one per entry of `Debitor` (25 values, same order).
var1 <- c(
  11, 12, 13, 14, 14,  # entries  1- 5
  12, 13, 14, 10, 11,  # entries  6-10
  12, 12, 12, 12, 12,  # entries 11-15
  15, 17, 11, 14, 12,  # entries 16-20
  17, 13, 15, 16, 11   # entries 21-25
)
# Assemble the example data frame. stringsAsFactors = FALSE keeps Debitor a
# character column on R < 4.0 too, where data.frame() defaulted to factors.
ds_example <- data.frame(Debitor, var1, stringsAsFactors = FALSE)
# Running case number. seq_len() also behaves for a zero-row data frame,
# where 1:nrow() would produce c(1, 0).
ds_example$case_id <- seq_len(nrow(ds_example))
# Arrange the columns by name (the order follows the session's collation).
ds_example <- ds_example[, sort(colnames(ds_example))]
ds_example

# I would like to generate a data frame that contains the duplicates AND the
# corresponding non-duplicates to the duplicates.
# For example, finding the duplicates will deliver cases 2 and 3, but the list
# should also contain case 1 because case 1 is the corresponding case to the
# duplicate cases 2 and 3.
# For the whole example dataset that would be:
# Expected result mask: TRUE for the first 17 cases (Debitor IDs that occur
# more than once), FALSE for the last 8 cases (IDs that occur only once).
needed <- rep(c(TRUE, FALSE), times = c(17L, 8L))
# Attach the expected-result mask as an extra column named `needed`.
ds_example <- cbind(ds_example, needed = needed)
ds_example

# To find the duplicates and the corresponding non-duplicates:
#
# 1. Flag every repeated occurrence of a Debitor ID. duplicated() leaves the
#    first occurrence of each ID as FALSE, so this mask alone misses the
#    "corresponding" first case.
duplicates <- duplicated(ds_example$Debitor)

# 2. Collect the IDs of the flagged rows. as.character() keeps this working
#    when Debitor is stored as a factor.
list_of_duplicated_debitors <- as.character(ds_example[duplicates, "Debitor"])

# 3. Reduce to one entry per ID that occurs more than once.
filter_variable <- unique(list_of_duplicated_debitors)

# 4. Keep every row whose Debitor ID is in that set, first cases included.
#    The mask has to be built from the column VALUES and used as a ROW index:
#    - ds_example["Debitor" == filter_variable] compares the literal string
#      "Debitor" with the IDs, is FALSE throughout and selects 0 columns.
#    - ds_example["Debitor"] %in% filter_variable tests a one-column data
#      frame (a list with one element) instead of the column vector, so it
#      returns a single FALSE.
ds_duplicates <- ds_example[ds_example$Debitor %in% filter_variable, ]

# How can I create a dataset like this

# Keep only the cases marked TRUE in `needed`, selected by row position.
ds_example <- ds_example[which(needed), ]
ds_example

# using the Debitor IDs?

Kind regards

Georg Maubach



More information about the R-help mailing list