[R] WG: Filtering String Variables (SOLVED)

Mon May 23 15:57:04 CEST 2016

Hi All,

The solution to my question is as follows:

## Filter duplicates and corresponding non-duplicates
### To filter duplicates and their corresponding non-duplicates use the
### following code snippet:
# Example data: Debitor IDs, some of which occur several times.
# c() of string literals already yields a character vector, so no
# as.character() coercion is needed.
Debitor <- c(
  "968691", "968691", "968691",
  "A04046", "A04046",
  "L0006", "L0006", "L0006",
  "L0023", "L0023",
  "L0056", "L0056",
  "L0094", "L0094", "L0094",
  "L0124", "L0124",
  "L0143",
  "L0170",
  "13459",
  "473908",
  "394704",
  "4711",
  "4712",
  "4713"
)

# One numeric value per case, in the same order as Debitor.
var1 <- c(
  11, 12, 13,
  14, 14,
  12, 13, 14,
  10, 11,
  12, 12,
  12, 12, 12,
  15, 17,
  11,
  14,
  12,
  17,
  13,
  15,
  16,
  11
)

ds_example <- data.frame(Debitor, var1)

# Running case number; seq_len() stays correct for a zero-row data frame,
# where 1:nrow() would produce c(1, 0).
ds_example$case_id <- seq_len(nrow(ds_example))

# Arrange the columns alphabetically by name.
ds_example <- ds_example[, sort(colnames(ds_example))]
ds_example

# This task is to generate a data frame that contains the duplicates AND the
# corresponding non-duplicates to the duplicates.
# For example, finding the duplicates will deliver case 2 and 3 but the list
# should also contain case 1 because case 1 is the corresponding case to the
# duplicate cases 2 and 3.
# For the whole example dataset that would be:
# Expected selection, written out by hand for reference: the first 17 cases
# carry a Debitor ID that occurs more than once, the last 8 cases do not.
needed <- rep(c(TRUE, FALSE), times = c(17L, 8L))

# Attach the expected mask as an extra column and show the result.
ds_example <- data.frame(ds_example, needed)
ds_example

# To find the duplicates and the corresponding non-duplicates

# duplicated() flags only the second and later occurrences of an ID, so the
# first occurrence of each repeated ID is still missing at this point.
duplicates <- duplicated(ds_example$Debitor)

# IDs of the flagged cases; as.character() keeps them as plain strings even
# if the Debitor column is a factor.
list_of_duplicated_debitors <- as.character(ds_example[duplicates, "Debitor"])

# Each repeated ID once.
filter_variable <- unique(list_of_duplicated_debitors)

### Wrong code. Do not run.
### ds_duplicates <- ds_example["Debitor" == filter_variable]
###   # Result: dataset with 0 columns
### duplicates_and_correponding_non_duplicates <-
###   ds_example["Debitor"] %in% filter_variable
###   # Result: FALSE

# Both $ and [, "Debitor"] extract the column as a vector, so %in% is
# evaluated per case and marks every case whose ID is a repeated one,
# including its first occurrence.
duplicates_and_correponding_non_duplicates <-
  ds_example$Debitor %in% filter_variable  # Result: OK
duplicates_and_correponding_non_duplicates <-
  ds_example[, "Debitor"] %in% filter_variable  # Result: OK

### Create the dataset with duplicates and corresponding non-duplicates
ds_example <- ds_example[duplicates_and_correponding_non_duplicates, ]
ds_example

It was a simple mistake when subscripting.

Kind regards

Georg Maubach

----- Weitergeleitet von Georg Maubach/WWBO/WW/HAW am 23.05.2016 15:54 
-----

Von:    Georg Maubach/WWBO/WW/HAW
An:     r-help at r-project.org, 
Datum:  23.05.2016 15:28
Betreff:        Filtering String Variables

# Hi All,
# 
# I have the following data frame (example):

# Example data: Debitor IDs, some of which occur several times.
# c() of string literals already yields a character vector, so no
# as.character() coercion is needed.
Debitor <- c(
  "968691", "968691", "968691",
  "A04046", "A04046",
  "L0006", "L0006", "L0006",
  "L0023", "L0023",
  "L0056", "L0056",
  "L0094", "L0094", "L0094",
  "L0124", "L0124",
  "L0143",
  "L0170",
  "13459",
  "473908",
  "394704",
  "4711",
  "4712",
  "4713"
)

# One numeric value per case, in the same order as Debitor.
var1 <- c(
  11, 12, 13,
  14, 14,
  12, 13, 14,
  10, 11,
  12, 12,
  12, 12, 12,
  15, 17,
  11,
  14,
  12,
  17,
  13,
  15,
  16,
  11
)

ds_example <- data.frame(Debitor, var1)

# Running case number; seq_len() stays correct for a zero-row data frame,
# where 1:nrow() would produce c(1, 0).
ds_example$case_id <- seq_len(nrow(ds_example))

# Arrange the columns alphabetically by name.
ds_example <- ds_example[, sort(colnames(ds_example))]
ds_example

# I would like to generate a data frame that contains the duplicates AND the
# corresponding non-duplicates to the duplicates.
# For example, finding the duplicates will deliver case 2 and 3 but the list
# should also contain case 1 because case 1 is the corresponding case to the
# duplicate cases 2 and 3.
# For the whole example dataset that would be:
# Expected selection, written out by hand for reference: the first 17 cases
# carry a Debitor ID that occurs more than once, the last 8 cases do not.
needed <- rep(c(TRUE, FALSE), times = c(17L, 8L))

# Attach the expected mask as an extra column and show the result.
ds_example <- data.frame(ds_example, needed)
ds_example

# To find the duplicates and the corresponding non-duplicates

# duplicated() flags only the second and later occurrences of an ID.
duplicates <- duplicated(ds_example$Debitor)

# IDs of the flagged cases; as.character() keeps them as plain strings even
# if the Debitor column is a factor.
list_of_duplicated_debitors <- as.character(ds_example[duplicates, "Debitor"])

# Each repeated ID once.
filter_variable <- unique(list_of_duplicated_debitors)

# Attempt 1: compares the literal string "Debitor" (not the column) with the
# IDs, so the index is all FALSE and no column is selected.
# Result: dataset with 0 columns
ds_duplicates <- ds_example["Debitor" == filter_variable]

# Attempt 2: single-bracket indexing returns a one-column data frame rather
# than the vector of IDs, so %in% is not evaluated per case.
# Result: FALSE
ds_duplicates <- ds_example["Debitor"] %in% filter_variable

# How can I create a dataset like this

ds_example <- ds_example[needed, ]
ds_example

# using the Debitor IDs?

Kind regards

Georg Maubach