[R] WG: Filtering String Variables (SOLVED)

Mon May 23 16:14:09 CEST 2016

Perhaps

ds_example <- ds_example[ with( ds_example, 1 < ave( Debitor, Debitor, FUN=length ) ), ]

-- 
Sent from my phone. Please excuse my brevity.

On May 23, 2016 6:57:04 AM PDT, G.Maubach at weinwolf.de wrote:
>Hi All,
>
>the solution for my question is as follows
>
>## Filter duplicates and corresponding non-duplicates
>### To filter duplicates and their corresponding non-duplicates use the
>### following code snippet:
>Debitor <- c("968691", "968691", "968691",
>             "A04046", "A04046",
>             "L0006", "L0006", "L0006",
>             "L0023", "L0023",
>             "L0056", "L0056",
>             "L0094", "L0094", "L0094",
>             "L0124", "L0124",
>             "L0143", 
>             "L0170",
>             "13459",
>             "473908",
>             "394704",
>             "4711",
>             "4712",
>             "4713")
>Debitor <- as.character(Debitor)
>var1 <- c(11, 12, 13,
>          14, 14,
>          12, 13, 14,
>          10, 11,
>          12, 12,
>          12, 12, 12,
>          15, 17,
>          11,
>          14,
>          12,
>          17,
>          13,
>          15,
>          16,
>          11)
>ds_example <- data.frame(Debitor, var1)
>ds_example$case_id <- 1:nrow(ds_example)
>ds_example <- ds_example[, sort(colnames(ds_example))]
>ds_example
>
># This task is to generate a data frame that contains the duplicates AND the
># corresponding non-duplicates to the duplicates.
># For example, finding the duplicates will deliver cases 2 and 3, but the list
># should also contain case 1 because case 1 is the corresponding case to the
># duplicate cases 2 and 3.
># For the whole example dataset that would be:
>needed <- c(1, 1, 1,
>            1, 1,
>            1, 1, 1,
>            1, 1,
>            1, 1,
>            1, 1, 1,
>            1, 1,
>            0, 0, 0, 0, 0, 0, 0, 0)
>needed <- as.logical(needed)
>ds_example <- data.frame(ds_example, needed)
>ds_example
>
># To find the duplicates and the corresponding non-duplicates
>duplicates <- duplicated(ds_example$Debitor)
>
>list_of_duplicated_debitors <- as.character(ds_example[duplicates, 
>"Debitor"])
>
>filter_variable <- unique(list_of_duplicated_debitors)
>
>### Wrong code. Do not run.
>### ds_duplicates <- ds_example["Debitor" == filter_variable]  # Result: dataset with 0 columns
>### duplicates_and_correponding_non_duplicates <- ds_example["Debitor"] %in% filter_variable  # Result: FALSE
>
>duplicates_and_correponding_non_duplicates <- ds_example$Debitor %in% 
>filter_variable  # Result: OK
>duplicates_and_correponding_non_duplicates <- ds_example[, "Debitor"] %in%
>  filter_variable  # Result: OK
>
>### Create the dataset with duplicates and corresponding non-duplicates
>ds_example <- ds_example[duplicates_and_correponding_non_duplicates, ]
>ds_example
>
>It was a simple mistake when subscripting.
>
>Kind regards
>
>Georg Maubach
>
>
>----- Weitergeleitet von Georg Maubach/WWBO/WW/HAW am 23.05.2016 15:54 
>-----
>
>Von:    Georg Maubach/WWBO/WW/HAW
>An:     r-help at r-project.org, 
>Datum:  23.05.2016 15:28
>Betreff:        Filtering String Variables
>
>
># Hi All,
># 
># I have the following data frame (example):
>
>Debitor <- c("968691", "968691", "968691",
>             "A04046", "A04046",
>             "L0006", "L0006", "L0006",
>             "L0023", "L0023",
>             "L0056", "L0056",
>             "L0094", "L0094", "L0094",
>             "L0124", "L0124",
>             "L0143", 
>             "L0170",
>             "13459",
>             "473908",
>             "394704",
>             "4711",
>             "4712",
>             "4713")
>Debitor <- as.character(Debitor)
>var1 <- c(11, 12, 13,
>          14, 14,
>          12, 13, 14,
>          10, 11,
>          12, 12,
>          12, 12, 12,
>          15, 17,
>          11,
>          14,
>          12,
>          17,
>          13,
>          15,
>          16,
>          11)
>ds_example <- data.frame(Debitor, var1)
>ds_example$case_id <- 1:nrow(ds_example)
>ds_example <- ds_example[, sort(colnames(ds_example))]
>ds_example
>
># I would like to generate a data frame that contains the duplicates AND the
># corresponding non-duplicates to the duplicates.
># For example, finding the duplicates will deliver cases 2 and 3, but the list
># should also contain case 1 because case 1 is the corresponding case to the
># duplicate cases 2 and 3.
># For the whole example dataset that would be:
>needed <- c(1, 1, 1,
>            1, 1,
>            1, 1, 1,
>            1, 1,
>            1, 1,
>            1, 1, 1,
>            1, 1,
>            0, 0, 0, 0, 0, 0, 0, 0)
>needed <- as.logical(needed)
>ds_example <- data.frame(ds_example, needed)
>ds_example
>
># To find the duplicates and the corresponding non-duplicates
>duplicates <- duplicated(ds_example$Debitor)
>
>list_of_duplicated_debitors <- as.character(ds_example[duplicates, 
>"Debitor"])
>
>filter_variable <- unique(list_of_duplicated_debitors)
>
>ds_duplicates <- ds_example["Debitor" == filter_variable]  # Result: dataset with 0 columns
>
>ds_duplicates <- ds_example["Debitor"] %in% filter_variable  # Result: FALSE
>
># How can I create a dataset like this
>
>ds_example <- ds_example[needed, ]
>ds_example
>
># using the Debitor IDs?
>
>Kind regards
>
>Georg Maubach
>
>______________________________________________
>R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>https://stat.ethz.ch/mailman/listinfo/r-help
>PLEASE do read the posting guide
>http://www.R-project.org/posting-guide.html
>and provide commented, minimal, self-contained, reproducible code.

	[[alternative HTML version deleted]]