[R] Problem with comparing multiple data sets
Jim Lemon
drjimlemon at gmail.com
Sat May 30 01:14:14 CEST 2015
Hi Mohammad,
It looks like you are still having problems with this. Given your
latest data set, as below, here is something that might do what you
want. From David's message, I'm not sure whether you are operating on
a single data frame or a list.
# this is the data set as taken from your message below:
# a data frame with 50 rows and four columns - "terms" (a factor with
# six labels) and the integer columns "class.1", "class.2" and "class.3"
madf<-structure(list(terms = structure(c(2L, 4L, 4L, 4L, 3L, 1L, 5L,
5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), .Label =
c("#authentication,access control",
"#privacy,personal data", "#security,malicious,security", "data controller",
"id management,security", "password,recovery"), class = "factor"),
class.1 = c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L), class.2 = c(2L, 2L, 2L,
0L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 2L), class.3 = c(2L, 0L, 2L, 2L, 1L, 1L, 0L, 0L, 0L,
2L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("terms",
"class.1", "class.2", "class.3"), class = "data.frame", row.names = c(NA,
-50L))
# define a function that extracts the values from one field
# for the rows selected by a value in another field
#   x       a data frame
#   field1  name (or index) of the column to match on
#   value1  the value to look for in field1
#   field2  name (or index) of the column to extract from
# returns the field2 values of the matching rows (length 0 if none match)
extract_by_value <- function(x, field1, value1, field2) {
  # which() keeps only the positions where the comparison is TRUE, so an NA
  # in field1 never matches; indexing with the raw logical vector would
  # return an extra NA element for each of those rows
  matching_rows <- which(x[, field1] == value1)
  x[matching_rows, field2]
}
# define another function that sets one field to a single value
# for the rows selected by a value in another field
#   x       a data frame
#   field1  name (or index) of the column to match on
#   value1  the value to look for in field1
#   field2  name (or index) of the column to overwrite
#   value2  the replacement value
# returns the modified copy of x; the caller's data frame is not changed
sub_value <- function(x, field1, value1, field2, value2) {
  # which() drops the NA results of the comparison; assigning into a data
  # frame with a logical index that contains NA stops with an error
  matching_rows <- which(x[, field1] == value1)
  x[matching_rows, field2] <- value2
  x
}
# this now steps through every value in "key_field"
# and operates on every field listed in "change_fields":
# within each group of rows sharing a key, every change field is
# overwritten with that group's most frequent value
#   x              a data frame
#   key_field      name of the column that defines the groups
#   change_fields  names of the columns to make uniform within each group
# returns the modified copy of x
conformity <- function(x, key_field, change_fields) {
  keys <- unique(x[, key_field])
  # NA cannot be matched with ==, so it is not usable as a group key
  keys <- keys[!is.na(keys)]
  for (key in keys) {
    for (change_field in change_fields) {
      group_values <- extract_by_value(x, key_field, key, change_field)
      # count the values for the desired key; table() ignores NA
      counts <- table(group_values)
      # nothing but NA in this group: there is no most frequent value,
      # so leave the field as it is
      if (length(counts) == 0L) {
        next
      }
      # which.max() returns the first maximum, so a tie goes to the value
      # that comes first in the table
      winner <- names(counts)[which.max(counts)]
      # take the winning value from the data itself rather than converting
      # its label with as.numeric(): this keeps the column's type (integer,
      # character, factor, ...) instead of forcing it to double or NA
      most_freq <- group_values[match(winner, as.character(group_values))]
      # now set all the values to the most frequent
      x <- sub_value(x, key_field, key, change_field, most_freq)
    }
  }
  x
}
# apply it to the example data: "terms" defines the groups and the three
# class columns are made uniform within each group (the result is printed)
conformity(
  madf,
  key_field = "terms",
  change_fields = c("class.1", "class.2", "class.3")
)
Obviously you will want to save the return value of "conformity" into
your original data frame or create a new one.
Jim
> Hi everyone.
>
> I tried the (modeest) package on my initial test data and it worked. However, it doesn't work on the entire data set. I saved one of the portions that gives an error (not for all of the values, but for some of them). For example, lines 36, 37 and 39 correctly show the mode value, but 38 and 40 are not correct. This error is repeated for many of the values.
>
> [36,] 2
>
> [37,] 2
>
> [38,] Numeric,3
>
> [39,] 1
>
> [40,] Numeric,3
>
> ============================================
>
> #This is what I did:
>
>> df<- read.csv(file="Part1-modif.csv", head=TRUE, sep=",")
>
>> Out<- apply(df[,2:length(df)],1, mfv)
>
>> t(t(Out))
>
> #This is the data set
>
> structure(list(terms = structure(c(2L, 4L, 4L, 4L, 3L, 1L, 5L,
>
> 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
>
> 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L,
>
> 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L), .Label = c("#authentication,access control",
>
> "#privacy,personal data", "#security,malicious,security", "data controller",
>
> "id management,security", "password,recovery"), class = "factor"),
>
> class.1 = c(2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
>
> 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L,
>
> 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
>
> 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L), class.2 = c(2L, 2L, 2L,
>
> 0L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
>
> 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
>
> 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
>
> 2L, 2L), class.3 = c(2L, 0L, 2L, 2L, 1L, 1L, 0L, 0L, 0L,
>
> 2L, 2L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
>
> 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
>
> 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("terms",
>
> "class.1", "class.2", "class.3"), class = "data.frame", row.names = c(NA,
>
> -50L))
>
> ========================================================
>
> Also, when I try to include the terms in the result, it gives me an error:
>
>> mode.names<- data.frame (df[,1],Out)
>
> Error in data.frame(df[, 1], Out) :
>
> arguments imply differing number of rows: 50, 3
>
More information about the R-help
mailing list