[BioC] biomaRt - batch query for chromosome location to gene identifier?

Kemal Akat kakat at mail.rockefeller.edu
Sat Nov 13 01:19:06 CET 2010


Dear Vincent and Martin,

thank you for your help and explanations. I will try your suggestions.

Dear Wolfgang,

sorry if the info I posted was incomplete. It was more a semantic explanation than a technical one. I realized the incorrect syntax, but that was just a typo (as I couldn't copy and paste back then). I'll try to be more precise in the future.

For the sake of completeness here is the actual code I was running:

1) with one filter, referring to the column of the data frame

> options(width = 800, max.print = 5E5) 
> library(biomaRt)
> ensembl54 <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "may2009.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) 
> tdp <- read.delim("/Users/Kemal/Desktop/Projects/biomaRt/tdp.txt", row.names = 1)
> genes <- getBM(attributes = "entrezgene", filters = c("chromosomal_region"), values = list(tdp$Chromosomal_Location), mart = ensembl54)
> genes
      entrezgene
1           6964
2         651536
3         445347
4           3492
5      100133739
6         652494
...
19731      55657
19732     267002
19733     692312
> sessionInfo()
R version 2.12.0 (2010-10-15)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

locale:
[1] C/en_US.UTF-8/C/C/C/C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] biomaRt_2.6.0

loaded via a namespace (and not attached):
[1] RCurl_1.4-3 XML_3.2-0  
>

2) with multiple filters and the localization info split into 4 separate vectors

> options(width = 800, max.print = 5E5) # change display settings to allow larger data frames, modify as needed
> library(biomaRt) # load the biomaRt package
> #ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") # to assign the variable ensembl (or else) to hg19, NCBI build 37
> ensembl54 <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl", host = "may2009.archive.ensembl.org", path = "/biomart/martservice", archive = FALSE) # to use the hg18, NCBI build 36
> 
> chromosome <-c(2, 1, 8, 17, 12, 10, 21, 2, 7, 16, 5, 4, 1, 19, 13, 13, 10, 7, 15, 7, 2, 12, 21, 20, 5, 11, 15, 12, 17, 3, 17, 14, 19, 13, 6, 14, 11, 13, 2, 20, 7, 10, 1, 16, "X", 22, 20, 1, 3, 8, 4, 1, 6, 15, 17, 4, 12, 7, 1, 14, 12, 17, 12, 6, 9, 22, "X", 7, 12, 10, 19, 5, 1, 8, 11, 8, 19, 7, 6, 5, 1, 6, 9, 19, 10, "X", 3, 5, 13, 17, 20, 3, 16, 5, 13, 12, 15, 19, 4, 16, 10, 8, 7, 7, 12, 6, 11, 21, 17, "X", 15, 10, 16, 15, 9, 5, 2, 6, 12, 5, 14, 14, 6, 6, 15, 4, 1, 9, 8, 1, 5, "X", 11, 2, 1, 19, 2, 2, 13, 1, 17, 13, "X", 13, 7, 11, 3, "X", 15, 17, 22, 11, 16, 19, 7, 2, 13, 9, 14, 12, 1)
> 
> strand <- c(1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, 1, -1)
> 
> start <- c(74295624, 203949843, 103464182, 53272097, 11150173, 52616450, 29638098, 160316091, 26218430, 73763529, 118611423, 78176046, 39566219, 45430361, 34823356, 114106490, 85902288, 55407132, 97080019, 97322520, 241940312, 49774225, 43149920, 1298551, 71540977, 2967925, 75125438, 107566872, 32012029, 50130463, 24612590, 103222346, 49628998, 76795050,47700337, 61273382, 64288817, 99433818, 86857893, 60954640, 121562796, 102024852, 233341511, 46963400, 54115224, 17405158, 33706984, 233341179, 11574528, 117938795, 100202920, 191280320, 89641039, 91241813, 27712528, 72107017, 45043151, 115686015, 227526472, 59794222, 132193041, 2534782, 49500468, 64462329, 19040649, 34020658, 118261386, 5071002,47616321, 101146860, 41419808, 145858601, 9912662, 109329554, 47446160, 130922997, 42311362, 135263153, 64348945, 32179718, 224044250, 112127969, 37126090, 41697757, 7442786, 24005946, 52264406, 154175486, 76478566, 20845822, 30384091, 9561760, 55844512, 65510903, 73435394, 52920938, 42797538, 45273560, 174489559, 55535115, 847112, 26283582,79686310, 138640447, 751963, 124334398, 122435615, 29638867, 5277351, 46804967, 39560795, 89503999, 55844568, 42797166, 113346669, 139476285, 74236497, 43716256, 52725397, 137763505, 31647833, 101463626, 136622538, 76608878, 66287317, 174489713, 32145213, 114023332, 61632141, 160759702, 56575893, 73723822, 92851504, 74295579, 20108627, 51970080,86384720, 174692895, 76479389, 233342144, 19819930, 47962168, 70434946, 33344305, 135285934, 64288869, 143115273, 100555421, 40493944, 44486723, 37016836, 27475187, 14437226, 40184189, 104928588, 38862574, 96917747, 114023291, 89944103, 67952950, 223656250)
> 
> end <- c(74295644, 203949866, 103464207, 53272117, 11150194, 52616474, 29638121, 160316115, 26218451, 73763550, 118611442, 78176077, 39566244, 45430401, 34823379, 114106513, 85902314, 55407154, 97080040, 97322541, 241940335, 49774248, 43149944, 1298573, 71540997, 2967950, 75125461, 107566904, 32012052, 50130483, 24612615, 103222367, 49629021, 76795071, 47700356, 61273403, 64288842, 99433838, 86857915, 60954662, 121562817, 102024873, 233341535, 46963421, 54115248, 17405178, 33707009, 233341202, 11574549, 117938820, 100202946, 191280339, 89641061, 91241835, 27712550, 72107038, 45043175, 115686058, 227526491, 59794245, 132193064, 2534802, 49500489, 64462352, 19040675, 34020682, 118261409, 5071024, 47616347, 101146884, 41419832, 145858626, 9912686, 109329575, 47446181, 130923017, 42311383, 135263177, 64348968, 32179741, 224044275, 112127989, 37126110, 41697783, 7442807, 24005968, 52264426, 154175508, 76478587, 20845844, 30384113, 9561782, 55844532, 65510922, 73435415, 52920963, 42797563, 45273580, 174489628, 55535135, 847134, 26283606, 79686336, 138640468, 751985, 124334422, 122435636, 29638895, 5277374, 46804991, 39560819, 89504022, 55844595, 42797186, 113346688, 139476306, 74236521, 43716279, 52725418, 137763531, 31647855, 101463647, 136622560, 76608900, 66287361, 174489736, 32145234, 114023358, 61632162, 160759726, 56575914, 73723844, 92851527, 74295600, 20108650, 51970105, 86384744, 174692915, 76479410, 233342163, 19819951, 47962190, 70434970, 33344326, 135285963, 64288896, 143115294, 100555442, 40493968, 44486745, 37016858, 27475207, 14437254, 40184209, 104928610, 38862598, 96917795, 114023316, 89944124, 67952970, 223656273)
> 
> genes <- getBM(attributes = "entrezgene", filters = c("chromosome_name", "start", "end", "strand"), values = list(chromosome, start, end, strand), mart = ensembl54)
> genes
      entrezgene
1           6964
2         651536
3         445347
4           3492
...
19073      10806
19074      10000
19075     692312
> sessionInfo()
R version 2.12.0 (2010-10-15)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

locale:
[1] C/en_US.UTF-8/C/C/C/C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] biomaRt_2.6.0

loaded via a namespace (and not attached):
[1] RCurl_1.4-3 XML_3.2-0  
>

Kind regards,
Kemal


Am 12.11.2010 um 09:16 schrieb Martin Morgan:

> On 11/12/2010 04:18 AM, Vincent Carey wrote:
>> tx18 = transcripts(hg18.txdb)
>>> kg = values(tx18[ findOverlaps(kem,tx18)@matchMatrix[,2] ])$tx_name
> 
> Better to use the accessor matchMatrix(findOverlaps(kem, tx18))
> 
> Martin
> 
> -- 
> Computational Biology
> Fred Hutchinson Cancer Research Center
> 1100 Fairview Ave. N. PO Box 19024 Seattle, WA 98109
> 
> Location: M1-B861
> Telephone: 206 667-2793



More information about the Bioconductor mailing list