r - For/while loop optimization with d_ply -


i running custom function produces , saves plots each line of data.frame:

zz <- "chr  start end   name  max chr11 66184332 66197785 npas4_cbp20 90 chr11  62666002 62683613 bc047540_cbp20 100    chr1 9824542  9828548  mir3687_cbp20  500 chr6  33239767 33259282 b3galt4_pol2   1000 chr20  244996112   245029580   hnrnpu-as1_pol2   450 chr20 62487823 62525914 abhd16b_pol2   370 chr12 121146198   121179996   acads_pol2  90"  my.genes <- read.table(text=zz, header = true) 

that function has 2 steps 1 of slow (~40 sec) needs run 1 each chr , each line of chr. in pseudocode like:

for each chr createdata    each nameinchr somethingwithdata. 

my question is, how optimize this? nested d_ply? sort data.frame chr , form of apply on unique(mygenes$chr)?

i sorry if not contain actual function, long 1 , think more practice/theoretical question. however, if needed can add proper code.

edit

here simplified function (plotgviz). is, don't think d_ply work because run ucsctrack every line (am correct?). ultimate goal run ucsctrack each unique chromosome , use information construct plot each gene (name). both steps (ucsctrack , plotting) time consuming rationale running non-unique part of code (that applies chr) first save time on whole - table fraction of have.

the functions inside custom plotgviz cannot optimized way set them up.

################ ### libraries ## library(gviz) library(genomicranges) library(genomicfeatures) library(data.table) library("rcolorbrewer") #######################   d_ply(my.genes, .(chr, name), plotgviz)  plotgviz <- function(gene) {    chr <- gene$chr    mygene.start <- gene$start    mygene.end <- gene$end    mygene <- gene$name    max.cov <- gene$max  ################################################# ## part runs once each unique chr ## ucsctrack takes time     ## gene annotations ##    knowngenes <- ucsctrack(genome=gen, chromosome=chr,    track="knowngene",  tracktype="generegiontrack",    rstarts="exonstarts", rends="exonends", gene="name", symbol="name",    transcript="name", strand="strand", fill="#ff7f00", name="ucsc genes", genesymbols = true, showid = true)     refgenes <- ucsctrack(genome=gen, chromosome=chr,    track="refgene", tracktype="generegiontrack",     rstarts="exonstarts", rends="exonends", gene="name", symbol="name",    transcript="name", strand="strand", fill="#ff7f00", name="refseq genes", genesymbols = true, showid = true)     ## axis scale ##    ideotrack <- ideogramtrack(genome = gen, chromosome = chr)    # plottracks(ideotrack, = mygene.start, = mygene.end)     axistrack <- genomeaxistrack(scale=0.25)   #################################### ## each name in unique chr ######################     ## construct tracks ##      ## chip-seq coverage bam ##    bampol2 <- "~/bioinfo/srp_chip_seq/data/reads/merged_reads_cbp20line/pol2.bam"    baminput <- "~/bioinfo/srp_chip_seq/data/reads/merged_reads_cbp20line/input.bam"    ## histogram    btrackpol2 <- datatrack(range = bampol2, genome = gen, chromosome = chr,       name = "pol2", type = "histogram", col.histogram="#984ea3",       ylim=c(0,max.cov))     btrackinput <- datatrack(range = baminput, genome = gen, chromosome = chr,       name = "input", type = "histogram", col.histogram="#999999",       ylim=c(0,max.cov))      #################    ## plot tracks ##     pdf(paste("./plots/", mygene,"_cov.pdf", sep=""))      plottracks(list(ideotrack, axistrack, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, refgenes, btrackrnacov), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black")     plottracks(list(ideotrack, axistrack, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, knowngenes, btrackrnacov), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black")    dev.off()     # plottracks(list(axistrack, refgenes, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, btrackrna), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black")     # head(displaypars(btrackpol2)) } 

edit2

my temporary solution using loop unique chr, creating ucsctrack before sub-setting names each chr , d_pply plotting. spiting big function in 2 piaces.

for (chrom in unique(my.genes$chr)) {    print(paste("creating annotation ", chrom, sep=""))     ## gene annotations ##    knowngenes <- ucsctrack(genome=gen, chromosome=chrom,    track="knowngene",  tracktype="generegiontrack",    rstarts="exonstarts", rends="exonends", gene="name", symbol="name",    transcript="name", strand="strand", fill="#ff7f00", name="ucsc genes", genesymbols = true, showid = true)     refgenes <- ucsctrack(genome=gen, chromosome=chrom,    track="refgene", tracktype="generegiontrack",     rstarts="exonstarts", rends="exonends", gene="name", symbol="name",    transcript="name", strand="strand", fill="#ff7f00", name="refseq genes", genesymbols = true, showid = true)     ## axis scale ##    ideotrack <- ideogramtrack(genome = gen, chromosome = chrom)    # plottracks(ideotrack, = mygene.start, = mygene.end)     axistrack <- genomeaxistrack(scale=0.25)     ## select genes in chromosome    df.g <- subset(my.genes, chr == chrom)    d_ply(df.g, .(name), plotgviz) } 

try data.table, it's going faster plyr. here's generic approach problem:

library(data.table) dt = data.table(my.genes)  dt[, .sd[, do_smth(), = name], = chr] 

and here's example of above approach in different question - how integrate properties defined on multiple rows using data.frame or data.table long format approach


Comments

Popular posts from this blog

matlab - Deleting rows with specific rules -

jquery - How would i go about shortening this code? And to cancel the previous click on click of new section? -