r - For/while loop optimization with d_ply -
i running custom function produces , saves plots each line of data.frame:
zz <- "chr start end name max chr11 66184332 66197785 npas4_cbp20 90 chr11 62666002 62683613 bc047540_cbp20 100 chr1 9824542 9828548 mir3687_cbp20 500 chr6 33239767 33259282 b3galt4_pol2 1000 chr20 244996112 245029580 hnrnpu-as1_pol2 450 chr20 62487823 62525914 abhd16b_pol2 370 chr12 121146198 121179996 acads_pol2 90" my.genes <- read.table(text=zz, header = true) that function has 2 steps 1 of slow (~40 sec) needs run 1 each chr , each line of chr. in pseudocode like:
for each chr createdata each nameinchr somethingwithdata. my question is, how optimize this? nested d_ply? sort data.frame chr , form of apply on unique(mygenes$chr)?
i sorry if not contain actual function, long 1 , think more practice/theoretical question. however, if needed can add proper code.
edit
here simplified function (plotgviz). is, don't think d_ply work because run ucsctrack every line (am correct?). ultimate goal run ucsctrack each unique chromosome , use information construct plot each gene (name). both steps (ucsctrack , plotting) time consuming rationale running non-unique part of code (that applies chr) first save time on whole - table fraction of have.
the functions inside custom plotgviz cannot optimized way set them up.
################ ### libraries ## library(gviz) library(genomicranges) library(genomicfeatures) library(data.table) library("rcolorbrewer") ####################### d_ply(my.genes, .(chr, name), plotgviz) plotgviz <- function(gene) { chr <- gene$chr mygene.start <- gene$start mygene.end <- gene$end mygene <- gene$name max.cov <- gene$max ################################################# ## part runs once each unique chr ## ucsctrack takes time ## gene annotations ## knowngenes <- ucsctrack(genome=gen, chromosome=chr, track="knowngene", tracktype="generegiontrack", rstarts="exonstarts", rends="exonends", gene="name", symbol="name", transcript="name", strand="strand", fill="#ff7f00", name="ucsc genes", genesymbols = true, showid = true) refgenes <- ucsctrack(genome=gen, chromosome=chr, track="refgene", tracktype="generegiontrack", rstarts="exonstarts", rends="exonends", gene="name", symbol="name", transcript="name", strand="strand", fill="#ff7f00", name="refseq genes", genesymbols = true, showid = true) ## axis scale ## ideotrack <- ideogramtrack(genome = gen, chromosome = chr) # plottracks(ideotrack, = mygene.start, = mygene.end) axistrack <- genomeaxistrack(scale=0.25) #################################### ## each name in unique chr ###################### ## construct tracks ## ## chip-seq coverage bam ## bampol2 <- "~/bioinfo/srp_chip_seq/data/reads/merged_reads_cbp20line/pol2.bam" baminput <- "~/bioinfo/srp_chip_seq/data/reads/merged_reads_cbp20line/input.bam" ## histogram btrackpol2 <- datatrack(range = bampol2, genome = gen, chromosome = chr, name = "pol2", type = "histogram", col.histogram="#984ea3", ylim=c(0,max.cov)) btrackinput <- datatrack(range = baminput, genome = gen, chromosome = chr, name = "input", type = "histogram", col.histogram="#999999", ylim=c(0,max.cov)) ################# ## plot tracks ## pdf(paste("./plots/", mygene,"_cov.pdf", sep="")) plottracks(list(ideotrack, axistrack, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, refgenes, btrackrnacov), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black") plottracks(list(ideotrack, axistrack, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, knowngenes, btrackrnacov), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black") dev.off() # plottracks(list(axistrack, refgenes, btrackcbp20, ptrackcbp20, btrackpol2, ptrackpol2, btrackinput, btrackrna), = mygene.start, = mygene.end, fontfamily="helvetica", background.title="white", col.title="black", col.axis="black") # head(displaypars(btrackpol2)) } edit2
my temporary solution using loop unique chr, creating ucsctrack before sub-setting names each chr , d_pply plotting. spiting big function in 2 piaces.
for (chrom in unique(my.genes$chr)) { print(paste("creating annotation ", chrom, sep="")) ## gene annotations ## knowngenes <- ucsctrack(genome=gen, chromosome=chrom, track="knowngene", tracktype="generegiontrack", rstarts="exonstarts", rends="exonends", gene="name", symbol="name", transcript="name", strand="strand", fill="#ff7f00", name="ucsc genes", genesymbols = true, showid = true) refgenes <- ucsctrack(genome=gen, chromosome=chrom, track="refgene", tracktype="generegiontrack", rstarts="exonstarts", rends="exonends", gene="name", symbol="name", transcript="name", strand="strand", fill="#ff7f00", name="refseq genes", genesymbols = true, showid = true) ## axis scale ## ideotrack <- ideogramtrack(genome = gen, chromosome = chrom) # plottracks(ideotrack, = mygene.start, = mygene.end) axistrack <- genomeaxistrack(scale=0.25) ## select genes in chromosome df.g <- subset(my.genes, chr == chrom) d_ply(df.g, .(name), plotgviz) }
try data.table, it's going faster plyr. here's generic approach problem:
library(data.table) dt = data.table(my.genes) dt[, .sd[, do_smth(), = name], = chr] and here's example of above approach in different question - how integrate properties defined on multiple rows using data.frame or data.table long format approach
Comments
Post a Comment