SCM

SCM Repository

[inlinedocs] Annotation of /pkg/inlinedocs/inst/testfiles/sublogo.dendrogram.R
ViewVC logotype

Annotation of /pkg/inlinedocs/inst/testfiles/sublogo.dendrogram.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 45 - (view) (download)
Original Path: pkg/inlinedocs/tests/sublogo/R/sublogo.dendrogram.R

1 : tdhock 45 read.fasta <- function
2 :     ### Read sequences in FASTA format into a named character vector
3 :     (infile
4 :     ### Name of the sequence file
5 :     ){
6 :     tmp <- sapply(strsplit(strsplit(paste('\n\n\n',paste(readLines(infile),collapse='\n'),sep=''),split='\n>')[[1]],'\n'),function(v)c(v[1],paste(v[2:length(v)],collapse='')))
7 :     seqs <- tmp[2,]
8 :     names(seqs) <- tmp[1,]
9 :     blank <- names(seqs)==''
10 :     names(seqs)[blank] <- seqs[blank]
11 :     names(seqs) <- gsub(' ','',names(seqs))
12 :     seqs[seqs!='']
13 :     }
14 :     ##debug(read.fasta)
15 :    
16 :     ### Letters in the DNA alphabet, used to auto-detect sequence type
17 :     dna.letters <- c("*","A","T","G","C")
18 :     ### DNA identity substitution matrix.
19 :     dna.identity <- matrix(0,nrow=length(dna.letters),ncol=length(dna.letters),
20 :     dimnames=list(dna.letters,dna.letters))
21 :     diag(dna.identity) <- 1
22 :     dna.identity['*','*'] <- 0
23 :     seqs.to.mat <- function
24 :     ### Calculate pairwise differences between sequences using a
25 :     ### substitution matrix.
26 :     (seq.vec,
27 :     ### DNA or protein sequences.
28 :     subs.mat=NULL){
29 :     ### Substitution matrix with dimnames that match the letters used in
30 :     ### the sequence data, or a character vector that specifies a common
31 :     ### substitution matrix (as defined in the Biostrings package). NULL
32 :     ### specifies that we will guess a suitable substitution matrix to
33 :     ### match your input sequences (DNA=>identity, protein=>BLOSUM62).
34 :     if(is.null(names(seq.vec)))names(seq.vec) <- seq.vec
35 :     chars <- sapply(seq.vec,nchar)
36 :     seqsum <- table(chars)
37 :     if(length(seqsum)>1){
38 :     print(as.data.frame(seqsum))
39 :     i <- which.max(seqsum)
40 :     cat("Sequences not of length ",names(seqsum)[i],":\n",sep="")
41 :     rows <- as.integer(names(seqsum)[i])!=chars
42 :     print(data.frame(chars,name=names(seq.vec),row.names=NULL)[rows,])
43 :     stop("All input sequences must be of the same length.")
44 :     }
45 :     d <- toupper(gsub('[- .]',"*",seq.vec[unique(names(seq.vec))]))
46 :     print(d)
47 :     letters <- unique(c(unlist(strsplit(d,split='')),dna.letters))
48 :     ##if dna alignment use simple identity matrix
49 :     looks.like.dna <- identical(sort(letters),sort(dna.letters))
50 :     if(is.null(subs.mat))
51 :     subs.mat <- if(looks.like.dna)dna.identity else "BLOSUM62"
52 :     if(mode(subs.mat)=="character"){
53 :     ex <- substitute(data(M,package="Biostrings"),list(M=subs.mat))
54 :     eval(ex)
55 :     subs.mat <- get(subs.mat)
56 :     }
57 :     print(subs.mat)
58 :     N <- length(d)
59 :     m <- matrix(0,nrow=N,ncol=N,dimnames=list(names(d),names(d)))
60 :     for(i in 1:N)for(j in 1:i){
61 :     seqs <- sapply(strsplit(c(d[i],d[j]),split=''),c)
62 :     entry <- try(apply(seqs,1,function(x)subs.mat[x[1],x[2]]))
63 :     if(class(entry)=="try-error"){
64 :     print(seqs)
65 :     stop("Sequence difference matrix construction failed.")
66 :     }
67 :     ## subscript out of bounds here usually means bad matrix
68 :     m[i,j] <- m[j,i] <- -sum(entry)
69 :     }
70 :     m <- m-min(m)
71 :     attr(m,'seqs') <- d
72 :     print(m[1:min(5,nrow(m)),1:min(5,ncol(m))])
73 :     m
74 :     ### The matrix of distances between each input sequence, with dimnames
75 :     ### corresponding to either the sequences, or the sequence names (if
76 :     ### they exist)
77 :     }
78 :     ##debug(seqs.to.mat)
79 :    
80 :     make.logo.ps <- function
81 :     ### Create a logo using weblogo, then read it in using grImport
82 :     (helices,
83 :     ### Sequences to plot in the logo
84 :     psbase
85 :     ### Base filename for the logo postscript and xml files, should be the
86 :     ### full path name except for the trailing .ps
87 :     ){
88 :     psfile <- paste(psbase,'ps',sep='.')
89 :     xmlfile <- paste(psfile,'xml',sep='.')
90 :     seq.text <- paste(paste('>',helices,'\n',helices,sep=''),collapse='\n')
91 :     write(seq.text,psbase)
92 :     execdir <- system.file("exec",package="sublogo")
93 :     cmd <- paste("PATH=",execdir,
94 :     ":$PATH seqlogo -c -F EPS -f ",psbase,
95 :     "|sed 's/^EndLine/%EndLine/'|sed 's/^EndLogo/%EndLogo/' >",
96 :     psfile,sep="")
97 :     cat(cmd,'\n')
98 :     system(cmd)
99 :     owd <- setwd(tempdir())
100 :     PostScriptTrace(psfile,xmlfile)
101 :     setwd(owd)
102 :     pic <- readPicture(xmlfile)
103 :     pic
104 :     ### Grid picture grob as read using readPicture
105 :     }
106 :     ##debug(make.logo.ps)
107 :    
108 :     sublogo.dendrogram <- function
109 :     ### Main function for drawing sublogo dendrograms.
110 :     (M,
111 :     ### difference matrix as constructed by seqs.to.mat (although in
112 :     ### principle any object with a valid as.dist method could be used)
113 :     main='',
114 :     ### plot title
115 :     subtit=NULL,
116 :     ### plot subtitle
117 :     base=tempfile(),
118 :     ### base file name for temporary logo files
119 :     cutline=150,
120 :     ### Distance for cutting the tree. Draw a sublogo for each
121 :     ### leaf. Normally you will plot once, then inspect the dendrogram to
122 :     ### determine which is a good value for cutline, then plot again using
123 :     ### your chosen cutline.
124 :     dend.width=30,
125 :     ### Percent of the plot to be occupied by the dendrogram. The logos
126 :     ### will be displayed with equal widths on either side.
127 :     cex=1
128 :     ### character expansion factor for the dendrogram
129 :     ){
130 :     hc <- hclust(as.dist(M),method="average")
131 :     dend <- as.dendrogram(hc)
132 :     fam <- cutree(hc,h=cutline)[labels(dend)] # order by plotting method
133 :     famids <- unique(fam)
134 :     famtab <- data.frame(fam,y=1:length(fam))
135 :     famtab$seq <- attr(M,'seqs')[rownames(famtab)]
136 :     fam.nontriv <- sapply(famids,function(i)sum(famtab$fam==i))>1
137 :     names(fam.nontriv) <- famids
138 :     if(is.null(subtit))
139 :     subtit <- paste(nrow(M),"sequences,",sum(fam.nontriv),"families")
140 :     xrange <- c(0,1)
141 :     yrange <- c(0,max(famtab[,'y'])+1)
142 :     draw.box <- function(i){ # replace with logos
143 :     subtab <- famtab[famtab[,'fam']==i,]
144 :     grprange <- range(subtab[,'y'])
145 :     rect(xrange[1],grprange[1]-0.25,xrange[2],grprange[2]+0.25)
146 :     }
147 :     draw.logo <- function(i){
148 :     ## do not draw sublogo for a family of trivial size
149 :     if(fam.nontriv[as.character(i)]){
150 :     subtab <- famtab[famtab[,'fam']==i,]
151 :     grprange <- range(subtab[,'y'])
152 :     tmpfile <- paste(base,i,sep='.')
153 :     logo <- make.logo.ps(subtab$seq,tmpfile)
154 :     grid.picture(logo,
155 :     x=xrange[1],
156 :     y=unit(grprange[1]+0.25,'native'),
157 :     height=unit(diff(grprange)-0.5,'native'),
158 :     distort=TRUE,
159 :     just=c(0,0),
160 :     fillText=TRUE)
161 :     }
162 :     }
163 :    
164 :     bottomspace <- 0.6
165 :     topspace <- 0.4
166 :     side.percents <- (100-dend.width)/2
167 :     layout(matrix(1:3,ncol=3),c(side.percents,dend.width,side.percents))
168 :     par(mai=c(bottomspace,0,topspace,0),cex=1)
169 :    
170 :     ## Big summary logo on left
171 :     plot(xrange,yrange,
172 :     bty='n',xaxt='n',yaxt='n',ylab='',xlab='',type='n',yaxs='i')
173 :     biglogo <- make.logo.ps(famtab$seq,paste(base,'0',sep='.'))
174 :     vps <- baseViewports()
175 :     pushViewport(vps$inner,vps$figure,vps$plot)
176 :     grid.picture(biglogo,
177 :     y=unit(0,'native'),
178 :     height=unit(length(fam),'native'),
179 :     just=c(0.5,0),
180 :     exp=0,
181 :     distort=TRUE,
182 :     fillText=TRUE)
183 :     popViewport(3)
184 :    
185 :     ## Dendrogram in middle
186 :     par(mai=c(bottomspace,0,topspace,
187 :     max(strwidth(colnames(M),'inches',
188 :     cex))/6*5),
189 :     family='mono')
190 :     par(xpd=NA)
191 :     plot(dend,h=TRUE,edgePar=list(lwd=2),nodePar=list(lab.cex=cex,pch=""))
192 :     par(family="")
193 :     segments(cutline,1,cutline,length(fam))
194 :     ##axis(3,cutline,lty=0,line=0)
195 :    
196 :     ## Title in the middle
197 :     title(main,line=0.5)
198 :     mtext(subtit,line=-0.7,cex=1)
199 :    
200 :     # Sublogos on the right side
201 :     par(mai=c(bottomspace,0,topspace,0))
202 :     plot(xrange,yrange,
203 :     bty='n',xaxt='n',yaxt='n',ylab='',xlab='',type='n',yaxs='i')
204 :     vps <- baseViewports()
205 :     pushViewport(vps$inner,vps$figure,vps$plot)
206 :     sapply(famids,draw.logo)
207 :     popViewport(3)
208 :    
209 :     hc
210 :     ### The dendrogram from the call to hclust
211 :     }
212 :     ##debug(sublogo.dendrogram)
213 :    
214 :     sublogo <- function
215 :     ### Draw a sublogo dendrogram for a sequence motif.
216 :     (seqs,
217 :     ### Character vector of DNA or protein sequences (optionally named
218 :     ### with sequence labels).
219 :     mat=NULL,
220 :     ### Substitution matrix passed to seqs.to.mat.
221 :     ...
222 :     ### Other arguments to pass to sublogo.dendrogram (see that help page
223 :     ### for a full description).
224 :     ){
225 :     sublogo.dendrogram(seqs.to.mat(seqs,mat),...)
226 :     }

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge