SCM

SCM Repository

[tm] Diff of /pkg/R/matrix.R
ViewVC logotype

Diff of /pkg/R/matrix.R

Parent Directory | Revision Log | View Patch

revision 1150, Tue Nov 15 15:37:17 2011 UTC | revision 1151, Thu Nov 17 14:21:49 2011 UTC
# Line 49  Line 49 
49  TermDocumentMatrix.VCorpus <-  TermDocumentMatrix.VCorpus <-
50  function(x, control = list())  function(x, control = list())
51  {  {
     weighting <- control$weighting  
     if (is.null(weighting))  
         weighting <- weightTf  
   
52      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
53      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
54          .Call("copyCorpus", x, materialize(x))          .Call("copyCorpus", x, materialize(x))
# Line 78  Line 74 
74                                 list(Terms = allTerms,                                 list(Terms = allTerms,
75                                      Docs = unlist(lapply(x, ID))))                                      Docs = unlist(lapply(x, ID))))
76    
77        bg <- control$bounds$global
78        if (length(bg) == 2L && is.numeric(bg)) {
79            rs <- row_sums(m > 0)
80            m <- m[(rs >= bg[1]) & (rs <= bg[2]), ]
81        }
82    
83        weighting <- control$weighting
84        if (is.null(weighting))
85            weighting <- weightTf
86    
87      .TermDocumentMatrix(m, weighting)      .TermDocumentMatrix(m, weighting)
88  }  }
89    
# Line 196  Line 202 
202      else      else
203          table(factor(txt, levels = dictionary))          table(factor(txt, levels = dictionary))
204    
205      ## Ensure minimum document frequency threshold      ## Ensure local bounds
206      minDocFreq <- control$minDocFreq      bl <- control$bounds$local
207      if (!is.null(minDocFreq))      if (length(bl) == 2L && is.numeric(bl))
208          tab <- tab[tab >= minDocFreq]          tab <- tab[(tab >= bl[1]) & (tab <= bl[2])]
209    
210      ## Filter out too short terms      ## Filter out too short or too long terms
211      minWordLength <- control$minWordLength      nc <- nchar(names(tab), type = "chars")
212      if (is.null(minWordLength))      tab <- tab[(nc >= max(3, control$wordLengths[1])) & (nc <= min(Inf, control$wordLengths[2]))]
         minWordLength <- 3  
     tab <- tab[nchar(names(tab), type = "chars") >= minWordLength]  
213    
214      ## Return named integer      ## Return named integer
215      structure(as.integer(tab), names = names(tab), class = c("term_frequency", "integer"))      structure(as.integer(tab), names = names(tab), class = c("term_frequency", "integer"))

Legend:
Removed from v.1150  
changed lines
  Added in v.1151

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business | Powered By FusionForge