SCM

SCM Repository

[tm] Diff of /pkg/R/weight.R
ViewVC logotype

Diff of /pkg/R/weight.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1444, Mon Aug 22 11:50:13 2016 UTC revision 1445, Sun Oct 9 09:30:58 2016 UTC
# Line 21  Line 21 
21          if (normalize) {          if (normalize) {
22              cs <- col_sums(m)              cs <- col_sums(m)
23              if (any(cs == 0))              if (any(cs == 0))
24                  warning("empty document(s): ", paste(Docs(m)[cs == 0], collapse = " "))                  warning("empty document(s): ",
25                            paste(Docs(m)[cs == 0], collapse = " "))
26              names(cs) <- seq_len(nDocs(m))              names(cs) <- seq_len(nDocs(m))
27              m$v <- m$v / cs[m$j]              m$v <- m$v / cs[m$j]
28          }          }
29          rs <- row_sums(m > 0)          rs <- row_sums(m > 0)
30          if (any(rs == 0))          if (any(rs == 0))
31              warning("unreferenced term(s): ", paste(Terms(m)[rs == 0], collapse = " "))              warning("unreferenced term(s): ",
32                        paste(Terms(m)[rs == 0], collapse = " "))
33          lnrs <- log2(nDocs(m) / rs)          lnrs <- log2(nDocs(m) / rs)
34          lnrs[!is.finite(lnrs)] <- 0          lnrs[!is.finite(lnrs)] <- 0
35          m <- m * lnrs          m <- m * lnrs
36          attr(m, "weighting") <- c(sprintf("%s%s",          attr(m, "weighting") <-
37                c(sprintf("%s%s",
38                                            "term frequency - inverse document frequency",                                            "term frequency - inverse document frequency",
39                                            if (normalize) " (normalized)" else ""),                                            if (normalize) " (normalized)" else ""),
40                                    "tf-idf")                                    "tf-idf")
# Line 57  Line 60 
60          if (isDTM) m <- t(m)          if (isDTM) m <- t(m)
61    
62          if(normalization == "b") {          if(normalization == "b") {
63              ## Need to compute the character lenghts of the documents              ## Need to compute the character lengths of the documents
64              ## before starting the weighting.              ## before starting the weighting.
65              charlengths <-              charlengths <-
66                  tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum)                  tapply(nchar(Terms(m))[m$i] * m$v, m$j, sum)
# Line 79  Line 82 
82                        ## log ave                        ## log ave
83                        L = {                        L = {
84                            s <- tapply(m$v, m$j, mean)                            s <- tapply(m$v, m$j, mean)
85                            ((1 + log2(m$v)) /                            ((1 + log2(m$v)) / (1 + log2(s[as.character(m$j)])))
                            (1 + log2(s[as.character(m$j)])))  
86                        })                        })
87    
88          ## Document frequency          ## Document frequency
89          rs <- row_sums(m > 0)          rs <- row_sums(m > 0)
90          if (any(rs == 0))          if (any(rs == 0))
91              warning("unreferenced term(s): ", paste(Terms(m)[rs == 0], collapse = " "))              warning("unreferenced term(s): ",
92                        paste(Terms(m)[rs == 0], collapse = " "))
93          df <- switch(document_frequency,          df <- switch(document_frequency,
94                       ## natural                       ## natural
95                       n = 1,                       n = 1,
# Line 99  Line 102 
102          ## Normalization          ## Normalization
103          cs <- col_sums(m)          cs <- col_sums(m)
104          if (any(cs == 0))          if (any(cs == 0))
105              warning("empty document(s): ", paste(Docs(m)[cs == 0], collapse = " "))              warning("empty document(s): ",
106                        paste(Docs(m)[cs == 0], collapse = " "))
107          norm <- switch(normalization,          norm <- switch(normalization,
108                         ## none                         ## none
109                         n = rep(1, nDocs(m)),                         n = rep.int(1, nDocs(m)),
110                         ## cosine                         ## cosine
111                         c = sqrt(col_sums(m ^ 2)),                         c = sqrt(col_sums(m ^ 2)),
112                         ## pivoted unique                         ## pivoted unique
# Line 134  Line 138 
138    
139  weightBin <-  weightBin <-
140      WeightFunction(function(m) {      WeightFunction(function(m) {
141          m$v <- rep(1, length(m$v))          m$v <- rep_len(1L, length(m$v))
142          attr(m, "weighting") <- c("binary", "bin")          attr(m, "weighting") <- c("binary", "bin")
143          m          m
144      }, "binary", "bin")      }, "binary", "bin")

Legend:
Removed from v.1444  
changed lines
  Added in v.1445

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge