SCM

SCM Repository

[tm] Diff of /pkg/R/transform.R
ViewVC logotype

Diff of /pkg/R/transform.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 971, Tue Jun 30 10:53:15 2009 UTC revision 972, Fri Jul 3 16:16:59 2009 UTC
# Line 4  Line 4 
4  tmReduce <- function(x, tmFuns, ...)  tmReduce <- function(x, tmFuns, ...)
5      Reduce(function(f, ...) f(...), tmFuns, x, right = TRUE)      Reduce(function(f, ...) f(...), tmFuns, x, right = TRUE)
6    
7  getTransformations <- function() { c("asPlain",  getTransformations <- function()
8      "removeCitation", "removeMultipart", "removeNumbers",      c("asPlain", "removeNumbers", "removePunctuation", "removeWords",
9      "removePunctuation", "removeSignature", "removeWords",      "replacePatterns", "stemDoc", "stripWhitespace", "tmTolower")
     "replacePatterns", "stemDoc", "stripWhitespace", "tmTolower") }  
   
 setGeneric("removeMultipart",  
            function(object, ...) standardGeneric("removeMultipart"))  
 setMethod("removeMultipart",  
           signature(object = "PlainTextDocument"),  
           function(object, ...) {  
               c <- Content(object)  
   
               # http://en.wikipedia.org/wiki/Multipart_message#Multipart_Messages  
               # We are only interested in text/plain parts  
               i <- grep("^Content-Type: text/plain", c)  
               r <- character(0)  
               k <- 2  
               for (j in i) {  
                   end <- if (k <= length(i)) i[k]-1 else length(c)  
                   content <- c[j:end]  
                   ## Find boundary (starting with "--")  
                   # In most cases the boundary is just one line before the Content-Type header  
                   start <- j - 1  
                   while (j > 0) {  
                       if (substr(c[j], 1, 2) == "--") {  
                           start <- j  
                           break  
                       }  
                       else  
                           j <- j - 1  
                   }  
                   index <- grep(c[start], content)  
                   index <- if (length(index) == 0) length(content) else (index[1] - 1)  
                   content <- content[1:index]  
                   # Now remove remaining headers  
                   index <- grep("^$", content)  
                   index <- if (length(index) == 0) 1 else (index[1] + 1)  
                   r <- c(r, content[index:length(content)])  
                   k <- k + 1  
               }  
   
               Content(object) <- if (length(r) == 0) c else r  
               return(object)  
           })  
   
 setGeneric("removeCitation",  
            function(object, ...) standardGeneric("removeCitation"))  
 # Remove e-mail citations beginning with >  
 setMethod("removeCitation",  
           signature(object = "PlainTextDocument"),  
           function(object, ...) {  
               citations <- grep("^[[:blank:]]*>", Content(object))  
               if (length(citations) > 0)  
                   Content(object) <- Content(object)[-citations]  
               return(object)  
           })  
10    
11  setGeneric("removeNumbers", function(object, ...) standardGeneric("removeNumbers"))  setGeneric("removeNumbers", function(object, ...) standardGeneric("removeNumbers"))
12  .removeNumbers <- function(object, ...) {  .removeNumbers <- function(object, ...) {
# Line 77  Line 24 
24  setMethod("removePunctuation", signature(object = "PlainTextDocument"), .removePunctuation)  setMethod("removePunctuation", signature(object = "PlainTextDocument"), .removePunctuation)
25  #setMethod("removePunctuation", signature(object = "MinimalDocument"), .removePunctuation)  #setMethod("removePunctuation", signature(object = "MinimalDocument"), .removePunctuation)
26    
 setGeneric("removeSignature",  
            function(object, ...) standardGeneric("removeSignature"))  
 setMethod("removeSignature",  
           signature(object = "PlainTextDocument"),  
           function(object, marks = character(0), ...) {  
               c <- Content(object)  
   
               # "---" is often added to Sourceforge mails  
               # "___" and "***" are also common, i.e.,  
               # marks <- c("^_{10,}", "^-{10,}", "^[*]{10,}")  
   
               # "-- " is the official correct signature start mark  
               marks <- c("^-- $", marks)  
   
               signatureStart <- length(c) + 1  
               for (m in marks)  
                   signatureStart <- min(grep(m, c), signatureStart)  
   
               if (signatureStart <= length(c))  
                   c <- c[-(signatureStart:length(c))]  
   
               Content(object) <- c  
               return(object)  
           })  
   
27  setGeneric("removeWords", function(object, words, ...) standardGeneric("removeWords"))  setGeneric("removeWords", function(object, words, ...) standardGeneric("removeWords"))
28  .removeWords <- function(object, words, ...) {  .removeWords <- function(object, words, ...) {
29      Content(object) <- gsub(paste("([[:blank:]]|^)",      Content(object) <- gsub(paste("([[:blank:]]|^)",

Legend:
Removed from v.971  
changed lines
  Added in v.972

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge