SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 78, Wed Nov 29 14:56:36 2006 UTC revision 689, Fri Dec 8 14:21:46 2006 UTC
# Line 26  Line 26 
26                dmeta.df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)                dmeta.df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
27                dcmeta.node <- new("MetaDataNode",                dcmeta.node <- new("MetaDataNode",
28                              NodeID = 0,                              NodeID = 0,
29                              MetaData = list(create_date = date(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
30                              children = list())                              children = list())
31    
32                return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, DCMetaData = dcmeta.node))                return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, DCMetaData = dcmeta.node))
33            })            })
34    
 setGeneric("DirSource", function(directory, load = FALSE) standardGeneric("DirSource"))  
 setMethod("DirSource",  
           signature(directory = "character"),  
           function(directory, load = FALSE) {  
               new("DirSource", LoDSupport = TRUE, FileList = dir(directory, full.names = TRUE),  
                   Position = 0, Load = load)  
           })  
   
 setGeneric("CSVSource", function(object) standardGeneric("CSVSource"))  
 setMethod("CSVSource",  
           signature(object = "character"),  
           function(object) {  
               object <- substitute(file(object))  
               con <- eval(object)  
               content <- scan(con, what = "character")  
               close(con)  
               new("CSVSource", LoDSupport = FALSE, URI = object,  
                   Content = content, Position = 0)  
           })  
 setMethod("CSVSource",  
           signature(object = "ANY"),  
           function(object) {  
               object <- substitute(object)  
               con <- eval(object)  
               content <- scan(con, what = "character")  
               close(con)  
               new("CSVSource", LoDSupport = FALSE, URI = object,  
                   Content = content, Position = 0)  
           })  
   
 setGeneric("ReutersSource", function(object) standardGeneric("ReutersSource"))  
 setMethod("ReutersSource",  
           signature(object = "character"),  
           function(object) {  
               object <- substitute(file(object))  
               con <- eval(object)  
               corpus <- paste(readLines(con), "\n", collapse = "")  
               close(con)  
               tree <- xmlTreeParse(corpus, asText = TRUE)  
               content <- xmlRoot(tree)$children  
   
               new("ReutersSource", LoDSupport = FALSE, URI = object,  
                   Content = content, Position = 0)  
           })  
 setMethod("ReutersSource",  
           signature(object = "ANY"),  
           function(object) {  
               object <- substitute(object)  
               con <- eval(object)  
               corpus <- paste(readLines(con), "\n", collapse = "")  
               close(con)  
               tree <- xmlTreeParse(corpus, asText = TRUE)  
               content <- xmlRoot(tree)$children  
   
               new("ReutersSource", LoDSupport = FALSE, URI = object,  
                   Content = content, Position = 0)  
           })  
   
 setGeneric("step_next", function(object) standardGeneric("step_next"))  
 setMethod("step_next",  
           signature(object = "DirSource"),  
           function(object) {  
               object@Position <- object@Position + 1  
               object  
           })  
 setMethod("step_next",  
           signature(object = "CSVSource"),  
           function(object) {  
               object@Position <- object@Position + 1  
               object  
           })  
 setMethod("step_next",  
           signature(object = "ReutersSource"),  
           function(object) {  
               object@Position <- object@Position + 1  
               object  
           })  
   
 setGeneric("get_elem", function(object) standardGeneric("get_elem"))  
 setMethod("get_elem",  
           signature(object = "DirSource"),  
           function(object) {  
               filename <- object@FileList[object@Position]  
               list(content = readLines(filename),  
                    uri = substitute(file(filename)))  
           })  
 setMethod("get_elem",  
           signature(object = "CSVSource"),  
           function(object) {  
               list(content = object@Content[object@Position],  
                    uri = object@URI)  
           })  
 setMethod("get_elem",  
           signature(object = "ReutersSource"),  
           function(object) {  
               # Construct a character representation from the XMLNode  
               con <- textConnection("virtual.file", "w")  
               saveXML(object@Content[[object@Position]], con)  
               close(con)  
   
               list(content = virtual.file, uri = object@URI)  
           })  
   
 setGeneric("eoi", function(object) standardGeneric("eoi"))  
 setMethod("eoi",  
           signature(object = "DirSource"),  
           function(object) {  
               if (length(object@FileList) <= object@Position)  
                   return(TRUE)  
               else  
                   return(FALSE)  
           })  
 setMethod("eoi",  
           signature(object = "CSVSource"),  
           function(object) {  
               if (length(object@Content) <= object@Position)  
                   return(TRUE)  
               else  
                   return(FALSE)  
           })  
 setMethod("eoi",  
           signature(object = "ReutersSource"),  
           function(object) {  
               if (length(object@Content) <= object@Position)  
                   return(TRUE)  
               else  
                   return(FALSE)  
           })  
   
 plaintext_parser <- function(...) {  
     function(elem, lodsupport, load, id) {  
         if (!lodsupport || (lodsupport && load)) {  
             doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,  
                        Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")  
         }  
         else {  
             doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,  
                        Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")  
         }  
   
         return(doc)  
     }  
 }  
 class(plaintext_parser) <- "function_generator"  
   
 reut21578xml_parser <- function(...) {  
     function(elem, lodsupport, load, id) {  
         corpus <- paste(elem$content, "\n", collapse = "")  
         tree <- xmlTreeParse(corpus, asText = TRUE)  
         node <- xmlRoot(tree)  
   
         # Mask as list to bypass S4 checks  
         class(tree) <- "list"  
   
         # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!  
         if (!is.null(node[["TEXT"]][["AUTHOR"]]))  
             author <- xmlValue(node[["TEXT"]][["AUTHOR"]])  
         else  
             author <- ""  
   
         datetimestamp <- xmlValue(node[["DATE"]])  
         description <- ""  
         id <- xmlAttrs(node)[["NEWID"]]  
   
         # The <TITLE></TITLE> tag is unfortunately NOT obligatory!  
         if (!is.null(node[["TEXT"]][["TITLE"]]))  
             heading <- xmlValue(node[["TEXT"]][["TITLE"]])  
         else  
             heading <- ""  
   
         topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)  
   
         if (!lodsupport || (lodsupport && load)) {  
             doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,  
                        DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",  
                        Heading = heading, LocalMetaData = list(Topics = topics))  
         } else {  
             doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,  
                        DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",  
                        Heading = heading, LocalMetaData = list(Topics = topics))  
         }  
   
         return(doc)  
     }  
 }  
 class(reut21578xml_parser) <- "function_generator"  
   
 rcv1_parser <- function(...) {  
     function(elem, lodsupport, load, id) {  
         corpus <- paste(elem$content, "\n", collapse = "")  
         tree <- xmlTreeParse(corpus, asText = TRUE)  
         node <- xmlRoot(tree)  
   
         # Mask as list to bypass S4 checks  
         class(tree) <- "list"  
   
         datetimestamp <- xmlAttrs(node)[["date"]]  
         id <- xmlAttrs(node)[["itemid"]]  
         heading <- xmlValue(node[["title"]])  
   
         if (!lodsupport || (lodsupport && load)) {  
             doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",  
                        DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",  
                        Heading = heading)  
         } else {  
             doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",  
                        DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",  
                        Heading = heading)  
         }  
   
         return(doc)  
     }  
 }  
 class(rcv1_parser) <- "function_generator"  
   
 newsgroup_parser <- function(...) {  
     function(elem, lodsupport, load, id) {  
         mail <- elem$content  
         author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))  
         datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))  
         origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))  
         heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))  
         newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))  
   
         if (!lodsupport || (lodsupport && load)) {  
             # The header is separated from the body by a blank line.  
             # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}  
             for (index in seq(along = mail)) {  
                 if (mail[index] == "")  
                     break  
             }  
             content <- mail[(index + 1):length(mail)]  
   
             doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,  
                        Author = author, DateTimeStamp = datetimestamp,  
                        Description = "", ID = id, Origin = origin,  
                        Heading = heading, Newsgroup = newsgroup)  
         } else {  
             doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,  
                        Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)  
         }  
   
         return(doc)  
     }  
 }  
 class(newsgroup_parser) <- "function_generator"  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 rcv1_to_plain <- function(node, ...) {  
     datetimestamp <- xmlAttrs(node)[["date"]]  
     id <- xmlAttrs(node)[["itemid"]]  
     origin <- "Reuters Corpus Volume 1 XML"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
     heading <- xmlValue(node[["title"]])  
   
     new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,  
         Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 reut21578xml_to_plain <- function(node, ...) {  
     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["AUTHOR"]]))  
         author <- xmlValue(node[["TEXT"]][["AUTHOR"]])  
     else  
         author <- ""  
   
     datetimestamp <- xmlValue(node[["DATE"]])  
     description <- ""  
     id <- xmlAttrs(node)[["NEWID"]]  
   
     origin <- "Reuters-21578 XML"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
     else  
         corpus <- ""  
   
     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["TITLE"]]))  
         heading <- xmlValue(node[["TEXT"]][["TITLE"]])  
     else  
         heading <- ""  
   
     topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)  
   
     new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,  
         Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))  
 }  
   
35  setGeneric("load_doc", function(object, ...) standardGeneric("load_doc"))  setGeneric("load_doc", function(object, ...) standardGeneric("load_doc"))
36  setMethod("load_doc",  setMethod("load_doc",
37            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
# Line 399  Line 108 
108            signature(object = "TextDocCol", FUN = "function"),            signature(object = "TextDocCol", FUN = "function"),
109            function(object, FUN, ...) {            function(object, FUN, ...) {
110                result <- object                result <- object
111                  # Note that text corpora are automatically loaded into memory via \code{[[}
112                result@.Data <- lapply(object, FUN, ..., DMetaData = DMetaData(object))                result@.Data <- lapply(object, FUN, ..., DMetaData = DMetaData(object))
113                return(result)                return(result)
114            })            })
# Line 412  Line 122 
122  setMethod("as.plaintext_doc",  setMethod("as.plaintext_doc",
123            signature(object = "XMLTextDocument", FUN = "function"),            signature(object = "XMLTextDocument", FUN = "function"),
124            function(object, FUN, ...) {            function(object, FUN, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
125                corpus <- Corpus(object)                corpus <- Corpus(object)
126    
127                # As XMLDocument is no native S4 class, restore valid information                # As XMLDocument is no native S4 class, restore valid information
# Line 428  Line 135 
135  setMethod("tm_tolower",  setMethod("tm_tolower",
136            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
137            function(object, ...) {            function(object, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
138                Corpus(object) <- tolower(object)                Corpus(object) <- tolower(object)
139                return(object)                return(object)
140            })            })
# Line 439  Line 143 
143  setMethod("strip_whitespace",  setMethod("strip_whitespace",
144            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
145            function(object, ...) {            function(object, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
146                Corpus(object) <- gsub("[[:space:]]+", " ", object)                Corpus(object) <- gsub("[[:space:]]+", " ", object)
147                return(object)                return(object)
148            })            })
# Line 450  Line 151 
151  setMethod("stem_doc",  setMethod("stem_doc",
152            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
153            function(object, ...) {            function(object, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
154                require(Rstem)                require(Rstem)
155                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
156                stemmedCorpus <- wordStem(splittedCorpus)                stemmedCorpus <- wordStem(splittedCorpus)
# Line 464  Line 162 
162  setMethod("remove_words",  setMethod("remove_words",
163            signature(object = "PlainTextDocument", stopwords = "character"),            signature(object = "PlainTextDocument", stopwords = "character"),
164            function(object, stopwords, ...) {            function(object, stopwords, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
165                require(Rstem)                require(Rstem)
166                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
167                noStopwordsCorpus <- splittedCorpus[!splittedCorpus %in% stopwords]                noStopwordsCorpus <- splittedCorpus[!splittedCorpus %in% stopwords]
# Line 556  Line 251 
251  setMethod("fulltext_search_filter",  setMethod("fulltext_search_filter",
252            signature(object = "PlainTextDocument", pattern = "character"),            signature(object = "PlainTextDocument", pattern = "character"),
253            function(object, pattern, ...) {            function(object, pattern, ...) {
               if (!Cached(object))  
                   object <- load_doc(object)  
   
254                return(any(grep(pattern, Corpus(object))))                return(any(grep(pattern, Corpus(object))))
255            })            })
256    
# Line 646  Line 338 
338  setMethod("[[",  setMethod("[[",
339            signature(x = "TextDocCol", i = "ANY", j = "ANY"),            signature(x = "TextDocCol", i = "ANY", j = "ANY"),
340            function(x, i, j, ...) {            function(x, i, j, ...) {
341                return(x@.Data[[i, ...]])                return(load_doc(x@.Data[[i]]))
342            })            })
343    
344  setMethod("[[<-",  setMethod("[[<-",
# Line 691  Line 383 
383    
384  setMethod("c",  setMethod("c",
385            signature(x = "TextDocCol"),            signature(x = "TextDocCol"),
386            function(x, y, ..., meta = list(merge_date = date(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {            function(x, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {
387                if (!inherits(y, "TextDocCol"))                args <- list(...)
388                  if(length(args) == 0)
389                      return(x)
390    
391                  result <- x
392                  for (c in args) {
393                      if (!inherits(c, "TextDocCol"))
394                    stop("invalid argument")                    stop("invalid argument")
395                      result <- c2(result, c)
396                  }
397                  return(result)
398              })
399    
400    setGeneric("c2", function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2"))
401    setMethod("c2",
402              signature(x = "TextDocCol", y = "TextDocCol"),
403              function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {
404                object <- x                object <- x
405                # Concatenate data slots                # Concatenate data slots
406                object@.Data <- c(as(x, "list"), as(y, "list"))                object@.Data <- c(as(x, "list"), as(y, "list"))
# Line 743  Line 449 
449    
450                return(object)                return(object)
451            })            })
452    
453    
454  setMethod("c",  setMethod("c",
455            signature(x = "TextDocument"),            signature(x = "TextDocument"),
456            function(x, ..., recursive = TRUE){            function(x, ..., recursive = TRUE){
# Line 753  Line 461 
461                dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)                dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
462                dcmeta.node <- new("MetaDataNode",                dcmeta.node <- new("MetaDataNode",
463                              NodeID = 0,                              NodeID = 0,
464                              MetaData = list(create_date = date(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
465                              children = list())                              children = list())
466    
467                return(new("TextDocCol", .Data = list(x, ...), DMetaData = dmeta.df, DCMetaData = dcmeta.node))                return(new("TextDocCol", .Data = list(x, ...), DMetaData = dmeta.df, DCMetaData = dcmeta.node))

Legend:
Removed from v.78  
changed lines
  Added in v.689

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge