SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory | Revision Log | View Patch

revision 21, Sat Nov 19 10:23:19 2005 UTC | revision 22, Sat Nov 19 16:58:34 2005 UTC
# Line 20  Line 20 
20  # Constructors  # Constructors
21    
22  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
 # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  
23  setMethod("textdoccol",  setMethod("textdoccol",
24            c("character", "logical", "logical",  "character", "logical", "character", "integer", "integer", "logical"),            c("character", "character", "logical", "logical",  "character",
25            function(object, stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf", stemming = FALSE,              "logical", "character", "integer", "integer", "logical"),
26                     language = "german", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",
27                       stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {
28    
29                  # Add a new type for each unique input source format
30                  type <- match.arg(inputType,c("RCV1","CSV"))
31                  switch(type,
32                         # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
33                         "RCV1" = {
34                require(XML)                require(XML)
35    
36                tree <- xmlTreeParse(object)                tree <- xmlTreeParse(object)
37                tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))                tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
38                tdcl@tdm = termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)                       },
39                         # Text in CSV format (as e.g. exported from an Excel sheet)
40                         "CSV" = {
41                             m <- as.matrix(read.csv(object))
42                             l <- vector("list", dim(m)[1])
43                             for (i in 1:dim(m)[1]) {
44                                 author <- "Not yet implemented"
45                                 timestamp <- date()
46                                 description <- "Not yet implemented"
47                                 id <- i
48                                 corpus <- as.character(m[i,2:dim(m)[2]])
49                                 if (stripWhiteSpace)
50                                     corpus <- gsub("[[:space:]]+", " ", corpus)
51                                 if (toLower)
52                                     corpus <- tolower(corpus)
53                                 origin <- "Not yet implemented"
54                                 heading <- "Not yet implemented"
55    
56                                 l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
57                                     description = description, id = id, origin = origin, heading = heading)
58                             }
59                             tdcl <- new("textdoccol", .Data = l)
60                         }
61                         )
62    
63                  tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)
64    
65                tdcl                tdcl
66            })            })
67    
68  # TODO: Implement lacking fields.  # Parse a <newsitem></newsitem> element from a valid RCV1 XML file
 # For this we need the full RCV1 XML set to know where to find those things  
69  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
70      author <- "Not yet implemented"      author <- "Not yet implemented"
71      timestamp <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]

Legend:
Removed from v.21
Changed lines
Added in v.22

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge