SCM

SCM Repository

[tm] Diff of /trunk/R/textmin/R/textdoccol.R
ViewVC logotype

Diff of /trunk/R/textmin/R/textdoccol.R

Parent Directory | Revision Log | View Patch

revision 20, Tue Nov 8 16:40:52 2005 UTC | revision 21, Sat Nov 19 10:23:19 2005 UTC
# Line 2  Line 2 
2    
3  # S4 class definition  # S4 class definition
4  # Text document collection  # Text document collection
5  setClass("textdoccol", representation(docs = "list",  setClass("textdoccol",
6                                        tdm = "termdocmatrix"))           representation(tdm = "termdocmatrix"),
7             contains = c("list"))
8  # Accessor function  
9  if (!isGeneric("docs")) {  # Accessor functions as described in "S4 Classes in 15 pages, more or less"
10      if (is.function("docs"))  
11          fun <- docs  if (!isGeneric("tdm")) {
12        if (is.function("tdm"))
13            fun <- tdm
14      else      else
15          fun <- function(object) standardGeneric("docs")          fun <- function(object) standardGeneric("tdm")
16      setGeneric("docs", fun)      setGeneric("tdm", fun)
17  }  }
18  setMethod("docs", "textdoccol", function(object) object@docs)  setMethod("tdm", "textdoccol", function(object) object@tdm)
19    
20    # Constructors
21    
22  setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
23  # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
24  setMethod("textdoccol", "character", function(object) {  setMethod("textdoccol",
25              c("character", "logical", "logical",  "character", "logical", "character", "integer", "integer", "logical"),
26              function(object, stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf", stemming = FALSE,
27                       language = "german", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {
28      require(XML)      require(XML)
29    
30      tree <- xmlTreeParse(object)      tree <- xmlTreeParse(object)
31      new("textdoccol", docs = xmlApply(xmlRoot(tree), parseNewsItem))                tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
32                  tdcl@tdm = termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)
33    
34                  tdcl
35  })  })
36    
37  # TODO: Implement lacking fields.  # TODO: Implement lacking fields.
38  # For this we need the full RCV1 XML set to know where to find those things  # For this we need the full RCV1 XML set to know where to find those things
39  parseNewsItem <- function(node) {  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
40      author <- "Not yet implemented"      author <- "Not yet implemented"
41      timestamp <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]
42      description <- "Not yet implemented"      description <- "Not yet implemented"
43      id <- as.integer(xmlAttrs(node)[["itemid"]])      id <- as.integer(xmlAttrs(node)[["itemid"]])
44      origin <- "Not yet implemented"      origin <- "Not yet implemented"
     # TODO: Concatenate list elements (= XML paragraphs) to a single string  
45      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
46    
47        if (stripWhiteSpace)
48            corpus <- gsub("[[:space:]]+", " ", corpus)
49        if (toLower)
50            corpus <- tolower(corpus)
51    
52      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
53    
54      new("textdocument", author = author, timestamp = timestamp, description = description,      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
55          id = id, origin = origin, corpus = corpus, heading = heading)          description = description, id = id, origin = origin, heading = heading)
56  }  }
   
 # If necessary build the term-document matrix for a given text document collection  
 setGeneric("buildTDM", function(object) standardGeneric("buildTDM"))  
 setMethod("buildTDM", "textdoccol", function(object) {  
     termdocmatrix(docs(object))  
 })  

Legend:
Removed from v.20  
changed lines
  Added in v.21

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge