SCM

SCM Repository

[tm] Diff of /trunk/R/textmin/R/textdoccol.R
ViewVC logotype

Diff of /trunk/R/textmin/R/textdoccol.R

Parent Directory | Revision Log | View Patch

revision 17, Sat Nov 5 14:47:12 2005 UTC | revision 18, Sat Nov 5 19:00:05 2005 UTC
# Line 16  Line 16 
16  }  }
17  setMethod("docs", "textdoccol", function(object) object@docs)  setMethod("docs", "textdoccol", function(object) object@docs)
18    
19    setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))
20  # Read in XML text documents  # Read in XML text documents
21  # Reuters Corpus Volume 1 (RCV1)  # Reuters Corpus Volume 1 (RCV1)
22  readXML <- function(file) {  setMethod("textdoccol", "character", function(docs) {
23      tree <- xmlTreeParse(file)      require(XML)
24    
25        tree <- xmlTreeParse(docs)
26      root <- xmlRoot(tree)      root <- xmlRoot(tree)
27    
28      # TODO: At each loop node points to the current newsitem      # TODO: At each loop node points to the current newsitem
# Line 28  Line 31 
31      # TODO: Implement lacking fields.      # TODO: Implement lacking fields.
32      # For this we need the full RCV1 XML set to know where to find those things      # For this we need the full RCV1 XML set to know where to find those things
33      author <- "Not yet implemented"      author <- "Not yet implemented"
34      date <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]
35      description <- "Not yet implemented"      description <- "Not yet implemented"
36      id <- as.integer(xmlAttrs(node)[["itemid"]])      id <- as.integer(xmlAttrs(node)[["itemid"]])
37      origin <- "Not yet implemented"      origin <- "Not yet implemented"
38      text <- xmlSApply(node[["text"]], xmlValue)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
     title <- xmlValue(node[["title"]])  
39    
40      doc <- new("textdocument", author = author, date = date, description = description,      heading <- xmlValue(node[["title"]])
                id = id, origin = origin, text = text, title = title)  
41    
42      new("textdoccol", docs = list(doc), matrix = ())      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,
43  }                 id = id, origin = origin, corpus = corpus, heading = heading)
44    
45  setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))      new("textdoccol", docs = list(doc), tdm = matrix())
46  setMethod("textdoccol", "file", readXML)  })

Legend:
Removed from v.17  
changed lines
  Added in v.18

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge