SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory | Revision Log | View Patch

revision 18, Sat Nov 5 19:00:05 2005 UTC | revision 19, Sun Nov 6 15:38:48 2005 UTC
# Line 16  Line 16 
16  }  }
17  setMethod("docs", "textdoccol", function(object) object@docs)  setMethod("docs", "textdoccol", function(object) object@docs)
18    
19  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))  setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))
20  # Read in XML text documents  # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
21  # Reuters Corpus Volume 1 (RCV1)  setMethod("textdoccol", "character", function(object) {
 setMethod("textdoccol", "character", function(docs) {  
22      require(XML)      require(XML)
23    
24      tree <- xmlTreeParse(docs)      tree <- xmlTreeParse(object)
25      root <- xmlRoot(tree)      new("textdoccol", docs = xmlApply(xmlRoot(tree), parseNewsItem), tdm = matrix())
26    })
     # TODO: At each loop node points to the current newsitem  
     node <- root  
27    
28      # TODO: Implement lacking fields.      # TODO: Implement lacking fields.
29      # For this we need the full RCV1 XML set to know where to find those things      # For this we need the full RCV1 XML set to know where to find those things
30    parseNewsItem <- function(node) {
31      author <- "Not yet implemented"      author <- "Not yet implemented"
32      timestamp <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]
33      description <- "Not yet implemented"      description <- "Not yet implemented"
34      id <- as.integer(xmlAttrs(node)[["itemid"]])      id <- as.integer(xmlAttrs(node)[["itemid"]])
35      origin <- "Not yet implemented"      origin <- "Not yet implemented"
36        # TODO: Concatenate list elements (= XML paragraphs) to a single string
37      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
   
38      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
39    
40      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,      new("textdocument", author = author, timestamp = timestamp, description = description,
41                 id = id, origin = origin, corpus = corpus, heading = heading)                 id = id, origin = origin, corpus = corpus, heading = heading)
42    }
     new("textdoccol", docs = list(doc), tdm = matrix())  
 })  

Legend:
Removed from v.18  
Changed lines
  Added in v.19

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge