SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory | Revision Log


Revision 17 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
# S4 class: text document collection.
#   docs - list holding the documents of the collection
#   tdm  - term-document matrix, currently a plain base matrix
# TODO: Define a proper S4 term-document matrix class.
setClass(
  Class = "textdoccol",
  representation = representation(
    docs = "list",
    tdm  = "matrix"
  )
)
8 :    
# Accessor generic for the documents stored in a collection.
#
# The generic is created only when no "docs" generic is visible yet, so an
# already existing generic is left untouched.
#
# The earlier version additionally tested `is.function("docs")` in order to
# reuse an existing plain function.  That test inspected the character string
# "docs" itself, which is never a function, so the branch could not run.  The
# dead branch is removed; the generic that gets created is the same as before.
if (!isGeneric("docs")) {
  setGeneric("docs", function(object) standardGeneric("docs"))
}

# Method for "textdoccol": return the content of the `docs` slot.
setMethod("docs", "textdoccol", function(object) object@docs)
18 :    
# Read an XML text document in Reuters Corpus Volume 1 (RCV1) format.
#
# Args:
#   file: handed unchanged to xmlTreeParse() as the XML source.
#
# Returns:
#   A "textdoccol" object whose `docs` slot is a one-element list holding the
#   "textdocument" built from the XML root node, and whose `tdm` slot is an
#   empty (0 x 0) matrix.
readXML <- function(file) {
  # Only the root node is processed at the moment.
  # TODO: Loop over the news items so that `node` points to the current one.
  node <- xmlRoot(xmlTreeParse(file))
  attrs <- xmlAttrs(node)

  # TODO: Implement lacking fields.
  # For this we need the full RCV1 XML set to know where to find those things.
  placeholder <- "Not yet implemented"

  doc <- new("textdocument",
             author = placeholder,
             date = attrs[["date"]],
             description = placeholder,
             id = as.integer(attrs[["itemid"]]),
             origin = placeholder,
             text = xmlSApply(node[["text"]], xmlValue),
             title = xmlValue(node[["title"]]))

  # The previous call passed `matrix = ()`: empty parentheses do not parse,
  # and the class has no slot called `matrix`.  The slot is `tdm`; it is
  # filled with an empty matrix because no term-document matrix is computed
  # here.
  new("textdoccol", docs = list(doc), tdm = matrix(nrow = 0L, ncol = 0L))
}
43 :    
# Generic constructor for text document collections; dispatches on `object`.
setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))

# Method for objects of class "file": delegate to readXML().
# readXML() names its argument `file`, whereas the generic's formal argument
# is `object`.  Registering readXML directly makes setMethod() rewrite the
# definition (with a warning), so a wrapper with matching formals is
# registered instead.
setMethod("textdoccol", "file", function(object) readXML(object))

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business | University of Wisconsin - Madison | Powered By FusionForge