SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory | Revision Log


Revision 18 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 :     # S4 class definition
4 :     # Text document collection
5 :     # TODO: Define proper S4 term-document matrix
6 :     setClass("textdoccol", representation(docs = "list",
7 :     tdm = "matrix"))
8 :    
9 :     # Accessor function
10 :     if (!isGeneric("docs")) {
11 :     if (is.function("docs"))
12 :     fun <- docs
13 :     else
14 :     fun <- function(object) standardGeneric("docs")
15 :     setGeneric("docs", fun)
16 :     }
17 :     setMethod("docs", "textdoccol", function(object) object@docs)
18 :    
19 : feinerer 18 setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))
20 : feinerer 17 # Read in XML text documents
21 :     # Reuters Corpus Volume 1 (RCV1)
22 : feinerer 18 setMethod("textdoccol", "character", function(docs) {
23 :     require(XML)
24 :    
25 :     tree <- xmlTreeParse(docs)
26 : feinerer 17 root <- xmlRoot(tree)
27 :    
28 :     # TODO: At each loop iteration, node should point to the current newsitem
29 :     node <- root
30 :    
31 :     # TODO: Implement the missing fields.
32 :     # For this we need the full RCV1 XML set to know where to find those things
33 :     author <- "Not yet implemented"
34 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
35 : feinerer 17 description <- "Not yet implemented"
36 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
37 :     origin <- "Not yet implemented"
38 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
39 : feinerer 17
40 : feinerer 18 heading <- xmlValue(node[["title"]])
41 : feinerer 17
42 : feinerer 18 doc <- new("textdocument", author = author, timestamp = timestamp, description = description,
43 :     id = id, origin = origin, corpus = corpus, heading = heading)
44 : feinerer 17
45 : feinerer 18 new("textdoccol", docs = list(doc), tdm = matrix())
46 :     })

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge