SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 20 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
# S4 class "textdoccol": a collection of text documents.
# Slots:
#   docs - list of the documents in the collection
#   tdm  - object of class "termdocmatrix"
setClass(
  "textdoccol",
  representation = representation(
    docs = "list",
    tdm = "termdocmatrix"
  )
)
7 : feinerer 17
# Accessor generic "docs".
# The generic is only created when none exists yet. If an ordinary function
# named "docs" is already visible, it is handed to setGeneric() as the
# definition; otherwise a standard generic dispatching on `object` is created.
# Note: the name has to be looked up with exists()/get(); is.function("docs")
# would test the character string itself and is therefore always FALSE.
if (!isGeneric("docs")) {
  if (exists("docs", mode = "function")) {
    fun <- get("docs", mode = "function")
  } else {
    fun <- function(object) standardGeneric("docs")
  }
  setGeneric("docs", fun)
}

# Method for "textdoccol": return the contents of the `docs` slot.
setMethod("docs", "textdoccol", function(object) object@docs)
17 :    
# Generic constructor for text document collections; dispatches on `object`.
setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))

# Method for a character argument: read in text documents in XML Reuters
# Corpus Volume 1 (RCV1) format.
# `object` is passed to xmlTreeParse(); every child of the root node is
# converted by parseNewsItem() and the resulting list fills the `docs` slot.
setMethod("textdoccol", "character", function(object) {
  # library() stops with an error when XML is not installed, whereas
  # require() would only return FALSE and let the code fail further down.
  # The package is attached (not just loaded) because parseNewsItem() calls
  # the XML functions unqualified.
  library(XML)

  tree <- xmlTreeParse(object)
  new("textdoccol", docs = xmlApply(xmlRoot(tree), parseNewsItem))
})
26 : feinerer 17
# Convert a single news item XML node into a "textdocument" object.
#
# TODO: Implement lacking fields.
# For this we need the full RCV1 XML set to know where to find those things
#
# node: XML node whose "date" and "itemid" attributes and whose "text" and
#       "title" children are read.
# Returns a new "textdocument"; author, description and origin are filled
# with a placeholder string.
parseNewsItem <- function(node) {
  placeholder <- "Not yet implemented"

  attrs <- xmlAttrs(node)
  item_date <- attrs[["date"]]
  item_id <- as.integer(attrs[["itemid"]])

  # TODO: Concatenate list elements (= XML paragraphs) to a single string
  paragraphs <- xmlApply(node[["text"]], xmlValue)
  body_text <- unlist(paragraphs, use.names = FALSE)
  title <- xmlValue(node[["title"]])

  new(
    "textdocument",
    author = placeholder,
    timestamp = item_date,
    description = placeholder,
    id = item_id,
    origin = placeholder,
    corpus = body_text,
    heading = title
  )
}
42 : feinerer 20
# Generic "buildTDM": if necessary build the term-document matrix for a given
# text document collection. Dispatches on `object`.
setGeneric("buildTDM", function(object) standardGeneric("buildTDM"))

# Method for "textdoccol": fetch the documents through the docs() accessor
# and return whatever termdocmatrix() produces for them.
setMethod("buildTDM", "textdoccol", function(object) {
  documents <- docs(object)
  termdocmatrix(documents)
})

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge