SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 21 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 :     # S4 class definition
4 :     # Text document collection
5 : feinerer 21 setClass("textdoccol",
6 :     representation(tdm = "termdocmatrix"),
7 :     contains = c("list"))
8 : feinerer 17
9 : feinerer 21 # Accessor functions as described in "S4 Classes in 15 pages, more or less"
10 :    
11 :     if (!isGeneric("tdm")) {
12 :     if (is.function("tdm"))
13 :     fun <- tdm
14 : feinerer 17 else
15 : feinerer 21 fun <- function(object) standardGeneric("tdm")
16 :     setGeneric("tdm", fun)
17 : feinerer 17 }
18 : feinerer 21 setMethod("tdm", "textdoccol", function(object) object@tdm)
19 : feinerer 17
20 : feinerer 21 # Constructors
21 :    
22 :     setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
23 : feinerer 19 # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
24 : feinerer 21 setMethod("textdoccol",
25 :     c("character", "logical", "logical", "character", "logical", "character", "integer", "integer", "logical"),
26 :     function(object, stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf", stemming = FALSE,
27 :     language = "german", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {
28 :     require(XML)
29 : feinerer 18
30 : feinerer 21 tree <- xmlTreeParse(object)
31 :     tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
32 :     tdcl@tdm = termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)
33 : feinerer 17
34 : feinerer 21 tdcl
35 :     })
36 :    
37 : feinerer 19 # TODO: Implement lacking fields.
38 :     # For this we need the full RCV1 XML set to know where to find those things
39 : feinerer 21 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
40 : feinerer 17 author <- "Not yet implemented"
41 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
42 : feinerer 17 description <- "Not yet implemented"
43 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
44 :     origin <- "Not yet implemented"
45 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
46 : feinerer 21
47 :     if (stripWhiteSpace)
48 :     corpus <- gsub("[[:space:]]+", " ", corpus)
49 :     if (toLower)
50 :     corpus <- tolower(corpus)
51 :    
52 : feinerer 18 heading <- xmlValue(node[["title"]])
53 : feinerer 17
54 : feinerer 21 new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
55 :     description = description, id = id, origin = origin, heading = heading)
56 : feinerer 19 }

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge