SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 22 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
# S4 class definition: text document collection.
#
# A "textdoccol" is a list (its data part) that additionally carries one slot:
#   tdm  an object of class "termdocmatrix" (that class is defined elsewhere).
setClass("textdoccol",
         representation(tdm = "termdocmatrix"),
         contains = "list")
8 : feinerer 17
# Accessor functions as described in "S4 Classes in 15 pages, more or less"

# Create the "tdm" generic only when no generic of that name is visible yet,
# so that an already existing generic is not replaced.
if (!isGeneric("tdm")) {
    setGeneric("tdm", function(object) standardGeneric("tdm"))
}

# Accessor method: returns the contents of the `tdm` slot of a "textdoccol".
setMethod("tdm", "textdoccol", function(object) object@tdm)
19 : feinerer 17
# Constructors

# Generic constructor for text document collections. `object` is the only
# argument available for method dispatch; everything else travels via `...`.
setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))

# Build a "textdoccol" from the input named by a character `object`.
#
# Arguments:
#   object           passed to xmlTreeParse() (RCV1) or read.csv() (CSV).
#   inputType        input format, one of "RCV1" or "CSV".
#   stripWhiteSpace  if TRUE, runs of whitespace are collapsed to one blank.
#   toLower          if TRUE, the text is converted to lower case.
#   weighting, stemming, language, minWordLength, minDocFreq, stopwords
#                    handed on unchanged, in this order, to termdocmatrix().
#
# Returns the new "textdoccol" with its `tdm` slot set to the result of
# termdocmatrix().
setMethod("textdoccol",
          # The generic can only dispatch on `object`, so the signature names
          # that one class; the remaining arguments are matched through `...`.
          signature(object = "character"),
          function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",
                   stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {

              # Add a new type for each unique input source format
              type <- match.arg(inputType, c("RCV1", "CSV"))

              tdcl <- switch(type,
                  # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
                  "RCV1" = {
                      # library() stops with an error when XML is not installed
                      # and attaches it, which parseNewsItem() relies on.
                      library(XML)

                      tree <- xmlTreeParse(object)
                      new("textdoccol",
                          .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
                  },
                  # Text in CSV format (as e.g. exported from an Excel sheet)
                  "CSV" = {
                      m <- as.matrix(read.csv(object))

                      # One document per row. seq_len() yields no iterations
                      # for a CSV without data rows.
                      docs <- lapply(seq_len(nrow(m)), function(i) {
                          # Every column except the first is document text;
                          # a negative index stays valid for a one-column file.
                          corpus <- as.character(m[i, -1])
                          if (stripWhiteSpace)
                              corpus <- gsub("[[:space:]]+", " ", corpus)
                          if (toLower)
                              corpus <- tolower(corpus)

                          new("textdocument", .Data = corpus,
                              author = "Not yet implemented",
                              timestamp = date(),
                              description = "Not yet implemented",
                              id = i,
                              origin = "Not yet implemented",
                              heading = "Not yet implemented")
                      })
                      new("textdoccol", .Data = docs)
                  })

              tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language,
                                        minWordLength, minDocFreq, stopwords)

              tdcl
          })
67 :    
# Parse a <newsitem></newsitem> element from a valid RCV1 XML file
#
# Arguments:
#   node             XML node of one news item; its "date" and "itemid"
#                    attributes and its "text" and "title" children are read.
#   stripWhiteSpace  if TRUE, runs of whitespace are collapsed to one blank.
#   toLower          if TRUE, the text is converted to lower case.
#
# Returns a new "textdocument" object. Author, description and origin are
# placeholders ("Not yet implemented").
#
# Uses functions of the XML package unqualified, so XML must be attached.
parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    attrs <- xmlAttrs(node)
    timestamp <- attrs[["date"]]
    id <- as.integer(attrs[["itemid"]])

    # One character element per child of <text>
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    heading <- xmlValue(node[["title"]])

    new("textdocument", .Data = corpus,
        author = "Not yet implemented",
        timestamp = timestamp,
        description = "Not yet implemented",
        id = id,
        origin = "Not yet implemented",
        heading = heading)
}

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge