SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 23 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 :     # S4 class definition
4 :     # Text document collection
5 : feinerer 21 setClass("textdoccol",
6 :     representation(tdm = "termdocmatrix"),
7 :     contains = c("list"))
8 : feinerer 17
9 : feinerer 21 # Accessor functions as described in "S4 Classes in 15 pages, more or less"
10 :    
11 :     if (!isGeneric("tdm")) {
12 :     if (is.function("tdm"))
13 :     fun <- tdm
14 : feinerer 17 else
15 : feinerer 21 fun <- function(object) standardGeneric("tdm")
16 :     setGeneric("tdm", fun)
17 : feinerer 17 }
18 : feinerer 21 setMethod("tdm", "textdoccol", function(object) object@tdm)
19 : feinerer 17
20 : feinerer 21 # Constructors
21 :    
22 :     setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
23 :     setMethod("textdoccol",
24 : feinerer 22 c("character", "character", "logical", "logical", "character",
25 :     "logical", "character", "integer", "integer", "logical"),
26 :     function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",
27 :     stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {
28 : feinerer 18
29 : feinerer 22 # Add a new type for each unique input source format
30 : feinerer 23 type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
31 : feinerer 22 switch(type,
32 :     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
33 :     "RCV1" = {
34 :     require(XML)
35 : feinerer 17
36 : feinerer 22 tree <- xmlTreeParse(object)
37 :     tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
38 :     },
39 :     # Text in CSV format (as e.g. exported from an Excel sheet)
40 :     "CSV" = {
41 :     m <- as.matrix(read.csv(object))
42 :     l <- vector("list", dim(m)[1])
43 :     for (i in 1:dim(m)[1]) {
44 :     author <- "Not yet implemented"
45 :     timestamp <- date()
46 :     description <- "Not yet implemented"
47 :     id <- i
48 :     corpus <- as.character(m[i,2:dim(m)[2]])
49 :     if (stripWhiteSpace)
50 :     corpus <- gsub("[[:space:]]+", " ", corpus)
51 :     if (toLower)
52 :     corpus <- tolower(corpus)
53 :     origin <- "Not yet implemented"
54 :     heading <- "Not yet implemented"
55 :    
56 :     l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
57 :     description = description, id = id, origin = origin, heading = heading)
58 :     }
59 :     tdcl <- new("textdoccol", .Data = l)
60 : feinerer 23 },
61 :     # Read in text documents in Reuters-21578 XML (not SGML) format
62 :     # Typically the first argument will be a directory where we can
63 :     # find the files reut2-000.xml ... reut2-021.xml
64 :     "REUT21578" = {
65 :     require(XML)
66 :    
67 :     # TODO: Read in a whole directory of XML files
68 :     # lapply(dir(object, full.names = TRUE), function)
69 :     # object is for the moment still a single XML file
70 :     tree <- xmlTreeParse(object)
71 :     tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower))
72 : feinerer 22 }
73 :     )
74 :    
75 :     tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)
76 :    
77 : feinerer 21 tdcl
78 :     })
79 :    
80 : feinerer 23 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
81 : feinerer 21 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
82 : feinerer 17 author <- "Not yet implemented"
83 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
84 : feinerer 17 description <- "Not yet implemented"
85 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
86 :     origin <- "Not yet implemented"
87 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
88 : feinerer 21
89 :     if (stripWhiteSpace)
90 :     corpus <- gsub("[[:space:]]+", " ", corpus)
91 :     if (toLower)
92 :     corpus <- tolower(corpus)
93 :    
94 : feinerer 18 heading <- xmlValue(node[["title"]])
95 : feinerer 17
96 : feinerer 21 new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
97 :     description = description, id = id, origin = origin, heading = heading)
98 : feinerer 19 }
99 : feinerer 23
100 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
101 :     parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
102 :     author <- "Not yet implemented"
103 :     timestamp <- xmlValue(node[["DATE"]])
104 :     description <- "Not yet implemented"
105 :     id <- as.integer(xmlAttrs(node)[["NEWID"]])
106 :    
107 :     origin <- "Not yet implemented"
108 :    
109 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
110 :     if (!is.null(node[["TEXT"]][["BODY"]]))
111 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
112 :     else
113 :     corpus <- ""
114 :    
115 :     if (stripWhiteSpace)
116 :     corpus <- gsub("[[:space:]]+", " ", corpus)
117 :     if (toLower)
118 :     corpus <- tolower(corpus)
119 :    
120 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
121 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
122 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
123 :     else
124 :     heading <- ""
125 :    
126 :     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
127 :     description = description, id = id, origin = origin, heading = heading)
128 :     }

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge