SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 49 - (view) (download)
Original Path: trunk/R/textmin/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 : feinerer 49 setGeneric("TextDocCol", function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) standardGeneric("TextDocCol"))
4 :     setMethod("TextDocCol",
5 : feinerer 42 c("character"),
6 : feinerer 37 function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {
7 : feinerer 22 # Add a new type for each unique input source format
8 : feinerer 49 type <- match.arg(inputType,c("CSV", "RCV1_PLAIN", "REUT21578_PLAIN", "REUT21578_XML", "RIS"))
9 : feinerer 22 switch(type,
10 : feinerer 37 # Text in a special CSV format
11 :     # For details on the file format see the R documentation file
12 :     # The first argument is a directory with .csv files
13 :     "CSV" = {
14 : feinerer 41 filelist <- dir(object, pattern = ".csv", full.names = TRUE)
15 : feinerer 39 tdl <- sapply(filelist,
16 : feinerer 37 function(file) {
17 :     m <- as.matrix(read.csv(file, header = FALSE))
18 :     l <- vector("list", dim(m)[1])
19 :     for (i in 1:dim(m)[1]) {
20 :     author <- ""
21 : feinerer 42 datetimestamp <- date()
22 : feinerer 37 description <- ""
23 :     id <- as.integer(m[i,1])
24 :     corpus <- as.character(m[i,2:dim(m)[2]])
25 :     if (stripWhiteSpace)
26 :     corpus <- gsub("[[:space:]]+", " ", corpus)
27 :     if (toLower)
28 :     corpus <- tolower(corpus)
29 :     origin <- "CSV"
30 :     heading <- ""
31 :    
32 : feinerer 49 l[[i]] <- new("PlainTextDocument", .Data = corpus, Author = author, DateTimeStamp = datetimestamp,
33 :     Description = description, ID = id, Origin = origin, Heading = heading)
34 : feinerer 37 }
35 :     l
36 :     })
37 : feinerer 39 if (length(filelist) > 1)
38 : feinerer 49 tdcl <- new("TextDocCol", .Data = unlist(tdl, recursive = FALSE))
39 : feinerer 39 else
40 : feinerer 49 tdcl <- new("TextDocCol", .Data = tdl)
41 : feinerer 37 },
42 : feinerer 22 # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
43 : feinerer 37 # The first argument is a directory with the RCV1 XML files
44 : feinerer 49 "RCV1_PLAIN" = {
45 : feinerer 40 filelist <- dir(object, pattern = ".xml", full.names = TRUE)
46 : feinerer 39 tdl <- sapply(filelist,
47 : feinerer 37 function(file) {
48 :     tree <- xmlTreeParse(file)
49 : feinerer 49 xmlApply(xmlRoot(tree), parseNewsItemPlain, stripWhiteSpace, toLower)
50 : feinerer 37 })
51 : feinerer 39 if (length(filelist) > 1)
52 : feinerer 49 tdcl <- new("TextDocCol", .Data = unlist(tdl, recursive = FALSE))
53 : feinerer 39 else
54 : feinerer 49 tdcl <- new("TextDocCol", .Data = tdl)
55 : feinerer 22 },
56 : feinerer 23 # Read in text documents in Reuters-21578 XML (not SGML) format
57 :     # Typically the first argument will be a directory where we can
58 :     # find the files reut2-000.xml ... reut2-021.xml
59 : feinerer 49 "REUT21578_PLAIN" = {
60 : feinerer 40 filelist <- dir(object, pattern = ".xml", full.names = TRUE)
61 : feinerer 39 tdl <- sapply(filelist,
62 : feinerer 24 function(file) {
63 :     tree <- xmlTreeParse(file)
64 : feinerer 49 xmlApply(xmlRoot(tree), parseReutersPlain, stripWhiteSpace, toLower)
65 : feinerer 24 })
66 : feinerer 39 if (length(filelist) > 1)
67 : feinerer 49 tdcl <- new("TextDocCol", .Data = unlist(tdl, recursive = FALSE))
68 : feinerer 39 else
69 : feinerer 49 tdcl <- new("TextDocCol", .Data = tdl)
70 : feinerer 40 },
71 : feinerer 49 "REUT21578_XML" = {
72 :     filelist <- dir(object, pattern = ".xml", full.names = TRUE)
73 :     tdl <- sapply(filelist,
74 :     function(file) {
75 :     parseReutersXML(file)
76 :     })
77 :     if (length(filelist) > 1)
78 :     tdcl <- new("TextDocCol", .Data = unlist(tdl, recursive = FALSE))
79 :     else
80 :     tdcl <- new("TextDocCol", .Data = tdl)
81 :     },
82 : feinerer 40 # Read in HTML documents as used by http://ris.bka.gv.at/vwgh
83 :     "RIS" = {
84 :     filelist <- dir(object, pattern = ".html", full.names = TRUE)
85 :     tdl <- sapply(filelist,
86 :     function(file) {
87 : feinerer 41 # Ignore warnings from misformed HTML documents
88 : feinerer 49 suppressWarnings(RISDoc <- parseHTMLPlain(file, stripWhiteSpace, toLower))
89 : feinerer 41 if (!is.null(RISDoc)) {
90 :     l <- list()
91 :     l[[length(l) + 1]] <- RISDoc
92 :     l
93 :     }
94 : feinerer 40 })
95 : feinerer 49 tdcl <- new("TextDocCol", .Data = tdl)
96 : feinerer 24 })
97 : feinerer 21 tdcl
98 :     })
99 :    
100 : feinerer 42 # Parse an Austrian RIS HTML document
101 : feinerer 49 parseHTMLPlain <- function(file, stripWhiteSpace = FALSE, toLower = FALSE) {
102 : feinerer 40 author <- ""
103 : feinerer 42 datetimestamp <- date()
104 : feinerer 40 description <- ""
105 :    
106 :     tree <- htmlTreeParse(file)
107 :     htmlElem <- unlist(tree$children$html$children)
108 : feinerer 41
109 :     if (is.null(htmlElem))
110 :     stop(paste("Empty document", file, "cannot be processed."))
111 :    
112 : feinerer 40 textElem <- htmlElem[which(regexpr("text.value", names(htmlElem)) > 0)]
113 :     names(textElem) <- NULL
114 :    
115 :     corpus <- paste(textElem, collapse = " ")
116 : feinerer 41
117 :     year <- substring(corpus, regexpr("..../../", corpus), regexpr("..../../", corpus) + 3)
118 :     senat <- substring(corpus, regexpr("..../../", corpus) + 5, regexpr("..../../", corpus) + 6)
119 :     number <- substring(corpus, regexpr("..../../", corpus) + 8, regexpr("..../../", corpus) + 11)
120 :    
121 :     id <- as.integer(paste(year, senat, number, sep = ""))
122 :    
123 :     if (is.na(id))
124 :     stop(paste("Cannot extract 'Geschaeftszahl' out of malformed document", file))
125 : feinerer 40 origin <- ""
126 :    
127 :     if (stripWhiteSpace)
128 :     corpus <- gsub("[[:space:]]+", " ", corpus)
129 :     if (toLower)
130 :     corpus <- tolower(corpus)
131 :    
132 :     heading <- ""
133 :    
134 : feinerer 49 new("PlainTextDocument", .Data = corpus, Author = author, DateTimeStamp = datetimestamp,
135 :     Description = description, ID = id, Origin = origin, Heading = heading)
136 : feinerer 40 }
137 :    
138 : feinerer 23 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
139 : feinerer 49 parseNewsItemPlain <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
140 : feinerer 17 author <- "Not yet implemented"
141 : feinerer 42 datetimestamp <- xmlAttrs(node)[["date"]]
142 : feinerer 17 description <- "Not yet implemented"
143 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
144 : feinerer 36 origin <- "Reuters Corpus Volume 1 XML"
145 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
146 : feinerer 21
147 :     if (stripWhiteSpace)
148 :     corpus <- gsub("[[:space:]]+", " ", corpus)
149 :     if (toLower)
150 :     corpus <- tolower(corpus)
151 :    
152 : feinerer 18 heading <- xmlValue(node[["title"]])
153 : feinerer 17
154 : feinerer 49 new("PlainTextDocument", .Data = corpus, Author = author, DateTimeStamp = datetimestamp,
155 :     Description = description, ID = id, Origin = origin, Heading = heading)
156 : feinerer 19 }
157 : feinerer 23
158 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
159 : feinerer 49 parseReutersPlain <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
160 : feinerer 36 # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
161 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
162 :     author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
163 :     else
164 :     author <- ""
165 :    
166 : feinerer 42 datetimestamp <- xmlValue(node[["DATE"]])
167 : feinerer 36 description <- ""
168 : feinerer 23 id <- as.integer(xmlAttrs(node)[["NEWID"]])
169 :    
170 : feinerer 36 origin <- "Reuters-21578 XML"
171 : feinerer 23
172 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
173 :     if (!is.null(node[["TEXT"]][["BODY"]]))
174 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
175 :     else
176 :     corpus <- ""
177 :    
178 :     if (stripWhiteSpace)
179 :     corpus <- gsub("[[:space:]]+", " ", corpus)
180 :     if (toLower)
181 :     corpus <- tolower(corpus)
182 :    
183 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
184 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
185 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
186 :     else
187 :     heading <- ""
188 :    
189 : feinerer 49 # TODO: Check whether <TOPICS></TOPICS> tags are obligatory
190 :     topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
191 :    
192 :     new("PlainTextDocument", .Data = corpus, Cached = 1, Author = author, DateTimeStamp = datetimestamp,
193 :     Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
194 : feinerer 23 }
195 : feinerer 49
196 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
197 :     parseReutersXML<- function(file) {
198 :     new("XMLTextDocument", FileName = file, Cached = 0, Author = "REUTERS", DateTimeStamp = date(),
199 :     Description = "Reuters21578 file containing several news articles", ID = as.integer(0),
200 :     Origin = "Reuters-21578 XML", Heading = "Reuters21578 news articles")
201 :     }
202 :    
203 :     setGeneric("loadFileIntoMem", function(object) standardGeneric("loadFileIntoMem"))
204 :     setMethod("loadFileIntoMem",
205 :     c("XMLTextDocument"),
206 :     function(object) {
207 :     if (object@Cached == 0) {
208 :     file <- object@FileName
209 :     doc <- xmlTreeParse(file)
210 :     class(doc) <- "list"
211 :     object@.Data <- doc
212 :     object@Cached <- 1
213 :     return(object)
214 :     } else {
215 :     return(object)
216 :     }
217 :     })
218 :    
219 :     setGeneric("transformTextDocCol", function(object, FUN, ...) standardGeneric("transformTextDocCol"))
220 :     setMethod("transformTextDocCol",
221 :     c("TextDocCol"),
222 :     function(object, FUN, ...) {
223 :     lapply(object, FUN, ...)
224 :     })
225 :    
226 :     setGeneric("toPlainTextDocument", function(object, FUN, ...) standardGeneric("toPlainTextDocument"))
227 :     setMethod("toPlainTextDocument",
228 :     c("PlainTextDocument"),
229 :     function(object, FUN, ...) {
230 :     return(object)
231 :     })
232 :     setMethod("toPlainTextDocument",
233 :     c("XMLTextDocument"),
234 :     function(object, FUN, ...) {
235 :     if (object@Cached == 0)
236 :     object <- loadFileIntoMem(object)
237 :    
238 :     corpus <- object@.Data
239 :    
240 :     # As XMLDocument is no native S4 class, restore valid information
241 :     class(corpus) <- "XMLDocument"
242 :     names(corpus) <- c("doc","dtd")
243 :    
244 :     return(xmlApply(xmlRoot(corpus), FUN, ...))
245 :     })
246 :    
247 :     setGeneric("stemTextDocument", function(object) standardGeneric("stemTextDocument"))
248 :     setMethod("stemTextDocument",
249 :     c("PlainTextDocument"),
250 :     function(object) {
251 :     require(Rstem)
252 :     splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
253 :     stemmedCorpus <- wordStem(splittedCorpus)
254 :     object@.Data <- paste(stemmedCorpus, collapse = " ")
255 :     return (object)
256 :     })
257 :    
258 :     setGeneric("removeStopWordsInTextDocument", function(object, stopwords) standardGeneric("removeStopWordsInTextDocument"))
259 :     setMethod("removeStopWordsInTextDocument",
260 :     c("PlainTextDocument", "character"),
261 :     function(object, stopwords) {
262 :     require(Rstem)
263 :     splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
264 :     noStopwordsCorpus <- splittedCorpus[!splittedCorpus %in% stopwords]
265 :     object@.Data <- paste(noStopwordsCorpus, collapse = " ")
266 :     return (object)
267 :     })
268 :    
269 :     setGeneric("filterTextDocCol", function(object, FUN, ...) standardGeneric("filterTextDocCol"))
270 :     setMethod("filterTextDocCol",
271 :     c("TextDocCol"),
272 :     function(object, FUN, ...) {
273 :     sapply(object, FUN, ...)
274 :     })
275 :    
276 :     setGeneric("filterREUT21578Topics", function(object, topics, ...) standardGeneric("filterREUT21578Topics"))
277 :     setMethod("filterREUT21578Topics",
278 :     c("PlainTextDocument", "character"),
279 :     function(object, topics, ...) {
280 :     if (object@Cached == 0)
281 :     object <- loadFileIntoMem(object)
282 :    
283 :     if (any(object@LocalMetaData$Topics %in% topics))
284 :     return(TRUE)
285 :     else
286 :     return(FALSE)
287 :     })

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge