SCM Repository
Annotation of /pkg/R/corpus.R
Revision 71 - Original Path: trunk/R/textmin/R/textdoccol.R

# Author: Ingo Feinerer

# The "..." are additional arguments for the function_generator parser
setGeneric("TextDocCol", function(object, parser = plaintext_parser, ...) standardGeneric("TextDocCol"))
setMethod("TextDocCol",
    signature(object = "Source"),
    function(object, parser = plaintext_parser, ...) {
        if (inherits(parser, "function_generator"))
            parser <- parser(...)

        tdl <- list()
        counter <- 1
        while (!eoi(object)) {
            object <- step_next(object)
            elem <- get_elem(object)
            # If there is no Load on Demand support
            # we need to load the corpus into memory at startup
            if (object@LoDSupport)
                load <- object@Load
            else
                load <- TRUE
            tdl <- c(tdl, list(parser(elem, object@LoDSupport, load, as.character(counter))))
            counter <- counter + 1
        }

        dmeta.df <- data.frame(MetaID = rep(0, length(tdl)))
        dcmeta.node <- new("MetaDataNode",
            NodeID = 0,
            MetaData = list(create_date = date(), creator = Sys.getenv("LOGNAME")),
            children = list())

        return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, DCMetaData = dcmeta.node))
    })
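
# Usage sketch (the "texts/" path is hypothetical; it is assumed to be a
# directory of plain text files): build a collection with the default
# plain text parser. CSV and Reuters-21578 XML input work analogously via
# CSVSource() and ReutersSource() together with a matching parser generator.
#   # col <- TextDocCol(DirSource("texts/", load = FALSE))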
34 : | |||
35 : | feinerer | 63 | setGeneric("DirSource", function(directory, load = FALSE) standardGeneric("DirSource")) |
36 : | setMethod("DirSource", | ||
37 : | signature(directory = "character"), | ||
38 : | function(directory, load = FALSE) { | ||
39 : | new("DirSource", LoDSupport = TRUE, FileList = dir(directory, full.names = TRUE), | ||
40 : | Position = 0, Load = load) | ||
41 : | }) | ||
42 : | feinerer | 60 | |
43 : | feinerer | 67 | setGeneric("CSVSource", function(object) standardGeneric("CSVSource")) |
44 : | feinerer | 63 | setMethod("CSVSource", |
45 : | feinerer | 65 | signature(object = "character"), |
46 : | feinerer | 67 | function(object) { |
47 : | object <- substitute(file(object)) | ||
48 : | con <- eval(object) | ||
49 : | feinerer | 65 | content <- scan(con, what = "character") |
50 : | close(con) | ||
51 : | new("CSVSource", LoDSupport = FALSE, URI = object, | ||
52 : | Content = content, Position = 0) | ||
53 : | feinerer | 63 | }) |
54 : | feinerer | 67 | setMethod("CSVSource", |
55 : | signature(object = "ANY"), | ||
56 : | function(object) { | ||
57 : | object <- substitute(object) | ||
58 : | con <- eval(object) | ||
59 : | content <- scan(con, what = "character") | ||
60 : | close(con) | ||
61 : | new("CSVSource", LoDSupport = FALSE, URI = object, | ||
62 : | Content = content, Position = 0) | ||
63 : | }) | ||
64 : | feinerer | 60 | |
65 : | feinerer | 67 | setGeneric("ReutersSource", function(object) standardGeneric("ReutersSource")) |
66 : | feinerer | 65 | setMethod("ReutersSource", |
67 : | signature(object = "character"), | ||
68 : | feinerer | 67 | function(object) { |
69 : | object <- substitute(file(object)) | ||
70 : | con <- eval(object) | ||
71 : | feinerer | 65 | corpus <- paste(readLines(con), "\n", collapse = "") |
72 : | close(con) | ||
73 : | tree <- xmlTreeParse(corpus, asText = TRUE) | ||
74 : | feinerer | 63 | content <- xmlRoot(tree)$children |
75 : | feinerer | 65 | |
76 : | new("ReutersSource", LoDSupport = FALSE, URI = object, | ||
77 : | feinerer | 63 | Content = content, Position = 0) |
78 : | }) | ||
79 : | feinerer | 67 | setMethod("ReutersSource", |
80 : | signature(object = "ANY"), | ||
81 : | function(object) { | ||
82 : | object <- substitute(object) | ||
83 : | con <- eval(object) | ||
84 : | corpus <- paste(readLines(con), "\n", collapse = "") | ||
85 : | close(con) | ||
86 : | tree <- xmlTreeParse(corpus, asText = TRUE) | ||
87 : | content <- xmlRoot(tree)$children | ||
88 : | feinerer | 63 | |
89 : | feinerer | 67 | new("ReutersSource", LoDSupport = FALSE, URI = object, |
90 : | Content = content, Position = 0) | ||
91 : | }) | ||
92 : | |||
93 : | feinerer | 66 | setGeneric("step_next", function(object) standardGeneric("step_next")) |
94 : | setMethod("step_next", | ||
95 : | feinerer | 63 | signature(object = "DirSource"), |
96 : | function(object) { | ||
97 : | object@Position <- object@Position + 1 | ||
98 : | object | ||
99 : | }) | ||
100 : | feinerer | 66 | setMethod("step_next", |
101 : | feinerer | 63 | signature(object = "CSVSource"), |
102 : | function(object) { | ||
103 : | object@Position <- object@Position + 1 | ||
104 : | object | ||
105 : | }) | ||
106 : | feinerer | 66 | setMethod("step_next", |
107 : | feinerer | 65 | signature(object = "ReutersSource"), |
108 : | feinerer | 63 | function(object) { |
109 : | object@Position <- object@Position + 1 | ||
110 : | object | ||
111 : | }) | ||
112 : | |||
113 : | feinerer | 66 | setGeneric("get_elem", function(object) standardGeneric("get_elem")) |
114 : | setMethod("get_elem", | ||
115 : | feinerer | 63 | signature(object = "DirSource"), |
116 : | function(object) { | ||
117 : | feinerer | 67 | filename <- object@FileList[object@Position] |
118 : | feinerer | 63 | list(content = readLines(object@FileList[object@Position]), |
119 : | feinerer | 67 | uri = substitute(file(filename))) |
120 : | feinerer | 63 | }) |
121 : | feinerer | 66 | setMethod("get_elem", |
122 : | feinerer | 63 | signature(object = "CSVSource"), |
123 : | function(object) { | ||
124 : | list(content = object@Content[object@Position], | ||
125 : | feinerer | 65 | uri = object@URI) |
126 : | feinerer | 63 | }) |
127 : | feinerer | 66 | setMethod("get_elem", |
128 : | feinerer | 65 | signature(object = "ReutersSource"), |
129 : | feinerer | 63 | function(object) { |
130 : | feinerer | 65 | # Construct a character representation from the XMLNode |
131 : | con <- textConnection("virtual.file", "w") | ||
132 : | saveXML(object@Content[[object@Position]], con) | ||
133 : | close(con) | ||
134 : | |||
135 : | list(content = virtual.file, uri = object@URI) | ||
136 : | feinerer | 63 | }) |
137 : | |||
138 : | setGeneric("eoi", function(object) standardGeneric("eoi")) | ||
139 : | setMethod("eoi", | ||
140 : | signature(object = "DirSource"), | ||
141 : | function(object) { | ||
142 : | if (length(object@FileList) <= object@Position) | ||
143 : | return(TRUE) | ||
144 : | else | ||
145 : | return(FALSE) | ||
146 : | }) | ||
147 : | setMethod("eoi", | ||
148 : | signature(object = "CSVSource"), | ||
149 : | function(object) { | ||
150 : | if (length(object@Content) <= object@Position) | ||
151 : | return(TRUE) | ||
152 : | else | ||
153 : | return(FALSE) | ||
154 : | }) | ||
155 : | setMethod("eoi", | ||
156 : | feinerer | 65 | signature(object = "ReutersSource"), |
157 : | feinerer | 63 | function(object) { |
158 : | if (length(object@Content) <= object@Position) | ||
159 : | return(TRUE) | ||
160 : | else | ||
161 : | return(FALSE) | ||
162 : | }) | ||
163 : | |||
164 : | plaintext_parser <- function(...) { | ||
165 : | function(elem, lodsupport, load, id) { | ||
166 : | if (!lodsupport || (lodsupport && load)) { | ||
167 : | feinerer | 65 | doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE, |
168 : | feinerer | 63 | Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "") |
169 : | } | ||
170 : | else { | ||
171 : | feinerer | 65 | doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE, |
172 : | feinerer | 63 | Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "") |
173 : | } | ||
174 : | |||
175 : | return(doc) | ||
176 : | feinerer | 60 | } |
177 : | } | ||
178 : | feinerer | 63 | class(plaintext_parser) <- "function_generator" |
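
# Sketch of the function_generator mechanism (illustration only, the "src"
# object below is hypothetical): TextDocCol() sees that plaintext_parser
# inherits from "function_generator", calls it once with the "..." arguments
# to obtain the element-level parser, and then applies that parser to each
# element delivered by the source.
#   # p <- plaintext_parser()
#   # doc <- p(get_elem(step_next(src)), src@LoDSupport, TRUE, "1")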

reut21578xml_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        corpus <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(corpus, asText = TRUE)
        node <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
        if (!is.null(node[["TEXT"]][["AUTHOR"]]))
            author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
        else
            author <- ""

        datetimestamp <- xmlValue(node[["DATE"]])
        description <- ""
        id <- xmlAttrs(node)[["NEWID"]]

        # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
        if (!is.null(node[["TEXT"]][["TITLE"]]))
            heading <- xmlValue(node[["TEXT"]][["TITLE"]])
        else
            heading <- ""

        topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)

        if (!lodsupport || (lodsupport && load)) {
            doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
                DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
                Heading = heading, LocalMetaData = list(Topics = topics))
        } else {
            doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
                DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
                Heading = heading, LocalMetaData = list(Topics = topics))
        }

        return(doc)
    }
}
class(reut21578xml_parser) <- "function_generator"

rcv1_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        corpus <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(corpus, asText = TRUE)
        node <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        datetimestamp <- xmlAttrs(node)[["date"]]
        id <- xmlAttrs(node)[["itemid"]]
        heading <- xmlValue(node[["title"]])

        if (!lodsupport || (lodsupport && load)) {
            doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
                DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
                Heading = heading)
        } else {
            doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
                DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
                Heading = heading)
        }

        return(doc)
    }
}
class(rcv1_parser) <- "function_generator"

newsgroup_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        mail <- elem$content
        author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
        datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))
        origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
        heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
        newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))

        if (!lodsupport || (lodsupport && load)) {
            # The header is separated from the body by a blank line.
            # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
            for (index in seq(along = mail)) {
                if (mail[index] == "")
                    break
            }
            content <- mail[(index + 1):length(mail)]

            doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
                Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin,
                Heading = heading, Newsgroup = newsgroup)
        } else {
            doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
        }

        return(doc)
    }
}
class(newsgroup_parser) <- "function_generator"

# Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
rcv1_to_plain <- function(node, ...) {
    datetimestamp <- xmlAttrs(node)[["date"]]
    id <- xmlAttrs(node)[["itemid"]]
    origin <- "Reuters Corpus Volume 1 XML"
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
    heading <- xmlValue(node[["title"]])

    new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,
        Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)
}

# Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
reut21578xml_to_plain <- function(node, ...) {
    # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["AUTHOR"]]))
        author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
    else
        author <- ""

    datetimestamp <- xmlValue(node[["DATE"]])
    description <- ""
    id <- xmlAttrs(node)[["NEWID"]]

    origin <- "Reuters-21578 XML"

    # The <BODY></BODY> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["BODY"]]))
        corpus <- xmlValue(node[["TEXT"]][["BODY"]])
    else
        corpus <- ""

    # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["TITLE"]]))
        heading <- xmlValue(node[["TEXT"]][["TITLE"]])
    else
        heading <- ""

    topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)

    new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
        Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
}

setGeneric("load_doc", function(object, ...) standardGeneric("load_doc"))
setMethod("load_doc",
    signature(object = "PlainTextDocument"),
    function(object, ...) {
        if (!Cached(object)) {
            con <- eval(URI(object))
            corpus <- readLines(con)
            close(con)
            Corpus(object) <- corpus
            Cached(object) <- TRUE
            return(object)
        } else {
            return(object)
        }
    })
setMethod("load_doc",
    signature(object = "XMLTextDocument"),
    function(object, ...) {
        if (!Cached(object)) {
            con <- eval(URI(object))
            corpus <- paste(readLines(con), "\n", collapse = "")
            close(con)
            doc <- xmlTreeParse(corpus, asText = TRUE)
            class(doc) <- "list"
            Corpus(object) <- doc
            Cached(object) <- TRUE
            return(object)
        } else {
            return(object)
        }
    })
setMethod("load_doc",
    signature(object = "NewsgroupDocument"),
    function(object, ...) {
        if (!Cached(object)) {
            con <- eval(URI(object))
            mail <- readLines(con)
            close(con)
            Cached(object) <- TRUE
            for (index in seq(along = mail)) {
                if (mail[index] == "")
                    break
            }
            Corpus(object) <- mail[(index + 1):length(mail)]
            return(object)
        } else {
            return(object)
        }
    })

setGeneric("tm_transform", function(object, FUN, ...) standardGeneric("tm_transform"))
setMethod("tm_transform",
    signature(object = "TextDocCol", FUN = "function"),
    function(object, FUN, ...) {
        result <- as(lapply(object, FUN, ..., DMetaData = DMetaData(object)), "TextDocCol")
        result@DMetaData <- DMetaData(object)
        return(result)
    })

setGeneric("as.plaintext_doc", function(object, FUN, ...) standardGeneric("as.plaintext_doc"))
setMethod("as.plaintext_doc",
    signature(object = "PlainTextDocument"),
    function(object, FUN, ...) {
        return(object)
    })
setMethod("as.plaintext_doc",
    signature(object = "XMLTextDocument", FUN = "function"),
    function(object, FUN, ...) {
        if (!Cached(object))
            object <- load_doc(object)

        corpus <- Corpus(object)

        # As XMLDocument is not a native S4 class, restore valid information
        class(corpus) <- "XMLDocument"
        names(corpus) <- c("doc", "dtd")

        return(FUN(xmlRoot(corpus), ...))
    })
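
# Conversion sketch (the "xmldoc" object is hypothetical): an XMLTextDocument
# holding a Reuters-21578 article can be turned into a PlainTextDocument by
# handing the matching converter to as.plaintext_doc(), which applies it to
# the document's XML root node.
#   # plain <- as.plaintext_doc(xmldoc, FUN = reut21578xml_to_plain)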
405 : | |||
406 : | feinerer | 67 | setGeneric("tm_tolower", function(object, ...) standardGeneric("tm_tolower")) |
407 : | setMethod("tm_tolower", | ||
408 : | signature(object = "PlainTextDocument"), | ||
409 : | function(object, ...) { | ||
410 : | if (!Cached(object)) | ||
411 : | object <- load_doc(object) | ||
412 : | |||
413 : | Corpus(object) <- tolower(object) | ||
414 : | return(object) | ||
415 : | }) | ||
416 : | |||
417 : | setGeneric("strip_whitespace", function(object, ...) standardGeneric("strip_whitespace")) | ||
418 : | setMethod("strip_whitespace", | ||
419 : | signature(object = "PlainTextDocument"), | ||
420 : | function(object, ...) { | ||
421 : | if (!Cached(object)) | ||
422 : | object <- load_doc(object) | ||
423 : | |||
424 : | Corpus(object) <- gsub("[[:space:]]+", " ", object) | ||
425 : | return(object) | ||
426 : | }) | ||
427 : | |||
428 : | feinerer | 66 | setGeneric("stem_doc", function(object, ...) standardGeneric("stem_doc")) |
429 : | setMethod("stem_doc", | ||
430 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
431 : | feinerer | 61 | function(object, ...) { |
432 : | if (!Cached(object)) | ||
433 : | feinerer | 66 | object <- load_doc(object) |
434 : | feinerer | 56 | |
435 : | feinerer | 49 | require(Rstem) |
436 : | splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE)) | ||
437 : | feinerer | 67 | stemmedCorpus <- wordStem(splittedCorpus) |
438 : | feinerer | 56 | Corpus(object) <- paste(stemmedCorpus, collapse = " ") |
439 : | return(object) | ||
440 : | feinerer | 49 | }) |
441 : | |||
442 : | feinerer | 66 | setGeneric("remove_words", function(object, stopwords, ...) standardGeneric("remove_words")) |
443 : | setMethod("remove_words", | ||
444 : | feinerer | 60 | signature(object = "PlainTextDocument", stopwords = "character"), |
445 : | feinerer | 61 | function(object, stopwords, ...) { |
446 : | if (!Cached(object)) | ||
447 : | feinerer | 66 | object <- load_doc(object) |
448 : | feinerer | 56 | |
449 : | feinerer | 49 | require(Rstem) |
450 : | splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE)) | ||
451 : | noStopwordsCorpus <- splittedCorpus[!splittedCorpus %in% stopwords] | ||
452 : | feinerer | 56 | Corpus(object) <- paste(noStopwordsCorpus, collapse = " ") |
453 : | return(object) | ||
454 : | feinerer | 49 | }) |
455 : | |||
456 : | feinerer | 64 | setGeneric("tm_filter", function(object, ..., FUN = s_filter) standardGeneric("tm_filter")) |
457 : | feinerer | 54 | setMethod("tm_filter", |
458 : | feinerer | 61 | signature(object = "TextDocCol"), |
459 : | feinerer | 64 | function(object, ..., FUN = s_filter) { |
460 : | feinerer | 71 | indices <- sapply(object, FUN, ..., DMetaData = DMetaData(object)) |
461 : | feinerer | 68 | object[indices] |
462 : | feinerer | 61 | }) |
463 : | |||
464 : | feinerer | 63 | setGeneric("tm_index", function(object, ..., FUN = s_filter) standardGeneric("tm_index")) |
465 : | feinerer | 61 | setMethod("tm_index", |
466 : | signature(object = "TextDocCol"), | ||
467 : | feinerer | 64 | function(object, ..., FUN = s_filter) { |
468 : | feinerer | 71 | sapply(object, FUN, ..., DMetaData = DMetaData(object)) |
469 : | feinerer | 49 | }) |
470 : | |||
471 : | feinerer | 71 | s_filter <- function(object, s, ..., DMetaData) { |
472 : | feinerer | 61 | b <- TRUE |
473 : | for (tag in names(s)) { | ||
474 : | if (tag %in% names(LocalMetaData(object))) { | ||
475 : | b <- b && any(grep(s[[tag]], LocalMetaData(object)[[tag]])) | ||
476 : | feinerer | 71 | } else if (tag %in% names(DMetaData)){ |
477 : | b <- b && any(grep(s[[tag]], DMetaData[[tag]])) | ||
478 : | feinerer | 61 | } else { |
479 : | b <- b && any(grep(s[[tag]], eval(call(tag, object)))) | ||
480 : | } | ||
481 : | } | ||
482 : | return(b) | ||
483 : | } | ||
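
# Filtering sketch (the collection "col" is hypothetical): s_filter() matches
# each tag-value pair of the search list "s" against a document's local
# metadata, the collection's DMetaData, or an accessor function of the same
# name, so a query can mix those sources.
#   # reuters.docs <- tm_filter(col, s = list(Origin = "Reuters-21578 XML"))
#   # hits <- tm_index(col, s = list(Origin = "Reuters-21578 XML"))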
484 : | |||
485 : | feinerer | 63 | setGeneric("fulltext_search_filter", function(object, pattern, ...) standardGeneric("fulltext_search_filter")) |
486 : | setMethod("fulltext_search_filter", | ||
487 : | feinerer | 61 | signature(object = "PlainTextDocument", pattern = "character"), |
488 : | function(object, pattern, ...) { | ||
489 : | if (!Cached(object)) | ||
490 : | feinerer | 66 | object <- load_doc(object) |
491 : | feinerer | 61 | |
492 : | return(any(grep(pattern, Corpus(object)))) | ||
493 : | }) | ||
494 : | |||
495 : | feinerer | 66 | setGeneric("attach_data", function(object, data) standardGeneric("attach_data")) |
496 : | feinerer | 71 | setGeneric("attach_metadata", function(object, name, metadata) standardGeneric("attach_metadata")) |
497 : | |||
498 : | setGeneric("append_doc", function(object, data, meta) standardGeneric("append_doc")) | ||
499 : | setMethod("append_doc", | ||
500 : | signature(object = "TextDocCol", data = "TextDocument", meta = "list"), | ||
501 : | function(object, data, meta) { | ||
502 : | object@.Data <- c(object@.Data, list(data)) | ||
503 : | object@DMetaData <- rbind(object@DMetaData, c(MetaID = DCMetaData(object)@NodeID, meta)) | ||
504 : | feinerer | 52 | return(object) |
505 : | }) | ||
506 : | |||
507 : | feinerer | 71 | setGeneric("append_meta", function(object, dcmeta, dmeta) standardGeneric("append_meta")) |
508 : | setMethod("append_meta", | ||
509 : | signature(object = "TextDocCol", dcmeta = "list", dmeta = "list"), | ||
510 : | function(object, dcmeta, dmeta) { | ||
511 : | object@DCMetaData@MetaData <- c(object@DCMetaData@MetaData, dcmeta) | ||
512 : | object@DMetaData <- cbind(object@DMetaData, dmeta) | ||
513 : | feinerer | 52 | return(object) |
514 : | }) | ||
515 : | feinerer | 53 | |
516 : | feinerer | 69 | setGeneric("remove_metadata", function(object, name) standardGeneric("remove_metadata")) |
517 : | feinerer | 71 | #setMethod("remove_metadata", |
518 : | # signature(object = "TextDocCol"), | ||
519 : | # function(object, name) { | ||
520 : | # object@DMetaData <- DMetaData(object)[names(DMetaData(object)) != name] | ||
521 : | # return(object) | ||
522 : | # }) | ||
523 : | feinerer | 69 | |
524 : | setGeneric("modify_metadata", function(object, name, metadata) standardGeneric("modify_metadata")) | ||
525 : | feinerer | 71 | #setMethod("modify_metadata", |
526 : | # signature(object = "TextDocCol"), | ||
527 : | # function(object, name, metadata) { | ||
528 : | # object@DMetaData[[name]] <- metadata | ||
529 : | # return(object) | ||
530 : | # }) | ||
531 : | feinerer | 69 | |
532 : | feinerer | 53 | setMethod("[", |
533 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", drop = "ANY"), | ||
534 : | function(x, i, j, ... , drop) { | ||
535 : | if(missing(i)) | ||
536 : | return(x) | ||
537 : | |||
538 : | object <- x | ||
539 : | object@.Data <- x@.Data[i, ..., drop = FALSE] | ||
540 : | feinerer | 71 | object@DMetaData <- DMetaData(object)[i, ] |
541 : | feinerer | 53 | return(object) |
542 : | }) | ||
543 : | |||
544 : | feinerer | 63 | setMethod("[<-", |
545 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"), | ||
546 : | function(x, i, j, ... , value) { | ||
547 : | object <- x | ||
548 : | object@.Data[i, ...] <- value | ||
549 : | return(object) | ||
550 : | }) | ||
551 : | |||
552 : | setMethod("[[", | ||
553 : | signature(x = "TextDocCol", i = "ANY", j = "ANY"), | ||
554 : | function(x, i, j, ...) { | ||
555 : | return(x@.Data[[i, ...]]) | ||
556 : | }) | ||
557 : | |||
558 : | setMethod("[[<-", | ||
559 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"), | ||
560 : | function(x, i, j, ..., value) { | ||
561 : | object <- x | ||
562 : | object@.Data[[i, ...]] <- value | ||
563 : | return(object) | ||
564 : | }) | ||
565 : | |||
566 : | feinerer | 71 | # Update \code{NodeID}s of a DCMetaData tree |
567 : | # TODO: Avoid global variables outside of update_id function | ||
568 : | update_id <- function(object) { | ||
569 : | id <<- 0 | ||
570 : | mapping <<- left.mapping <<- NULL | ||
571 : | level <<- 0 | ||
572 : | return(list(root = set_id(object), left.mapping = left.mapping, right.mapping = mapping)) | ||
573 : | } | ||
574 : | |||
575 : | # Traversal of (binary) DCMetaData tree with setup of \code{NodeID}s | ||
576 : | set_id <- function(object) { | ||
577 : | object@NodeID <- id | ||
578 : | id <<- id + 1 | ||
579 : | level <<- level + 1 | ||
580 : | |||
581 : | if (length(object@children) > 0) { | ||
582 : | mapping <<- cbind(mapping, c(object@children[[1]]@NodeID, id)) | ||
583 : | left <- set_id(object@children[[1]]) | ||
584 : | if (level == 1) { | ||
585 : | left.mapping <<- mapping | ||
586 : | mapping <<- NULL | ||
587 : | } | ||
588 : | mapping <<- cbind(mapping, c(object@children[[2]]@NodeID, id)) | ||
589 : | right <- set_id(object@children[[2]]) | ||
590 : | |||
591 : | object@children <- list(left, right) | ||
592 : | } | ||
593 : | level <<- level - 1 | ||
594 : | |||
595 : | return(object) | ||
596 : | } | ||
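
# Renumbering sketch, derived from reading update_id()/set_id(): when two
# collections are merged via c() below, the fresh root MetaDataNode receives
# NodeID 0, the root of the first operand's tree receives NodeID 1, and the
# root of the second operand's tree receives NodeID 2. left.mapping and
# right.mapping record the old-to-new NodeID pairs so that the MetaID column
# of the merged DMetaData data frame can be rewritten accordingly.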
597 : | |||
598 : | feinerer | 53 | setMethod("c", |
599 : | signature(x = "TextDocCol"), | ||
600 : | feinerer | 71 | function(x, y, ..., meta = list(merge_date = date(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) { |
601 : | if (!inherits(y, "TextDocCol")) | ||
602 : | stop("invalid argument") | ||
603 : | |||
604 : | object <- x | ||
605 : | # Concatenate data slots | ||
606 : | object@.Data <- c(as(x, "list"), as(y, "list")) | ||
607 : | |||
608 : | # Update the DCMetaData tree | ||
609 : | dcmeta <- new("MetaDataNode", NodeID = 0, MetaData = meta, children = list(DCMetaData(x), DCMetaData(y))) | ||
610 : | update.struct <- update_id(dcmeta) | ||
611 : | object@DCMetaData <- update.struct$root | ||
612 : | |||
613 : | # Find indices to be updated for the left tree | ||
614 : | indices.mapping <- NULL | ||
615 : | for (m in levels(as.factor(DMetaData(x)$MetaID))) { | ||
616 : | indices <- (DMetaData(x)$MetaID == m) | ||
617 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
618 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
619 : | } | ||
620 : | |||
621 : | # Update the DMetaData data frames for the left tree | ||
622 : | for (i in 1:ncol(update.struct$left.mapping)) { | ||
623 : | map <- update.struct$left.mapping[,i] | ||
624 : | x@DMetaData$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
625 : | } | ||
626 : | |||
627 : | # Find indices to be updated for the right tree | ||
628 : | indices.mapping <- NULL | ||
629 : | for (m in levels(as.factor(DMetaData(y)$MetaID))) { | ||
630 : | indices <- (DMetaData(y)$MetaID == m) | ||
631 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
632 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
633 : | } | ||
634 : | |||
635 : | # Update the DMetaData data frames for the right tree | ||
636 : | for (i in 1:ncol(update.struct$right.mapping)) { | ||
637 : | map <- update.struct$right.mapping[,i] | ||
638 : | y@DMetaData$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
639 : | } | ||
640 : | |||
641 : | # Merge the DMetaData data frames | ||
642 : | labels <- setdiff(names(DMetaData(y)), names(DMetaData(x))) | ||
643 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels)) | ||
644 : | x.dmeta.aug <- cbind(DMetaData(x), na.matrix) | ||
645 : | labels <- setdiff(names(DMetaData(x)), names(DMetaData(y))) | ||
646 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels)) | ||
647 : | y.dmeta.aug <- cbind(DMetaData(y), na.matrix) | ||
648 : | object@DMetaData <- rbind(x.dmeta.aug, y.dmeta.aug) | ||
649 : | |||
650 : | return(object) | ||
651 : | feinerer | 53 | }) |
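
# Concatenation sketch (the collections "col1" and "col2" are hypothetical):
# merging keeps both document-collection metadata trees as children of a new
# root node and fills missing DMetaData columns of either operand with NA.
#   # col <- c(col1, col2)
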
#setMethod("c",
#    signature(x = "TextDocument"),
#    function(x, ..., recursive = TRUE) {
#        args <- list(...)
#        if (length(args) == 0)
#            return(x)
#        return(new("TextDocCol", .Data = list(x, ...)))
#    })

setMethod("length",
    signature(x = "TextDocCol"),
    function(x) {
        return(length(as(x, "list")))
    })

setMethod("show",
    signature(object = "TextDocCol"),
    function(object) {
        cat(sprintf(ngettext(length(object),
            "A text document collection with %d text document\n",
            "A text document collection with %d text documents\n"),
            length(object)))
    })

setMethod("summary",
    signature(object = "TextDocCol"),
    function(object) {
        show(object)
        if (length(DMetaData(object)) > 0) {
            cat(sprintf(ngettext(length(DMetaData(object)),
                "\nThe global metadata consists of %d tag-value pair\n",
                "\nThe global metadata consists of %d tag-value pairs\n"),
                length(DMetaData(object))))
            cat("Available tags are:\n")
            cat(names(DMetaData(object)), "\n")
        }
    })

setGeneric("inspect", function(object) standardGeneric("inspect"))
setMethod("inspect",
    signature("TextDocCol"),
    function(object) {
        summary(object)
        cat("\n")
        show(as(object, "list"))
    })

# No metadata is checked
setGeneric("%IN%", function(x, y) standardGeneric("%IN%"))
setMethod("%IN%",
    signature(x = "TextDocument", y = "TextDocCol"),
    function(x, y) {
        x %in% y
    })