SCM Repository
Annotation of /pkg/R/corpus.R
Parent Directory
|
Revision Log
Revision 725 -
(view)
(download)
Original Path: trunk/tm/R/textdoccol.R
1 : | feinerer | 17 | # Author: Ingo Feinerer |
2 : | |||
3 : | feinerer | 722 | # The "..." are additional arguments for the FunctionGenerator reader |
4 : | feinerer | 712 | setGeneric("TextDocCol", function(object, |
5 : | feinerer | 723 | readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE), |
6 : | feinerer | 712 | dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"), |
7 : | ...) standardGeneric("TextDocCol")) | ||
8 : | feinerer | 49 | setMethod("TextDocCol", |
9 : | feinerer | 63 | signature(object = "Source"), |
10 : | feinerer | 717 | function(object, |
11 : | feinerer | 723 | readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE), |
12 : | feinerer | 717 | dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"), |
13 : | ...) { | ||
14 : | feinerer | 722 | if (attr(readerControl$reader, "FunctionGenerator")) |
15 : | readerControl$reader <- readerControl$reader(...) | ||
16 : | feinerer | 63 | |
17 : | feinerer | 712 | if (dbControl$useDb) { |
18 : | if (!dbCreate(dbControl$dbName, dbControl$dbType)) | ||
19 : | stop("error in creating database") | ||
20 : | db <- dbInit(dbControl$dbName, dbControl$dbType) | ||
21 : | } | ||
22 : | |||
23 : | feinerer | 63 | tdl <- list() |
24 : | counter <- 1 | ||
25 : | while (!eoi(object)) { | ||
26 : | feinerer | 698 | object <- stepNext(object) |
27 : | elem <- getElem(object) | ||
28 : | feinerer | 63 | # If there is no Load on Demand support |
29 : | # we need to load the corpus into memory at startup | ||
30 : | feinerer | 694 | if (!object@LoDSupport) |
31 : | feinerer | 722 | readerControl$load <- TRUE |
32 : | doc <- readerControl$reader(elem, readerControl$load, readerControl$language, as.character(counter)) | ||
33 : | feinerer | 712 | if (dbControl$useDb) { |
34 : | dbInsert(db, ID(doc), doc) | ||
35 : | tdl <- c(tdl, ID(doc)) | ||
36 : | } | ||
37 : | else | ||
38 : | tdl <- c(tdl, list(doc)) | ||
39 : | feinerer | 63 | counter <- counter + 1 |
40 : | } | ||
41 : | |||
42 : | feinerer | 712 | df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
43 : | if (dbControl$useDb) { | ||
44 : | dbInsert(db, "DMetaData", df) | ||
45 : | feinerer | 724 | dmeta.df <- data.frame(key = "DMetaData", subset = NA) |
46 : | feinerer | 712 | dbDisconnect(db) |
47 : | } | ||
48 : | else | ||
49 : | dmeta.df <- df | ||
50 : | |||
51 : | feinerer | 698 | cmeta.node <- new("MetaDataNode", |
52 : | feinerer | 71 | NodeID = 0, |
53 : | feinerer | 689 | MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")), |
54 : | feinerer | 71 | children = list()) |
55 : | |||
56 : | feinerer | 712 | return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, CMetaData = cmeta.node, DBControl = dbControl)) |
57 : | feinerer | 21 | }) |
58 : | |||
59 : | feinerer | 698 | setGeneric("loadDoc", function(object, ...) standardGeneric("loadDoc")) |
60 : | setMethod("loadDoc", | ||
61 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
62 : | feinerer | 65 | function(object, ...) { |
63 : | feinerer | 61 | if (!Cached(object)) { |
64 : | feinerer | 67 | con <- eval(URI(object)) |
65 : | feinerer | 65 | corpus <- readLines(con) |
66 : | close(con) | ||
67 : | feinerer | 56 | Corpus(object) <- corpus |
68 : | feinerer | 60 | Cached(object) <- TRUE |
69 : | feinerer | 56 | return(object) |
70 : | } else { | ||
71 : | return(object) | ||
72 : | } | ||
73 : | }) | ||
74 : | feinerer | 698 | setMethod("loadDoc", |
75 : | feinerer | 62 | signature(object = "XMLTextDocument"), |
76 : | feinerer | 65 | function(object, ...) { |
77 : | feinerer | 61 | if (!Cached(object)) { |
78 : | feinerer | 67 | con <- eval(URI(object)) |
79 : | feinerer | 65 | corpus <- paste(readLines(con), "\n", collapse = "") |
80 : | close(con) | ||
81 : | doc <- xmlTreeParse(corpus, asText = TRUE) | ||
82 : | feinerer | 49 | class(doc) <- "list" |
83 : | feinerer | 56 | Corpus(object) <- doc |
84 : | feinerer | 60 | Cached(object) <- TRUE |
85 : | feinerer | 49 | return(object) |
86 : | } else { | ||
87 : | return(object) | ||
88 : | } | ||
89 : | }) | ||
90 : | feinerer | 698 | setMethod("loadDoc", |
91 : | feinerer | 62 | signature(object = "NewsgroupDocument"), |
92 : | feinerer | 65 | function(object, ...) { |
93 : | feinerer | 61 | if (!Cached(object)) { |
94 : | feinerer | 67 | con <- eval(URI(object)) |
95 : | feinerer | 65 | mail <- readLines(con) |
96 : | close(con) | ||
97 : | feinerer | 60 | Cached(object) <- TRUE |
98 : | feinerer | 65 | for (index in seq(along = mail)) { |
99 : | if (mail[index] == "") | ||
100 : | break | ||
101 : | } | ||
102 : | feinerer | 56 | Corpus(object) <- mail[(index + 1):length(mail)] |
103 : | return(object) | ||
104 : | } else { | ||
105 : | return(object) | ||
106 : | } | ||
107 : | }) | ||
108 : | feinerer | 49 | |
109 : | feinerer | 719 | setGeneric("tmUpdate", function(object, |
110 : | origin, | ||
111 : | feinerer | 723 | readerControl = list(reader = origin@DefaultReader, language = "en_US", load = FALSE), |
112 : | feinerer | 719 | ...) standardGeneric("tmUpdate")) |
113 : | feinerer | 72 | # Update is only supported for directories |
114 : | # At the moment no other LoD devices are available anyway | ||
115 : | feinerer | 698 | setMethod("tmUpdate", |
116 : | feinerer | 72 | signature(object = "TextDocCol", origin = "DirSource"), |
117 : | feinerer | 719 | function(object, origin, |
118 : | feinerer | 723 | readerControl = list(reader = origin@DefaultReader, language = "en_US", load = FALSE), |
119 : | feinerer | 719 | ...) { |
120 : | feinerer | 722 | if (inherits(readerControl$reader, "FunctionGenerator")) |
121 : | readerControl$reader <- readerControl$reader(...) | ||
122 : | feinerer | 72 | |
123 : | object.filelist <- unlist(lapply(object, function(x) {as.character(URI(x))[2]})) | ||
124 : | new.files <- setdiff(origin@FileList, object.filelist) | ||
125 : | |||
126 : | for (filename in new.files) { | ||
127 : | elem <- list(content = readLines(filename), | ||
128 : | uri = substitute(file(filename))) | ||
129 : | feinerer | 722 | object <- appendElem(object, readerControl$reader(elem, readerControl$load, readerControl$language, filename)) |
130 : | feinerer | 72 | } |
131 : | |||
132 : | return(object) | ||
133 : | }) | ||
134 : | |||
135 : | feinerer | 698 | setGeneric("tmMap", function(object, FUN, ...) standardGeneric("tmMap")) |
136 : | setMethod("tmMap", | ||
137 : | feinerer | 62 | signature(object = "TextDocCol", FUN = "function"), |
138 : | feinerer | 49 | function(object, FUN, ...) { |
139 : | feinerer | 73 | result <- object |
140 : | feinerer | 689 | # Note that text corpora are automatically loaded into memory via \code{[[} |
141 : | feinerer | 719 | if (DBControl(object)[["useDb"]]) { |
142 : | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) | ||
143 : | new <- lapply(object, FUN, ..., DMetaData = DMetaData(object)) | ||
144 : | ids <- lapply(object, ID) | ||
145 : | # Avoidance of explicit loop is probably more efficient | ||
146 : | for (i in length(new)) { | ||
147 : | db[[ids[i]]] <- new[[i]] | ||
148 : | } | ||
149 : | dbDisconnect(db) | ||
150 : | } | ||
151 : | else | ||
152 : | result@.Data <- lapply(object, FUN, ..., DMetaData = DMetaData(object)) | ||
153 : | feinerer | 56 | return(result) |
154 : | feinerer | 49 | }) |
155 : | |||
156 : | feinerer | 698 | setGeneric("asPlain", function(object, FUN, ...) standardGeneric("asPlain")) |
157 : | setMethod("asPlain", | ||
158 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
159 : | feinerer | 49 | function(object, FUN, ...) { |
160 : | return(object) | ||
161 : | }) | ||
162 : | feinerer | 698 | setMethod("asPlain", |
163 : | feinerer | 62 | signature(object = "XMLTextDocument", FUN = "function"), |
164 : | feinerer | 49 | function(object, FUN, ...) { |
165 : | feinerer | 56 | corpus <- Corpus(object) |
166 : | feinerer | 49 | |
167 : | # As XMLDocument is no native S4 class, restore valid information | ||
168 : | class(corpus) <- "XMLDocument" | ||
169 : | names(corpus) <- c("doc","dtd") | ||
170 : | |||
171 : | feinerer | 61 | return(FUN(xmlRoot(corpus), ...)) |
172 : | feinerer | 49 | }) |
173 : | feinerer | 725 | setMethod("asPlain", |
174 : | signature(object = "NewsgroupDocument"), | ||
175 : | function(object, FUN, ...) { | ||
176 : | new("PlainTextDocument", .Data = Corpus(object), Cached = TRUE, URI = "", Author = Author(object), | ||
177 : | DateTimeStamp = DateTimeStamp(object), Description = Description(object), ID = ID(object), | ||
178 : | Origin = Origin(object), Heading = Heading(object), Language = Language(object)) | ||
179 : | }) | ||
180 : | feinerer | 49 | |
181 : | feinerer | 698 | setGeneric("tmTolower", function(object, ...) standardGeneric("tmTolower")) |
182 : | setMethod("tmTolower", | ||
183 : | feinerer | 67 | signature(object = "PlainTextDocument"), |
184 : | function(object, ...) { | ||
185 : | Corpus(object) <- tolower(object) | ||
186 : | return(object) | ||
187 : | }) | ||
188 : | |||
189 : | feinerer | 698 | setGeneric("stripWhitespace", function(object, ...) standardGeneric("stripWhitespace")) |
190 : | setMethod("stripWhitespace", | ||
191 : | feinerer | 67 | signature(object = "PlainTextDocument"), |
192 : | function(object, ...) { | ||
193 : | Corpus(object) <- gsub("[[:space:]]+", " ", object) | ||
194 : | return(object) | ||
195 : | }) | ||
196 : | |||
197 : | feinerer | 713 | setGeneric("stemDoc", function(object, language = "english", ...) standardGeneric("stemDoc")) |
198 : | feinerer | 698 | setMethod("stemDoc", |
199 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
200 : | feinerer | 713 | function(object, language = "english", ...) { |
201 : | feinerer | 49 | splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE)) |
202 : | feinerer | 713 | stemmedCorpus <- if (require("Rstem")) |
203 : | Rstem::wordStem(splittedCorpus, language) | ||
204 : | else | ||
205 : | SnowballStemmer(splittedCorpus, Weka_control(S = language)) | ||
206 : | feinerer | 56 | Corpus(object) <- paste(stemmedCorpus, collapse = " ") |
207 : | return(object) | ||
208 : | feinerer | 49 | }) |
209 : | |||
210 : | feinerer | 722 | setGeneric("removePunctuation", function(object, ...) standardGeneric("removePunctuation")) |
211 : | setMethod("removePunctuation", | ||
212 : | signature(object = "PlainTextDocument"), | ||
213 : | function(object, ...) { | ||
214 : | Corpus(object) <- gsub("[[:punct:]]+", "", Corpus(object)) | ||
215 : | return(object) | ||
216 : | }) | ||
217 : | |||
218 : | feinerer | 698 | setGeneric("removeWords", function(object, stopwords, ...) standardGeneric("removeWords")) |
219 : | setMethod("removeWords", | ||
220 : | feinerer | 60 | signature(object = "PlainTextDocument", stopwords = "character"), |
221 : | feinerer | 61 | function(object, stopwords, ...) { |
222 : | feinerer | 49 | splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE)) |
223 : | noStopwordsCorpus <- splittedCorpus[!splittedCorpus %in% stopwords] | ||
224 : | feinerer | 56 | Corpus(object) <- paste(noStopwordsCorpus, collapse = " ") |
225 : | return(object) | ||
226 : | feinerer | 49 | }) |
227 : | |||
228 : | feinerer | 698 | setGeneric("tmFilter", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmFilter")) |
229 : | setMethod("tmFilter", | ||
230 : | feinerer | 61 | signature(object = "TextDocCol"), |
231 : | feinerer | 698 | function(object, ..., FUN = sFilter, doclevel = FALSE) { |
232 : | feinerer | 73 | if (doclevel) |
233 : | return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))]) | ||
234 : | else | ||
235 : | feinerer | 724 | return(object[FUN(object, ...)]) |
236 : | feinerer | 61 | }) |
237 : | |||
238 : | feinerer | 698 | setGeneric("tmIndex", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmIndex")) |
239 : | setMethod("tmIndex", | ||
240 : | feinerer | 61 | signature(object = "TextDocCol"), |
241 : | feinerer | 698 | function(object, ..., FUN = sFilter, doclevel = FALSE) { |
242 : | feinerer | 73 | if (doclevel) |
243 : | return(sapply(object, FUN, ..., DMetaData = DMetaData(object))) | ||
244 : | else | ||
245 : | feinerer | 724 | return(FUN(object, ...)) |
246 : | feinerer | 49 | }) |
247 : | |||
248 : | feinerer | 698 | sFilter <- function(object, s, ...) { |
249 : | feinerer | 73 | con <- textConnection(s) |
250 : | tokens <- scan(con, "character") | ||
251 : | close(con) | ||
252 : | feinerer | 721 | localMetaNames <- unique(names(sapply(object, LocalMetaData))) |
253 : | localMetaTokens <- localMetaNames[localMetaNames %in% tokens] | ||
254 : | query.df <- DMetaData(prescindMeta(object, | ||
255 : | c("Author", "DateTimeStamp", "Description", "ID", | ||
256 : | "Origin", "Heading", "Language", localMetaTokens))) | ||
257 : | # Rename to avoid name conflicts | ||
258 : | names(query.df)[names(query.df) == "Author"] <- "author" | ||
259 : | names(query.df)[names(query.df) == "DateTimeStamp"] <- "datetimestamp" | ||
260 : | names(query.df)[names(query.df) == "Description"] <- "description" | ||
261 : | names(query.df)[names(query.df) == "ID"] <- "identifier" | ||
262 : | names(query.df)[names(query.df) == "Origin"] <- "origin" | ||
263 : | names(query.df)[names(query.df) == "Heading"] <- "heading" | ||
264 : | names(query.df)[names(query.df) == "Language"] <- "language" | ||
265 : | feinerer | 73 | attach(query.df) |
266 : | feinerer | 74 | try(result <- rownames(query.df) %in% row.names(query.df[eval(parse(text = s)), ])) |
267 : | feinerer | 73 | detach(query.df) |
268 : | return(result) | ||
269 : | feinerer | 61 | } |
270 : | |||
271 : | feinerer | 698 | setGeneric("searchFullText", function(object, pattern, ...) standardGeneric("searchFullText")) |
272 : | setMethod("searchFullText", | ||
273 : | feinerer | 61 | signature(object = "PlainTextDocument", pattern = "character"), |
274 : | function(object, pattern, ...) { | ||
275 : | return(any(grep(pattern, Corpus(object)))) | ||
276 : | }) | ||
277 : | |||
278 : | feinerer | 698 | setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem")) |
279 : | setMethod("appendElem", | ||
280 : | feinerer | 72 | signature(object = "TextDocCol", data = "TextDocument"), |
281 : | function(object, data, meta = NULL) { | ||
282 : | feinerer | 712 | if (DBControl(object)[["useDb"]]) { |
283 : | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) | ||
284 : | if (dbExists(db, ID(data))) | ||
285 : | warning("document with identical ID already exists") | ||
286 : | dbInsert(db, ID(data), data) | ||
287 : | dbDisconnect(db) | ||
288 : | object@.Data[[length(object)+1]] <- ID(data) | ||
289 : | } | ||
290 : | else | ||
291 : | object@.Data[[length(object)+1]] <- data | ||
292 : | DMetaData(object) <- rbind(DMetaData(object), c(MetaID = CMetaData(object)@NodeID, meta)) | ||
293 : | feinerer | 52 | return(object) |
294 : | }) | ||
295 : | |||
296 : | feinerer | 698 | setGeneric("appendMeta", function(object, cmeta = NULL, dmeta = NULL) standardGeneric("appendMeta")) |
297 : | setMethod("appendMeta", | ||
298 : | feinerer | 72 | signature(object = "TextDocCol"), |
299 : | feinerer | 698 | function(object, cmeta = NULL, dmeta = NULL) { |
300 : | feinerer | 712 | object@CMetaData@MetaData <- c(CMetaData(object)@MetaData, cmeta) |
301 : | if (!is.null(dmeta)) { | ||
302 : | feinerer | 719 | DMetaData(object) <- cbind(DMetaData(object), eval(dmeta)) |
303 : | feinerer | 712 | } |
304 : | feinerer | 52 | return(object) |
305 : | }) | ||
306 : | feinerer | 53 | |
307 : | feinerer | 698 | setGeneric("removeMeta", function(object, cname = NULL, dname = NULL) standardGeneric("removeMeta")) |
308 : | setMethod("removeMeta", | ||
309 : | feinerer | 77 | signature(object = "TextDocCol"), |
310 : | feinerer | 698 | function(object, cname = NULL, dname = NULL) { |
311 : | if (!is.null(cname)) { | ||
312 : | object@CMetaData@MetaData <- CMetaData(object)@MetaData[names(CMetaData(object)@MetaData) != cname] | ||
313 : | feinerer | 77 | } |
314 : | if (!is.null(dname)) { | ||
315 : | feinerer | 712 | DMetaData(object) <- DMetaData(object)[names(DMetaData(object)) != dname] |
316 : | feinerer | 77 | } |
317 : | return(object) | ||
318 : | }) | ||
319 : | feinerer | 69 | |
320 : | feinerer | 698 | setGeneric("prescindMeta", function(object, meta) standardGeneric("prescindMeta")) |
321 : | setMethod("prescindMeta", | ||
322 : | feinerer | 73 | signature(object = "TextDocCol", meta = "character"), |
323 : | function(object, meta) { | ||
324 : | for (m in meta) { | ||
325 : | feinerer | 719 | if (m %in% c("Author", "DateTimeStamp", "Description", "ID", "Origin", "Heading", "Language")) { |
326 : | feinerer | 75 | local.m <- lapply(object, m) |
327 : | local.m <- lapply(local.m, function(x) if (is.null(x)) return(NA) else return(x)) | ||
328 : | feinerer | 73 | local.m <- unlist(local.m) |
329 : | feinerer | 721 | DMetaData(object) <- cbind(DMetaData(object), data.frame(m = local.m, stringsAsFactors = FALSE)) |
330 : | names(DMetaData(object))[which(names(DMetaData(object)) == "m")] <- m | ||
331 : | feinerer | 75 | } |
332 : | else { | ||
333 : | local.meta <- lapply(object, LocalMetaData) | ||
334 : | local.m <- lapply(local.meta, "[[", m) | ||
335 : | local.m <- lapply(local.m, function(x) if (is.null(x)) return(NA) else return(x)) | ||
336 : | if (length(local.m) == length(unlist(local.m))) | ||
337 : | local.m <- unlist(local.m) | ||
338 : | else | ||
339 : | local.m <- I(local.m) | ||
340 : | feinerer | 721 | DMetaData(object) <- cbind(DMetaData(object), data.frame(m = local.m, stringsAsFactors = FALSE)) |
341 : | names(DMetaData(object))[which(names(DMetaData(object)) == "m")] <- m | ||
342 : | feinerer | 75 | } |
343 : | feinerer | 73 | } |
344 : | return(object) | ||
345 : | }) | ||
346 : | |||
347 : | feinerer | 53 | setMethod("[", |
348 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", drop = "ANY"), | ||
349 : | function(x, i, j, ... , drop) { | ||
350 : | if(missing(i)) | ||
351 : | return(x) | ||
352 : | |||
353 : | object <- x | ||
354 : | object@.Data <- x@.Data[i, ..., drop = FALSE] | ||
355 : | feinerer | 724 | if (DBControl(object)[["useDb"]]) { |
356 : | index <- object@DMetaData[1 , "subset"] | ||
357 : | if (is.na(index)) | ||
358 : | object@DMetaData[1 , "subset"] <- i | ||
359 : | else | ||
360 : | object@DMetaData[1 , "subset"] <- index[i] | ||
361 : | } | ||
362 : | else { | ||
363 : | df <- as.data.frame(DMetaData(x)[i, ]) | ||
364 : | names(df) <- names(DMetaData(x)) | ||
365 : | DMetaData(object) <- df | ||
366 : | } | ||
367 : | feinerer | 53 | return(object) |
368 : | }) | ||
369 : | |||
370 : | feinerer | 63 | setMethod("[<-", |
371 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"), | ||
372 : | function(x, i, j, ... , value) { | ||
373 : | object <- x | ||
374 : | feinerer | 724 | if (DBControl(object)[["useDb"]]) { |
375 : | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) | ||
376 : | counter <- 1 | ||
377 : | for (id in object@.Data[i, ...]) { | ||
378 : | if (length(value) == 1) | ||
379 : | db[[id]] <- value | ||
380 : | else { | ||
381 : | db[[id]] <- value[[counter]] | ||
382 : | } | ||
383 : | counter <- counter + 1 | ||
384 : | } | ||
385 : | dbDisconnect(db) | ||
386 : | } | ||
387 : | else | ||
388 : | object@.Data[i, ...] <- value | ||
389 : | feinerer | 63 | return(object) |
390 : | }) | ||
391 : | |||
392 : | setMethod("[[", | ||
393 : | signature(x = "TextDocCol", i = "ANY", j = "ANY"), | ||
394 : | function(x, i, j, ...) { | ||
395 : | feinerer | 712 | if (DBControl(x)[["useDb"]]) { |
396 : | db <- dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) | ||
397 : | result <- dbFetch(db, x@.Data[[i]]) | ||
398 : | dbDisconnect(db) | ||
399 : | return(loadDoc(result)) | ||
400 : | } | ||
401 : | else | ||
402 : | return(loadDoc(x@.Data[[i]])) | ||
403 : | feinerer | 63 | }) |
404 : | |||
405 : | setMethod("[[<-", | ||
406 : | signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"), | ||
407 : | function(x, i, j, ..., value) { | ||
408 : | object <- x | ||
409 : | feinerer | 712 | if (DBControl(object)[["useDb"]]) { |
410 : | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) | ||
411 : | index <- object@.Data[[i]] | ||
412 : | db[[index]] <- value | ||
413 : | dbDisconnect(db) | ||
414 : | } | ||
415 : | else | ||
416 : | object@.Data[[i, ...]] <- value | ||
417 : | feinerer | 63 | return(object) |
418 : | }) | ||
419 : | |||
420 : | feinerer | 698 | # Update \code{NodeID}s of a CMetaData tree |
421 : | feinerer | 697 | update_id <- function(object, id = 0, mapping = NULL, left.mapping = NULL, level = 0) { |
422 : | feinerer | 698 | # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s |
423 : | feinerer | 697 | set_id <- function(object) { |
424 : | object@NodeID <- id | ||
425 : | id <<- id + 1 | ||
426 : | level <<- level + 1 | ||
427 : | feinerer | 71 | |
428 : | feinerer | 697 | if (length(object@children) > 0) { |
429 : | mapping <<- cbind(mapping, c(object@children[[1]]@NodeID, id)) | ||
430 : | left <- set_id(object@children[[1]]) | ||
431 : | if (level == 1) { | ||
432 : | left.mapping <<- mapping | ||
433 : | mapping <<- NULL | ||
434 : | } | ||
435 : | mapping <<- cbind(mapping, c(object@children[[2]]@NodeID, id)) | ||
436 : | right <- set_id(object@children[[2]]) | ||
437 : | feinerer | 71 | |
438 : | feinerer | 697 | object@children <- list(left, right) |
439 : | feinerer | 71 | } |
440 : | feinerer | 697 | level <<- level - 1 |
441 : | feinerer | 71 | |
442 : | feinerer | 697 | return(object) |
443 : | feinerer | 71 | } |
444 : | |||
445 : | feinerer | 697 | return(list(root = set_id(object), left.mapping = left.mapping, right.mapping = mapping)) |
446 : | feinerer | 71 | } |
447 : | |||
448 : | feinerer | 53 | setMethod("c", |
449 : | signature(x = "TextDocCol"), | ||
450 : | feinerer | 689 | function(x, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) { |
451 : | args <- list(...) | ||
452 : | feinerer | 720 | if (length(args) == 0) |
453 : | feinerer | 689 | return(x) |
454 : | feinerer | 71 | |
455 : | feinerer | 720 | if (!all(sapply(args, inherits, "TextDocCol"))) |
456 : | stop("not all arguments are text document collections") | ||
457 : | if (DBControl(x)$useDb == TRUE || any(unlist(sapply(args, DBControl)["useDb", ]))) | ||
458 : | stop("concatenating text document collections with activated database is not supported") | ||
459 : | |||
460 : | feinerer | 689 | result <- x |
461 : | for (c in args) { | ||
462 : | result <- c2(result, c) | ||
463 : | } | ||
464 : | return(result) | ||
465 : | }) | ||
466 : | |||
467 : | setGeneric("c2", function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2")) | ||
468 : | setMethod("c2", | ||
469 : | signature(x = "TextDocCol", y = "TextDocCol"), | ||
470 : | function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) { | ||
471 : | feinerer | 71 | object <- x |
472 : | # Concatenate data slots | ||
473 : | object@.Data <- c(as(x, "list"), as(y, "list")) | ||
474 : | |||
475 : | feinerer | 720 | # Set the DBControl slot |
476 : | object@DBControl <- list(useDb = FALSE, dbName = "", dbType = "DB1") | ||
477 : | |||
478 : | feinerer | 698 | # Update the CMetaData tree |
479 : | cmeta <- new("MetaDataNode", NodeID = 0, MetaData = meta, children = list(CMetaData(x), CMetaData(y))) | ||
480 : | update.struct <- update_id(cmeta) | ||
481 : | object@CMetaData <- update.struct$root | ||
482 : | feinerer | 71 | |
483 : | # Find indices to be updated for the left tree | ||
484 : | indices.mapping <- NULL | ||
485 : | for (m in levels(as.factor(DMetaData(x)$MetaID))) { | ||
486 : | indices <- (DMetaData(x)$MetaID == m) | ||
487 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
488 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
489 : | } | ||
490 : | |||
491 : | # Update the DMetaData data frames for the left tree | ||
492 : | for (i in 1:ncol(update.struct$left.mapping)) { | ||
493 : | map <- update.struct$left.mapping[,i] | ||
494 : | x@DMetaData$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
495 : | } | ||
496 : | |||
497 : | # Find indices to be updated for the right tree | ||
498 : | indices.mapping <- NULL | ||
499 : | for (m in levels(as.factor(DMetaData(y)$MetaID))) { | ||
500 : | indices <- (DMetaData(y)$MetaID == m) | ||
501 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
502 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
503 : | } | ||
504 : | |||
505 : | # Update the DMetaData data frames for the right tree | ||
506 : | for (i in 1:ncol(update.struct$right.mapping)) { | ||
507 : | map <- update.struct$right.mapping[,i] | ||
508 : | y@DMetaData$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
509 : | } | ||
510 : | |||
511 : | # Merge the DMetaData data frames | ||
512 : | labels <- setdiff(names(DMetaData(y)), names(DMetaData(x))) | ||
513 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels)) | ||
514 : | x.dmeta.aug <- cbind(DMetaData(x), na.matrix) | ||
515 : | labels <- setdiff(names(DMetaData(x)), names(DMetaData(y))) | ||
516 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels)) | ||
517 : | y.dmeta.aug <- cbind(DMetaData(y), na.matrix) | ||
518 : | object@DMetaData <- rbind(x.dmeta.aug, y.dmeta.aug) | ||
519 : | |||
520 : | return(object) | ||
521 : | feinerer | 72 | }) |
522 : | feinerer | 689 | |
523 : | feinerer | 72 | setMethod("c", |
524 : | signature(x = "TextDocument"), | ||
525 : | function(x, ..., recursive = TRUE){ | ||
526 : | args <- list(...) | ||
527 : | if(length(args) == 0) | ||
528 : | return(x) | ||
529 : | feinerer | 54 | |
530 : | feinerer | 73 | dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE) |
531 : | feinerer | 698 | cmeta.node <- new("MetaDataNode", |
532 : | feinerer | 72 | NodeID = 0, |
533 : | feinerer | 689 | MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")), |
534 : | feinerer | 72 | children = list()) |
535 : | |||
536 : | feinerer | 720 | return(new("TextDocCol", |
537 : | .Data = list(x, ...), | ||
538 : | DMetaData = dmeta.df, | ||
539 : | CMetaData = cmeta.node, | ||
540 : | DBControl = list(useDb = FALSE, dbName = "", dbType = "DB1"))) | ||
541 : | feinerer | 72 | }) |
542 : | |||
543 : | feinerer | 54 | setMethod("length", |
544 : | signature(x = "TextDocCol"), | ||
545 : | function(x){ | ||
546 : | return(length(as(x, "list"))) | ||
547 : | }) | ||
548 : | |||
549 : | setMethod("show", | ||
550 : | signature(object = "TextDocCol"), | ||
551 : | function(object){ | ||
552 : | feinerer | 70 | cat(sprintf(ngettext(length(object), |
553 : | "A text document collection with %d text document\n", | ||
554 : | "A text document collection with %d text documents\n"), | ||
555 : | length(object))) | ||
556 : | feinerer | 54 | }) |
557 : | |||
558 : | setMethod("summary", | ||
559 : | signature(object = "TextDocCol"), | ||
560 : | function(object){ | ||
561 : | show(object) | ||
562 : | feinerer | 71 | if (length(DMetaData(object)) > 0) { |
563 : | feinerer | 698 | cat(sprintf(ngettext(length(CMetaData(object)@MetaData), |
564 : | feinerer | 77 | "\nThe metadata consists of %d tag-value pair and a data frame\n", |
565 : | "\nThe metadata consists of %d tag-value pairs and a data frame\n"), | ||
566 : | feinerer | 698 | length(CMetaData(object)@MetaData))) |
567 : | feinerer | 54 | cat("Available tags are:\n") |
568 : | feinerer | 722 | cat(strwrap(paste(names(CMetaData(object)@MetaData), collapse = " "), indent = 2, exdent = 2), "\n") |
569 : | feinerer | 77 | cat("Available variables in the data frame are:\n") |
570 : | feinerer | 722 | cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n") |
571 : | feinerer | 54 | } |
572 : | }) | ||
573 : | feinerer | 55 | |
574 : | setGeneric("inspect", function(object) standardGeneric("inspect")) | ||
575 : | setMethod("inspect", | ||
576 : | feinerer | 65 | signature("TextDocCol"), |
577 : | feinerer | 55 | function(object) { |
578 : | summary(object) | ||
579 : | cat("\n") | ||
580 : | feinerer | 719 | if (DBControl(object)[["useDb"]]) { |
581 : | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) | ||
582 : | show(dbMultiFetch(db, unlist(object))) | ||
583 : | dbDisconnect(db) | ||
584 : | } | ||
585 : | else | ||
586 : | show(object@.Data) | ||
587 : | feinerer | 55 | }) |
588 : | feinerer | 65 | |
589 : | # No metadata is checked | ||
590 : | setGeneric("%IN%", function(x, y) standardGeneric("%IN%")) | ||
591 : | setMethod("%IN%", | ||
592 : | signature(x = "TextDocument", y = "TextDocCol"), | ||
593 : | function(x, y) { | ||
594 : | x %in% y | ||
595 : | }) | ||
596 : | feinerer | 719 | |
597 : | setMethod("lapply", | ||
598 : | signature(X = "TextDocCol"), | ||
599 : | function(X, FUN, ...) { | ||
600 : | if (DBControl(X)[["useDb"]]) { | ||
601 : | db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]]) | ||
602 : | result <- lapply(dbMultiFetch(db, unlist(X)), FUN, ...) | ||
603 : | dbDisconnect(db) | ||
604 : | } | ||
605 : | else | ||
606 : | result <- base::lapply(X, FUN, ...) | ||
607 : | return(result) | ||
608 : | }) | ||
609 : | |||
610 : | setMethod("sapply", | ||
611 : | signature(X = "TextDocCol"), | ||
612 : | function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { | ||
613 : | if (DBControl(X)[["useDb"]]) { | ||
614 : | db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]]) | ||
615 : | result <- sapply(dbMultiFetch(db, unlist(X)), FUN, ...) | ||
616 : | dbDisconnect(db) | ||
617 : | } | ||
618 : | else | ||
619 : | result <- base::sapply(X, FUN, ...) | ||
620 : | return(result) | ||
621 : | }) |
R-Forge@R-project.org | ViewVC Help |
Powered by ViewVC 1.0.0 |