SCM Repository
Annotation of /pkg/R/corpus.R
Parent Directory
|
Revision Log
Revision 1297 - (view) (download)
1 : | feinerer | 17 | # Author: Ingo Feinerer |
2 : | |||
3 : | khornik | 1203 | .PCorpus <- |
4 : | function(x, cmeta, dmeta, dbcontrol) | ||
5 : | { | ||
6 : | feinerer | 985 | attr(x, "CMetaData") <- cmeta |
7 : | attr(x, "DMetaData") <- dmeta | ||
8 : | attr(x, "DBControl") <- dbcontrol | ||
9 : | class(x) <- c("PCorpus", "Corpus", "list") | ||
10 : | x | ||
11 : | } | ||
12 : | |||
13 : | khornik | 1203 | DBControl <- |
14 : | function(x) | ||
15 : | attr(x, "DBControl") | ||
16 : | |||
17 : | PCorpus <- | ||
18 : | function(x, | ||
19 : | readerControl = list(reader = x$DefaultReader, language = "en"), | ||
20 : | feinerer | 1259 | dbControl = list(dbName = "", dbType = "DB1")) |
21 : | khornik | 1203 | { |
22 : | feinerer | 1297 | stopifnot(inherits(x, "Source")) |
23 : | feinerer | 1273 | |
24 : | feinerer | 1259 | readerControl <- prepareReader(readerControl, x$DefaultReader) |
25 : | feinerer | 63 | |
26 : | feinerer | 1114 | if (is.function(readerControl$init)) |
27 : | readerControl$init() | ||
28 : | |||
29 : | if (is.function(readerControl$exit)) | ||
30 : | on.exit(readerControl$exit()) | ||
31 : | |||
32 : | feinerer | 946 | if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType)) |
33 : | stop("error in creating database") | ||
34 : | db <- filehash::dbInit(dbControl$dbName, dbControl$dbType) | ||
35 : | feinerer | 712 | |
36 : | feinerer | 946 | # Allocate memory in advance if length is known |
37 : | feinerer | 1074 | tdl <- if (x$Length > 0) |
38 : | feinerer | 985 | vector("list", as.integer(x$Length)) |
39 : | feinerer | 946 | else |
40 : | list() | ||
41 : | feinerer | 869 | |
42 : | feinerer | 946 | counter <- 1 |
43 : | feinerer | 985 | while (!eoi(x)) { |
44 : | x <- stepNext(x) | ||
45 : | elem <- getElem(x) | ||
46 : | feinerer | 1259 | id <- if (is.null(x$Names) || is.na(x$Names)) |
47 : | as.character(counter) | ||
48 : | else | ||
49 : | x$Names[counter] | ||
50 : | doc <- readerControl$reader(elem, readerControl$language, id) | ||
51 : | feinerer | 946 | filehash::dbInsert(db, ID(doc), doc) |
52 : | feinerer | 985 | if (x$Length > 0) tdl[[counter]] <- ID(doc) |
53 : | feinerer | 946 | else tdl <- c(tdl, ID(doc)) |
54 : | counter <- counter + 1 | ||
55 : | } | ||
56 : | feinerer | 1261 | if (!is.null(x$Names) && !is.na(x$Names)) |
57 : | names(tdl) <- x$Names | ||
58 : | feinerer | 63 | |
59 : | feinerer | 946 | df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
60 : | filehash::dbInsert(db, "DMetaData", df) | ||
61 : | dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA))) | ||
62 : | feinerer | 712 | |
63 : | feinerer | 985 | .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl) |
64 : | } | ||
65 : | feinerer | 71 | |
66 : | khornik | 1203 | .VCorpus <- |
67 : | function(x, cmeta, dmeta) | ||
68 : | { | ||
69 : | feinerer | 985 | attr(x, "CMetaData") <- cmeta |
70 : | attr(x, "DMetaData") <- dmeta | ||
71 : | class(x) <- c("VCorpus", "Corpus", "list") | ||
72 : | x | ||
73 : | feinerer | 946 | } |
74 : | feinerer | 21 | |
75 : | khornik | 1203 | VCorpus <- |
76 : | Corpus <- | ||
77 : | feinerer | 1259 | function(x, readerControl = list(reader = x$DefaultReader, language = "en")) |
78 : | khornik | 1203 | { |
79 : | feinerer | 1297 | stopifnot(inherits(x, "Source")) |
80 : | feinerer | 1273 | |
81 : | feinerer | 1259 | readerControl <- prepareReader(readerControl, x$DefaultReader) |
82 : | feinerer | 49 | |
83 : | feinerer | 1114 | if (is.function(readerControl$init)) |
84 : | readerControl$init() | ||
85 : | |||
86 : | if (is.function(readerControl$exit)) | ||
87 : | on.exit(readerControl$exit()) | ||
88 : | |||
89 : | feinerer | 946 | # Allocate memory in advance if length is known |
90 : | feinerer | 985 | tdl <- if (x$Length > 0) |
91 : | vector("list", as.integer(x$Length)) | ||
92 : | feinerer | 946 | else |
93 : | list() | ||
94 : | feinerer | 72 | |
95 : | feinerer | 985 | if (x$Vectorized) |
96 : | feinerer | 987 | tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id), |
97 : | pGetElem(x), | ||
98 : | feinerer | 1258 | id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names, |
99 : | feinerer | 987 | SIMPLIFY = FALSE) |
100 : | feinerer | 946 | else { |
101 : | counter <- 1 | ||
102 : | feinerer | 985 | while (!eoi(x)) { |
103 : | x <- stepNext(x) | ||
104 : | elem <- getElem(x) | ||
105 : | feinerer | 1258 | id <- if (is.null(x$Names) || is.na(x$Names)) |
106 : | as.character(counter) | ||
107 : | else | ||
108 : | x$Names[counter] | ||
109 : | doc <- readerControl$reader(elem, readerControl$language, id) | ||
110 : | feinerer | 985 | if (x$Length > 0) |
111 : | feinerer | 946 | tdl[[counter]] <- doc |
112 : | else | ||
113 : | tdl <- c(tdl, list(doc)) | ||
114 : | counter <- counter + 1 | ||
115 : | } | ||
116 : | } | ||
117 : | feinerer | 1261 | if (!is.null(x$Names) && !is.na(x$Names)) |
118 : | names(tdl) <- x$Names | ||
119 : | feinerer | 946 | df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
120 : | feinerer | 985 | .VCorpus(tdl, .MetaDataNode(), df) |
121 : | } | ||
122 : | feinerer | 72 | |
123 : | khornik | 1203 | `[.PCorpus` <- |
124 : | function(x, i) | ||
125 : | { | ||
126 : | feinerer | 985 | if (missing(i)) return(x) |
127 : | index <- attr(x, "DMetaData")[[1 , "subset"]] | ||
128 : | attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i | ||
129 : | dmeta <- attr(x, "DMetaData") | ||
130 : | feinerer | 995 | .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x)) |
131 : | feinerer | 946 | } |
132 : | feinerer | 72 | |
133 : | khornik | 1203 | `[.VCorpus` <- |
134 : | function(x, i) | ||
135 : | { | ||
136 : | feinerer | 985 | if (missing(i)) return(x) |
137 : | feinerer | 995 | .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE]) |
138 : | feinerer | 985 | } |
139 : | feinerer | 49 | |
140 : | khornik | 1203 | `[<-.PCorpus` <- |
141 : | function(x, i, value) | ||
142 : | { | ||
143 : | feinerer | 985 | db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) |
144 : | counter <- 1 | ||
145 : | for (id in unclass(x)[i]) { | ||
146 : | feinerer | 1025 | if (identical(length(value), 1L)) db[[id]] <- value |
147 : | feinerer | 985 | else db[[id]] <- value[[counter]] |
148 : | counter <- counter + 1 | ||
149 : | feinerer | 829 | } |
150 : | feinerer | 985 | x |
151 : | feinerer | 828 | } |
152 : | |||
153 : | khornik | 1203 | .map_name_index <- |
154 : | function(x, i) | ||
155 : | { | ||
156 : | feinerer | 1108 | if (is.character(i)) { |
157 : | if (is.null(names(x))) | ||
158 : | match(i, meta(x, "ID", type = "local")) | ||
159 : | else | ||
160 : | match(i, names(x)) | ||
161 : | } | ||
162 : | i | ||
163 : | } | ||
164 : | |||
165 : | khornik | 1203 | `[[.PCorpus` <- |
166 : | function(x, i) | ||
167 : | { | ||
168 : | feinerer | 1108 | i <- .map_name_index(x, i) |
169 : | feinerer | 985 | db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) |
170 : | feinerer | 995 | filehash::dbFetch(db, NextMethod("[[")) |
171 : | feinerer | 985 | } |
172 : | khornik | 1203 | `[[.VCorpus` <- |
173 : | function(x, i) | ||
174 : | { | ||
175 : | feinerer | 1108 | i <- .map_name_index(x, i) |
176 : | feinerer | 985 | lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus") |
177 : | if (!is.null(lazyTmMap)) | ||
178 : | .Call("copyCorpus", x, materialize(x, i)) | ||
179 : | feinerer | 995 | NextMethod("[[") |
180 : | feinerer | 985 | } |
181 : | feinerer | 886 | |
182 : | khornik | 1203 | `[[<-.PCorpus` <- |
183 : | function(x, i, value) | ||
184 : | { | ||
185 : | feinerer | 1108 | i <- .map_name_index(x, i) |
186 : | feinerer | 985 | db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) |
187 : | index <- unclass(x)[[i]] | ||
188 : | db[[index]] <- value | ||
189 : | x | ||
190 : | feinerer | 946 | } |
191 : | khornik | 1203 | `[[<-.VCorpus` <- |
192 : | function(x, i, value) | ||
193 : | { | ||
194 : | feinerer | 1108 | i <- .map_name_index(x, i) |
195 : | feinerer | 985 | # Mark new objects as not active for lazy mapping |
196 : | lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus") | ||
197 : | if (!is.null(lazyTmMap)) { | ||
198 : | lazyTmMap$index[i] <- FALSE | ||
199 : | meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap | ||
200 : | } | ||
201 : | # Set the value | ||
202 : | cl <- class(x) | ||
203 : | feinerer | 995 | y <- NextMethod("[[<-") |
204 : | class(y) <- cl | ||
205 : | y | ||
206 : | feinerer | 985 | } |
207 : | feinerer | 946 | |
208 : | feinerer | 1004 | # Update NodeIDs of a CMetaData tree |
209 : | khornik | 1203 | .update_id <- |
210 : | function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) | ||
211 : | { | ||
212 : | feinerer | 1004 | # Traversal of (binary) CMetaData tree with setup of NodeIDs |
213 : | feinerer | 985 | set_id <- function(x) { |
214 : | feinerer | 988 | x$NodeID <- id |
215 : | feinerer | 697 | id <<- id + 1 |
216 : | level <<- level + 1 | ||
217 : | feinerer | 1285 | if (length(x$Children)) { |
218 : | feinerer | 988 | mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id)) |
219 : | left <- set_id(x$Children[[1]]) | ||
220 : | feinerer | 697 | if (level == 1) { |
221 : | left.mapping <<- mapping | ||
222 : | mapping <<- NULL | ||
223 : | } | ||
224 : | feinerer | 988 | mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id)) |
225 : | right <- set_id(x$Children[[2]]) | ||
226 : | feinerer | 71 | |
227 : | feinerer | 988 | x$Children <- list(left, right) |
228 : | feinerer | 71 | } |
229 : | feinerer | 697 | level <<- level - 1 |
230 : | feinerer | 985 | x |
231 : | feinerer | 71 | } |
232 : | feinerer | 985 | list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping) |
233 : | feinerer | 71 | } |
234 : | |||
235 : | feinerer | 1004 | # Find indices to be updated for a CMetaData tree |
236 : | khornik | 1203 | .find_indices <- |
237 : | function(x) | ||
238 : | { | ||
239 : | feinerer | 1004 | indices.mapping <- NULL |
240 : | for (m in levels(as.factor(DMetaData(x)$MetaID))) { | ||
241 : | indices <- (DMetaData(x)$MetaID == m) | ||
242 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
243 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
244 : | } | ||
245 : | indices.mapping | ||
246 : | } | ||
247 : | |||
248 : | khornik | 1203 | c2 <- |
249 : | function(x, y, ...) | ||
250 : | { | ||
251 : | feinerer | 985 | # Update the CMetaData tree |
252 : | cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y))) | ||
253 : | feinerer | 1004 | update.struct <- .update_id(cmeta) |
254 : | feinerer | 71 | |
255 : | feinerer | 985 | new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL) |
256 : | feinerer | 720 | |
257 : | feinerer | 985 | # Find indices to be updated for the left tree |
258 : | feinerer | 1004 | indices.mapping <- .find_indices(x) |
259 : | feinerer | 958 | |
260 : | feinerer | 985 | # Update the DMetaData data frames for the left tree |
261 : | for (i in 1:ncol(update.struct$left.mapping)) { | ||
262 : | map <- update.struct$left.mapping[,i] | ||
263 : | DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
264 : | } | ||
265 : | feinerer | 689 | |
266 : | feinerer | 985 | # Find indices to be updated for the right tree |
267 : | feinerer | 1004 | indices.mapping <- .find_indices(y) |
268 : | feinerer | 71 | |
269 : | feinerer | 985 | # Update the DMetaData data frames for the right tree |
270 : | for (i in 1:ncol(update.struct$right.mapping)) { | ||
271 : | map <- update.struct$right.mapping[,i] | ||
272 : | DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
273 : | } | ||
274 : | feinerer | 71 | |
275 : | feinerer | 985 | # Merge the DMetaData data frames |
276 : | labels <- setdiff(names(DMetaData(y)), names(DMetaData(x))) | ||
277 : | khornik | 1203 | na.matrix <- matrix(NA, |
278 : | nrow = nrow(DMetaData(x)), | ||
279 : | ncol = length(labels), | ||
280 : | dimnames = list(row.names(DMetaData(x)), labels)) | ||
281 : | feinerer | 985 | x.dmeta.aug <- cbind(DMetaData(x), na.matrix) |
282 : | labels <- setdiff(names(DMetaData(x)), names(DMetaData(y))) | ||
283 : | khornik | 1203 | na.matrix <- matrix(NA, |
284 : | nrow = nrow(DMetaData(y)), | ||
285 : | ncol = length(labels), | ||
286 : | dimnames = list(row.names(DMetaData(y)), labels)) | ||
287 : | feinerer | 985 | y.dmeta.aug <- cbind(DMetaData(y), na.matrix) |
288 : | DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug) | ||
289 : | feinerer | 71 | |
290 : | feinerer | 985 | new |
291 : | } | ||
292 : | feinerer | 71 | |
293 : | feinerer | 985 | c.Corpus <- |
294 : | khornik | 1203 | function(..., recursive = FALSE) |
295 : | feinerer | 985 | { |
296 : | args <- list(...) | ||
297 : | khornik | 1203 | x <- args[[1L]] |
298 : | feinerer | 71 | |
299 : | khornik | 1203 | if(length(args) == 1L) |
300 : | feinerer | 985 | return(x) |
301 : | feinerer | 71 | |
302 : | feinerer | 985 | if (!all(unlist(lapply(args, inherits, class(x))))) |
303 : | stop("not all arguments are of the same corpus type") | ||
304 : | feinerer | 71 | |
305 : | feinerer | 985 | if (inherits(x, "PCorpus")) |
306 : | stop("concatenation of corpora with underlying databases is not supported") | ||
307 : | feinerer | 689 | |
308 : | feinerer | 1095 | if (recursive) |
309 : | khornik | 1203 | Reduce(c2, args) |
310 : | feinerer | 1095 | else { |
311 : | khornik | 1203 | args <- do.call("c", lapply(args, unclass)) |
312 : | .VCorpus(args, | ||
313 : | feinerer | 1095 | cmeta = .MetaDataNode(), |
314 : | khornik | 1203 | dmeta = data.frame(MetaID = rep(0, length(args)), |
315 : | stringsAsFactors = FALSE)) | ||
316 : | feinerer | 1095 | } |
317 : | feinerer | 985 | } |
318 : | feinerer | 54 | |
319 : | khornik | 1203 | c.TextDocument <- |
320 : | function(..., recursive = FALSE) | ||
321 : | { | ||
322 : | feinerer | 985 | args <- list(...) |
323 : | khornik | 1203 | x <- args[[1L]] |
324 : | feinerer | 72 | |
325 : | khornik | 1203 | if(length(args) == 1L) |
326 : | feinerer | 985 | return(x) |
327 : | feinerer | 72 | |
328 : | feinerer | 985 | if (!all(unlist(lapply(args, inherits, class(x))))) |
329 : | stop("not all arguments are text documents") | ||
330 : | feinerer | 54 | |
331 : | khornik | 1203 | dmeta <- data.frame(MetaID = rep(0, length(args)), |
332 : | stringsAsFactors = FALSE) | ||
333 : | .VCorpus(args, .MetaDataNode(), dmeta) | ||
334 : | feinerer | 985 | } |
335 : | feinerer | 55 | |
336 : | khornik | 1203 | print.Corpus <- |
337 : | function(x, ...) | ||
338 : | { | ||
339 : | feinerer | 985 | cat(sprintf(ngettext(length(x), |
340 : | "A corpus with %d text document\n", | ||
341 : | "A corpus with %d text documents\n"), | ||
342 : | length(x))) | ||
343 : | invisible(x) | ||
344 : | } | ||
345 : | |||
346 : | khornik | 1203 | summary.Corpus <- |
347 : | function(object, ...) | ||
348 : | { | ||
349 : | feinerer | 988 | print(object) |
350 : | feinerer | 1285 | if (length(DMetaData(object))) { |
351 : | feinerer | 988 | cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")), |
352 : | feinerer | 985 | "\nThe metadata consists of %d tag-value pair and a data frame\n", |
353 : | "\nThe metadata consists of %d tag-value pairs and a data frame\n"), | ||
354 : | feinerer | 988 | length(CMetaData(object)$MetaData))) |
355 : | feinerer | 985 | cat("Available tags are:\n") |
356 : | feinerer | 988 | cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n") |
357 : | feinerer | 985 | cat("Available variables in the data frame are:\n") |
358 : | feinerer | 988 | cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n") |
359 : | feinerer | 985 | } |
360 : | } | ||
361 : | |||
362 : | khornik | 1203 | inspect <- |
363 : | function(x) | ||
364 : | UseMethod("inspect", x) | ||
365 : | inspect.PCorpus <- | ||
366 : | function(x) | ||
367 : | { | ||
368 : | feinerer | 938 | summary(x) |
369 : | cat("\n") | ||
370 : | feinerer | 946 | db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) |
371 : | show(filehash::dbMultiFetch(db, unlist(x))) | ||
372 : | feinerer | 938 | } |
373 : | khornik | 1203 | inspect.VCorpus <- |
374 : | function(x) | ||
375 : | { | ||
376 : | feinerer | 946 | summary(x) |
377 : | cat("\n") | ||
378 : | print(noquote(lapply(x, identity))) | ||
379 : | } | ||
380 : | feinerer | 65 | |
381 : | khornik | 1203 | lapply.PCorpus <- |
382 : | function(X, FUN, ...) | ||
383 : | { | ||
384 : | feinerer | 985 | db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]]) |
385 : | lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...) | ||
386 : | } | ||
387 : | khornik | 1203 | lapply.VCorpus <- |
388 : | function(X, FUN, ...) | ||
389 : | { | ||
390 : | feinerer | 985 | lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus") |
391 : | if (!is.null(lazyTmMap)) | ||
392 : | .Call("copyCorpus", X, materialize(X)) | ||
393 : | base::lapply(X, FUN, ...) | ||
394 : | } | ||
395 : | feinerer | 719 | |
396 : | khornik | 1203 | writeCorpus <- |
397 : | function(x, path = ".", filenames = NULL) | ||
398 : | { | ||
399 : | feinerer | 985 | filenames <- file.path(path, |
400 : | if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x)))) | ||
401 : | else filenames) | ||
402 : | i <- 1 | ||
403 : | for (o in x) { | ||
404 : | writeLines(as.PlainTextDocument(o), filenames[i]) | ||
405 : | i <- i + 1 | ||
406 : | feinerer | 836 | } |
407 : | feinerer | 985 | } |
R-Forge@R-project.org | ViewVC Help |
Powered by ViewVC 1.0.0 |