SCM Repository
Annotation of /pkg/R/corpus.R
Parent Directory
|
Revision Log
Revision 900 -
(view)
(download)
Original Path: pkg/R/textdoccol.R
1 : | feinerer | 17 | # Author: Ingo Feinerer |
2 : | |||
3 : | feinerer | 722 | # The "..." are additional arguments for the FunctionGenerator reader |
4 : | feinerer | 816 | setGeneric("Corpus", function(object, |
5 : | feinerer | 900 | readerControl = list(reader = object@DefaultReader, language = "en_US", load = TRUE), |
6 : | dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"), | ||
7 : | ...) standardGeneric("Corpus")) | ||
8 : | feinerer | 816 | setMethod("Corpus", |
9 : | feinerer | 63 | signature(object = "Source"), |
10 : | feinerer | 717 | function(object, |
11 : | feinerer | 817 | readerControl = list(reader = object@DefaultReader, language = "en_US", load = TRUE), |
12 : | feinerer | 741 | dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"), |
13 : | feinerer | 733 | ...) { |
14 : | feinerer | 809 | if (is.null(readerControl$reader)) |
15 : | readerControl$reader <- object@DefaultReader | ||
16 : | feinerer | 777 | if (is(readerControl$reader, "FunctionGenerator")) |
17 : | feinerer | 722 | readerControl$reader <- readerControl$reader(...) |
18 : | feinerer | 799 | if (is.null(readerControl$language)) |
19 : | readerControl$language = "en_US" | ||
20 : | if (is.null(readerControl$load)) | ||
21 : | feinerer | 817 | readerControl$load = TRUE |
22 : | feinerer | 63 | |
23 : | feinerer | 886 | if (dbControl$useDb && require("filehash")) { |
24 : | feinerer | 712 | if (!dbCreate(dbControl$dbName, dbControl$dbType)) |
25 : | stop("error in creating database") | ||
26 : | db <- dbInit(dbControl$dbName, dbControl$dbType) | ||
27 : | } | ||
28 : | |||
29 : | feinerer | 869 | # Allocate memory in advance if length is known |
30 : | tdl <- if (object@Length > 0) | ||
31 : | vector("list", as.integer(object@Length)) | ||
32 : | else | ||
33 : | list() | ||
34 : | |||
35 : | feinerer | 63 | counter <- 1 |
36 : | while (!eoi(object)) { | ||
37 : | feinerer | 698 | object <- stepNext(object) |
38 : | elem <- getElem(object) | ||
39 : | feinerer | 63 | # If there is no Load on Demand support |
40 : | # we need to load the corpus into memory at startup | ||
41 : | feinerer | 694 | if (!object@LoDSupport) |
42 : | feinerer | 722 | readerControl$load <- TRUE |
43 : | doc <- readerControl$reader(elem, readerControl$load, readerControl$language, as.character(counter)) | ||
44 : | feinerer | 886 | if (dbControl$useDb && require("filehash")) { |
45 : | feinerer | 712 | dbInsert(db, ID(doc), doc) |
46 : | feinerer | 869 | if (object@Length > 0) |
47 : | tdl[[counter]] <- ID(doc) | ||
48 : | else | ||
49 : | tdl <- c(tdl, ID(doc)) | ||
50 : | feinerer | 712 | } |
51 : | feinerer | 869 | else { |
52 : | if (object@Length > 0) | ||
53 : | tdl[[counter]] <- doc | ||
54 : | else | ||
55 : | tdl <- c(tdl, list(doc)) | ||
56 : | } | ||
57 : | feinerer | 63 | counter <- counter + 1 |
58 : | } | ||
59 : | |||
60 : | feinerer | 712 | df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
61 : | feinerer | 886 | if (dbControl$useDb && require("filehash")) { |
62 : | feinerer | 712 | dbInsert(db, "DMetaData", df) |
63 : | feinerer | 727 | dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA))) |
64 : | feinerer | 712 | } |
65 : | else | ||
66 : | dmeta.df <- df | ||
67 : | |||
68 : | feinerer | 698 | cmeta.node <- new("MetaDataNode", |
69 : | feinerer | 71 | NodeID = 0, |
70 : | feinerer | 689 | MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")), |
71 : | feinerer | 71 | children = list()) |
72 : | |||
73 : | feinerer | 816 | return(new("Corpus", .Data = tdl, DMetaData = dmeta.df, CMetaData = cmeta.node, DBControl = dbControl)) |
74 : | feinerer | 21 | }) |
75 : | |||
76 : | feinerer | 698 | setGeneric("loadDoc", function(object, ...) standardGeneric("loadDoc")) |
77 : | setMethod("loadDoc", | ||
78 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
79 : | feinerer | 65 | function(object, ...) { |
80 : | feinerer | 61 | if (!Cached(object)) { |
81 : | feinerer | 67 | con <- eval(URI(object)) |
82 : | feinerer | 65 | corpus <- readLines(con) |
83 : | close(con) | ||
84 : | feinerer | 816 | Content(object) <- corpus |
85 : | feinerer | 60 | Cached(object) <- TRUE |
86 : | feinerer | 56 | return(object) |
87 : | } else { | ||
88 : | return(object) | ||
89 : | } | ||
90 : | }) | ||
91 : | feinerer | 698 | setMethod("loadDoc", |
92 : | feinerer | 62 | signature(object = "XMLTextDocument"), |
93 : | feinerer | 65 | function(object, ...) { |
94 : | feinerer | 886 | if (!Cached(object) && require("XML")) { |
95 : | feinerer | 67 | con <- eval(URI(object)) |
96 : | feinerer | 65 | corpus <- paste(readLines(con), "\n", collapse = "") |
97 : | close(con) | ||
98 : | doc <- xmlTreeParse(corpus, asText = TRUE) | ||
99 : | feinerer | 49 | class(doc) <- "list" |
100 : | feinerer | 816 | Content(object) <- doc |
101 : | feinerer | 60 | Cached(object) <- TRUE |
102 : | feinerer | 49 | return(object) |
103 : | } else { | ||
104 : | return(object) | ||
105 : | } | ||
106 : | }) | ||
107 : | feinerer | 698 | setMethod("loadDoc", |
108 : | feinerer | 62 | signature(object = "NewsgroupDocument"), |
109 : | feinerer | 65 | function(object, ...) { |
110 : | feinerer | 61 | if (!Cached(object)) { |
111 : | feinerer | 67 | con <- eval(URI(object)) |
112 : | feinerer | 65 | mail <- readLines(con) |
113 : | close(con) | ||
114 : | feinerer | 60 | Cached(object) <- TRUE |
115 : | feinerer | 744 | for (index in seq_along(mail)) { |
116 : | feinerer | 65 | if (mail[index] == "") |
117 : | break | ||
118 : | } | ||
119 : | feinerer | 816 | Content(object) <- mail[(index + 1):length(mail)] |
120 : | feinerer | 56 | return(object) |
121 : | } else { | ||
122 : | return(object) | ||
123 : | } | ||
124 : | }) | ||
125 : | feinerer | 767 | setMethod("loadDoc", |
126 : | signature(object = "StructuredTextDocument"), | ||
127 : | function(object, ...) { | ||
128 : | if (!Cached(object)) { | ||
129 : | warning("load on demand not (yet) supported for StructuredTextDocuments") | ||
130 : | return(object) | ||
131 : | } else | ||
132 : | return(object) | ||
133 : | }) | ||
134 : | feinerer | 49 | |
135 : | feinerer | 719 | setGeneric("tmUpdate", function(object, |
136 : | origin, | ||
137 : | feinerer | 817 | readerControl = list(reader = origin@DefaultReader, language = "en_US", load = TRUE), |
138 : | feinerer | 719 | ...) standardGeneric("tmUpdate")) |
139 : | feinerer | 72 | # Update is only supported for directories |
140 : | # At the moment no other LoD devices are available anyway | ||
141 : | feinerer | 698 | setMethod("tmUpdate", |
142 : | feinerer | 816 | signature(object = "Corpus", origin = "DirSource"), |
143 : | feinerer | 719 | function(object, origin, |
144 : | feinerer | 817 | readerControl = list(reader = origin@DefaultReader, language = "en_US", load = TRUE), |
145 : | feinerer | 719 | ...) { |
146 : | feinerer | 809 | if (is.null(readerControl$reader)) |
147 : | readerControl$reader <- origin@DefaultReader | ||
148 : | feinerer | 777 | if (is(readerControl$reader, "FunctionGenerator")) |
149 : | feinerer | 722 | readerControl$reader <- readerControl$reader(...) |
150 : | feinerer | 809 | if (is.null(readerControl$language)) |
151 : | readerControl$language = "en_US" | ||
152 : | if (is.null(readerControl$load)) | ||
153 : | feinerer | 817 | readerControl$load = TRUE |
154 : | feinerer | 72 | |
155 : | feinerer | 875 | object.filelist <- unlist(lapply(object, function(x) {summary(eval(URI(x)))$description})) |
156 : | feinerer | 72 | new.files <- setdiff(origin@FileList, object.filelist) |
157 : | |||
158 : | for (filename in new.files) { | ||
159 : | feinerer | 812 | encoding <- origin@Encoding |
160 : | elem <- list(content = readLines(filename, encoding = encoding), | ||
161 : | uri = substitute(file(filename, encoding = encoding))) | ||
162 : | feinerer | 722 | object <- appendElem(object, readerControl$reader(elem, readerControl$load, readerControl$language, filename)) |
163 : | feinerer | 72 | } |
164 : | |||
165 : | return(object) | ||
166 : | }) | ||
167 : | |||
168 : | feinerer | 830 | setGeneric("tmMap", function(object, FUN, ..., lazy = FALSE) standardGeneric("tmMap")) |
169 : | feinerer | 698 | setMethod("tmMap", |
170 : | feinerer | 816 | signature(object = "Corpus", FUN = "function"), |
171 : | feinerer | 830 | function(object, FUN, ..., lazy = FALSE) { |
172 : | feinerer | 73 | result <- object |
173 : | feinerer | 689 | # Note that text corpora are automatically loaded into memory via \code{[[} |
174 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
175 : | feinerer | 830 | if (lazy) |
176 : | warning("lazy mapping is deactived when using database backend") | ||
177 : | feinerer | 719 | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) |
178 : | feinerer | 728 | i <- 1 |
179 : | for (id in unlist(object)) { | ||
180 : | db[[id]] <- FUN(object[[i]], ..., DMetaData = DMetaData(object)) | ||
181 : | i <- i + 1 | ||
182 : | } | ||
183 : | feinerer | 833 | # Suggested by Christian Buchta |
184 : | dbReorganize(db) | ||
185 : | feinerer | 719 | } |
186 : | feinerer | 828 | else { |
187 : | feinerer | 830 | # Lazy mapping |
188 : | if (lazy) { | ||
189 : | lazyTmMap <- meta(object, tag = "lazyTmMap", type = "corpus") | ||
190 : | if (is.null(lazyTmMap)) { | ||
191 : | meta(result, tag = "lazyTmMap", type = "corpus") <- | ||
192 : | list(index = rep(TRUE, length(result)), | ||
193 : | maps = list(function(x, DMetaData) FUN(x, ..., DMetaData = DMetaData))) | ||
194 : | } | ||
195 : | else { | ||
196 : | lazyTmMap$maps <- c(lazyTmMap$maps, list(function(x, DMetaData) FUN(x, ..., DMetaData = DMetaData))) | ||
197 : | meta(result, tag = "lazyTmMap", type = "corpus") <- lazyTmMap | ||
198 : | } | ||
199 : | } | ||
200 : | feinerer | 861 | else { |
201 : | result@.Data <- if (clusterAvailable()) | ||
202 : | feinerer | 870 | snow::parLapply(snow::getMPIcluster(), object, FUN, ..., DMetaData = DMetaData(object)) |
203 : | feinerer | 861 | else |
204 : | lapply(object, FUN, ..., DMetaData = DMetaData(object)) | ||
205 : | } | ||
206 : | feinerer | 828 | } |
207 : | feinerer | 56 | return(result) |
208 : | feinerer | 49 | }) |
209 : | |||
210 : | feinerer | 828 | # Materialize lazy mappings |
211 : | feinerer | 833 | # Improvements by Christian Buchta |
212 : | feinerer | 829 | materialize <- function(corpus, range = seq_along(corpus)) { |
213 : | feinerer | 828 | lazyTmMap <- meta(corpus, tag = "lazyTmMap", type = "corpus") |
214 : | feinerer | 829 | if (!is.null(lazyTmMap)) { |
215 : | feinerer | 833 | # Make valid and lazy index |
216 : | idx <- (seq_along(corpus) %in% range) & lazyTmMap$index | ||
217 : | if (any(idx)) { | ||
218 : | res <- lapply(corpus@.Data[idx], loadDoc) | ||
219 : | for (m in lazyTmMap$maps) | ||
220 : | res <- lapply(res, m, DMetaData = DMetaData(corpus)) | ||
221 : | corpus@.Data[idx] <- res | ||
222 : | lazyTmMap$index[idx] <- FALSE | ||
223 : | } | ||
224 : | feinerer | 829 | } |
225 : | feinerer | 830 | # Clean up if everything is materialized |
226 : | if (!any(lazyTmMap$index)) | ||
227 : | lazyTmMap <- NULL | ||
228 : | feinerer | 829 | meta(corpus, tag = "lazyTmMap", type = "corpus") <- lazyTmMap |
229 : | feinerer | 828 | return(corpus) |
230 : | } | ||
231 : | |||
232 : | feinerer | 698 | setGeneric("asPlain", function(object, FUN, ...) standardGeneric("asPlain")) |
233 : | setMethod("asPlain", | ||
234 : | feinerer | 62 | signature(object = "PlainTextDocument"), |
235 : | feinerer | 49 | function(object, FUN, ...) { |
236 : | return(object) | ||
237 : | }) | ||
238 : | feinerer | 698 | setMethod("asPlain", |
239 : | feinerer | 856 | signature(object = "XMLTextDocument"), |
240 : | feinerer | 49 | function(object, FUN, ...) { |
241 : | feinerer | 886 | require("XML") |
242 : | |||
243 : | feinerer | 816 | corpus <- Content(object) |
244 : | feinerer | 49 | |
245 : | # As XMLDocument is no native S4 class, restore valid information | ||
246 : | class(corpus) <- "XMLDocument" | ||
247 : | names(corpus) <- c("doc","dtd") | ||
248 : | |||
249 : | feinerer | 61 | return(FUN(xmlRoot(corpus), ...)) |
250 : | feinerer | 49 | }) |
251 : | feinerer | 725 | setMethod("asPlain", |
252 : | feinerer | 757 | signature(object = "Reuters21578Document"), |
253 : | function(object, FUN, ...) { | ||
254 : | feinerer | 886 | require("XML") |
255 : | |||
256 : | feinerer | 757 | FUN <- convertReut21578XMLPlain |
257 : | feinerer | 816 | corpus <- Content(object) |
258 : | feinerer | 757 | |
259 : | # As XMLDocument is no native S4 class, restore valid information | ||
260 : | class(corpus) <- "XMLDocument" | ||
261 : | names(corpus) <- c("doc","dtd") | ||
262 : | |||
263 : | return(FUN(xmlRoot(corpus), ...)) | ||
264 : | }) | ||
265 : | setMethod("asPlain", | ||
266 : | signature(object = "RCV1Document"), | ||
267 : | function(object, FUN, ...) { | ||
268 : | feinerer | 856 | return(convertRCV1Plain(object, ...)) |
269 : | feinerer | 757 | }) |
270 : | setMethod("asPlain", | ||
271 : | feinerer | 725 | signature(object = "NewsgroupDocument"), |
272 : | function(object, FUN, ...) { | ||
273 : | feinerer | 875 | new("PlainTextDocument", .Data = Content(object), Cached = TRUE, URI = NULL, Author = Author(object), |
274 : | feinerer | 725 | DateTimeStamp = DateTimeStamp(object), Description = Description(object), ID = ID(object), |
275 : | feinerer | 826 | Origin = Origin(object), Heading = Heading(object), Language = Language(object), |
276 : | LocalMetaData = LocalMetaData(object)) | ||
277 : | feinerer | 725 | }) |
278 : | feinerer | 775 | setMethod("asPlain", |
279 : | signature(object = "StructuredTextDocument"), | ||
280 : | function(object, FUN, ...) { | ||
281 : | feinerer | 816 | new("PlainTextDocument", .Data = unlist(Content(object)), Cached = TRUE, |
282 : | feinerer | 875 | URI = NULL, Author = Author(object), DateTimeStamp = DateTimeStamp(object), |
283 : | feinerer | 775 | Description = Description(object), ID = ID(object), Origin = Origin(object), |
284 : | feinerer | 826 | Heading = Heading(object), Language = Language(object), |
285 : | LocalMetaData = LocalMetaData(object)) | ||
286 : | feinerer | 775 | }) |
287 : | feinerer | 49 | |
288 : | feinerer | 854 | setGeneric("tmFilter", function(object, ..., FUN = searchFullText, doclevel = TRUE) standardGeneric("tmFilter")) |
289 : | feinerer | 698 | setMethod("tmFilter", |
290 : | feinerer | 816 | signature(object = "Corpus"), |
291 : | feinerer | 854 | function(object, ..., FUN = searchFullText, doclevel = TRUE) { |
292 : | if (!is.null(attr(FUN, "doclevel"))) | ||
293 : | doclevel <- attr(FUN, "doclevel") | ||
294 : | feinerer | 861 | if (doclevel) { |
295 : | if (clusterAvailable()) | ||
296 : | feinerer | 870 | return(object[snow::parSapply(snow::getMPIcluster(), object, FUN, ..., DMetaData = DMetaData(object))]) |
297 : | feinerer | 861 | else |
298 : | return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))]) | ||
299 : | } | ||
300 : | feinerer | 73 | else |
301 : | feinerer | 724 | return(object[FUN(object, ...)]) |
302 : | feinerer | 61 | }) |
303 : | |||
304 : | feinerer | 854 | setGeneric("tmIndex", function(object, ..., FUN = searchFullText, doclevel = TRUE) standardGeneric("tmIndex")) |
305 : | feinerer | 698 | setMethod("tmIndex", |
306 : | feinerer | 816 | signature(object = "Corpus"), |
307 : | feinerer | 854 | function(object, ..., FUN = searchFullText, doclevel = TRUE) { |
308 : | if (!is.null(attr(FUN, "doclevel"))) | ||
309 : | doclevel <- attr(FUN, "doclevel") | ||
310 : | feinerer | 861 | if (doclevel) { |
311 : | if (clusterAvailable()) | ||
312 : | feinerer | 870 | return(snow::parSapply(snow::getMPIcluster(), object, FUN, ..., DMetaData = DMetaData(object))) |
313 : | feinerer | 861 | else |
314 : | return(sapply(object, FUN, ..., DMetaData = DMetaData(object))) | ||
315 : | } | ||
316 : | feinerer | 73 | else |
317 : | feinerer | 724 | return(FUN(object, ...)) |
318 : | feinerer | 49 | }) |
319 : | |||
320 : | feinerer | 698 | setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem")) |
321 : | setMethod("appendElem", | ||
322 : | feinerer | 816 | signature(object = "Corpus", data = "TextDocument"), |
323 : | feinerer | 72 | function(object, data, meta = NULL) { |
324 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
325 : | feinerer | 712 | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) |
326 : | if (dbExists(db, ID(data))) | ||
327 : | warning("document with identical ID already exists") | ||
328 : | dbInsert(db, ID(data), data) | ||
329 : | object@.Data[[length(object)+1]] <- ID(data) | ||
330 : | } | ||
331 : | else | ||
332 : | object@.Data[[length(object)+1]] <- data | ||
333 : | DMetaData(object) <- rbind(DMetaData(object), c(MetaID = CMetaData(object)@NodeID, meta)) | ||
334 : | feinerer | 52 | return(object) |
335 : | }) | ||
336 : | |||
337 : | feinerer | 698 | setGeneric("appendMeta", function(object, cmeta = NULL, dmeta = NULL) standardGeneric("appendMeta")) |
338 : | setMethod("appendMeta", | ||
339 : | feinerer | 816 | signature(object = "Corpus"), |
340 : | feinerer | 698 | function(object, cmeta = NULL, dmeta = NULL) { |
341 : | feinerer | 712 | object@CMetaData@MetaData <- c(CMetaData(object)@MetaData, cmeta) |
342 : | if (!is.null(dmeta)) { | ||
343 : | feinerer | 719 | DMetaData(object) <- cbind(DMetaData(object), eval(dmeta)) |
344 : | feinerer | 712 | } |
345 : | feinerer | 52 | return(object) |
346 : | }) | ||
347 : | feinerer | 53 | |
348 : | feinerer | 698 | setGeneric("removeMeta", function(object, cname = NULL, dname = NULL) standardGeneric("removeMeta")) |
349 : | setMethod("removeMeta", | ||
350 : | feinerer | 816 | signature(object = "Corpus"), |
351 : | feinerer | 698 | function(object, cname = NULL, dname = NULL) { |
352 : | feinerer | 729 | if (!is.null(cname)) |
353 : | feinerer | 698 | object@CMetaData@MetaData <- CMetaData(object)@MetaData[names(CMetaData(object)@MetaData) != cname] |
354 : | feinerer | 729 | if (!is.null(dname)) |
355 : | DMetaData(object) <- DMetaData(object)[, names(DMetaData(object)) != dname, drop = FALSE] | ||
356 : | feinerer | 77 | return(object) |
357 : | }) | ||
358 : | feinerer | 69 | |
359 : | feinerer | 698 | setGeneric("prescindMeta", function(object, meta) standardGeneric("prescindMeta")) |
360 : | setMethod("prescindMeta", | ||
361 : | feinerer | 816 | signature(object = "Corpus", meta = "character"), |
362 : | feinerer | 73 | function(object, meta) { |
363 : | for (m in meta) { | ||
364 : | feinerer | 719 | if (m %in% c("Author", "DateTimeStamp", "Description", "ID", "Origin", "Heading", "Language")) { |
365 : | feinerer | 75 | local.m <- lapply(object, m) |
366 : | feinerer | 853 | local.m <- sapply(local.m, paste, collapse = " ") |
367 : | feinerer | 75 | local.m <- lapply(local.m, function(x) if (is.null(x)) return(NA) else return(x)) |
368 : | feinerer | 73 | local.m <- unlist(local.m) |
369 : | feinerer | 721 | DMetaData(object) <- cbind(DMetaData(object), data.frame(m = local.m, stringsAsFactors = FALSE)) |
370 : | names(DMetaData(object))[which(names(DMetaData(object)) == "m")] <- m | ||
371 : | feinerer | 75 | } |
372 : | else { | ||
373 : | local.meta <- lapply(object, LocalMetaData) | ||
374 : | local.m <- lapply(local.meta, "[[", m) | ||
375 : | local.m <- lapply(local.m, function(x) if (is.null(x)) return(NA) else return(x)) | ||
376 : | if (length(local.m) == length(unlist(local.m))) | ||
377 : | local.m <- unlist(local.m) | ||
378 : | else | ||
379 : | local.m <- I(local.m) | ||
380 : | feinerer | 721 | DMetaData(object) <- cbind(DMetaData(object), data.frame(m = local.m, stringsAsFactors = FALSE)) |
381 : | names(DMetaData(object))[which(names(DMetaData(object)) == "m")] <- m | ||
382 : | feinerer | 75 | } |
383 : | feinerer | 73 | } |
384 : | return(object) | ||
385 : | }) | ||
386 : | |||
387 : | feinerer | 53 | setMethod("[", |
388 : | feinerer | 816 | signature(x = "Corpus", i = "ANY", j = "ANY", drop = "ANY"), |
389 : | feinerer | 53 | function(x, i, j, ... , drop) { |
390 : | if(missing(i)) | ||
391 : | return(x) | ||
392 : | |||
393 : | object <- x | ||
394 : | object@.Data <- x@.Data[i, ..., drop = FALSE] | ||
395 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
396 : | feinerer | 727 | index <- object@DMetaData[[1 , "subset"]] |
397 : | if (any(is.na(index))) | ||
398 : | object@DMetaData[[1 , "subset"]] <- i | ||
399 : | feinerer | 724 | else |
400 : | feinerer | 727 | object@DMetaData[[1 , "subset"]] <- index[i] |
401 : | feinerer | 724 | } |
402 : | feinerer | 729 | else |
403 : | DMetaData(object) <- DMetaData(x)[i, , drop = FALSE] | ||
404 : | feinerer | 53 | return(object) |
405 : | }) | ||
406 : | |||
407 : | feinerer | 63 | setMethod("[<-", |
408 : | feinerer | 816 | signature(x = "Corpus", i = "ANY", j = "ANY", value = "ANY"), |
409 : | feinerer | 63 | function(x, i, j, ... , value) { |
410 : | object <- x | ||
411 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
412 : | feinerer | 724 | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) |
413 : | counter <- 1 | ||
414 : | for (id in object@.Data[i, ...]) { | ||
415 : | if (length(value) == 1) | ||
416 : | db[[id]] <- value | ||
417 : | else { | ||
418 : | db[[id]] <- value[[counter]] | ||
419 : | } | ||
420 : | counter <- counter + 1 | ||
421 : | } | ||
422 : | } | ||
423 : | else | ||
424 : | object@.Data[i, ...] <- value | ||
425 : | feinerer | 63 | return(object) |
426 : | }) | ||
427 : | |||
428 : | setMethod("[[", | ||
429 : | feinerer | 816 | signature(x = "Corpus", i = "ANY", j = "ANY"), |
430 : | feinerer | 63 | function(x, i, j, ...) { |
431 : | feinerer | 886 | if (DBControl(x)[["useDb"]] && require("filehash")) { |
432 : | feinerer | 712 | db <- dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]]) |
433 : | result <- dbFetch(db, x@.Data[[i]]) | ||
434 : | return(loadDoc(result)) | ||
435 : | } | ||
436 : | feinerer | 829 | else { |
437 : | feinerer | 830 | lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus") |
438 : | if (!is.null(lazyTmMap)) | ||
439 : | .Call("copyCorpus", x, materialize(x, i)) | ||
440 : | feinerer | 712 | return(loadDoc(x@.Data[[i]])) |
441 : | feinerer | 829 | } |
442 : | feinerer | 63 | }) |
443 : | |||
444 : | setMethod("[[<-", | ||
445 : | feinerer | 816 | signature(x = "Corpus", i = "ANY", j = "ANY", value = "ANY"), |
446 : | feinerer | 63 | function(x, i, j, ..., value) { |
447 : | object <- x | ||
448 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
449 : | feinerer | 712 | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) |
450 : | index <- object@.Data[[i]] | ||
451 : | db[[index]] <- value | ||
452 : | } | ||
453 : | feinerer | 830 | else { |
454 : | # Mark new objects as not active for lazy mapping | ||
455 : | lazyTmMap <- meta(object, tag = "lazyTmMap", type = "corpus") | ||
456 : | feinerer | 831 | if (!is.null(lazyTmMap)) { |
457 : | feinerer | 830 | lazyTmMap$index[i] <- FALSE |
458 : | feinerer | 831 | meta(object, tag = "lazyTmMap", type = "corpus") <- lazyTmMap |
459 : | } | ||
460 : | feinerer | 830 | # Set the value |
461 : | feinerer | 712 | object@.Data[[i, ...]] <- value |
462 : | feinerer | 830 | } |
463 : | feinerer | 63 | return(object) |
464 : | }) | ||
465 : | |||
466 : | feinerer | 698 | # Update \code{NodeID}s of a CMetaData tree |
467 : | feinerer | 697 | update_id <- function(object, id = 0, mapping = NULL, left.mapping = NULL, level = 0) { |
468 : | feinerer | 698 | # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s |
469 : | feinerer | 697 | set_id <- function(object) { |
470 : | object@NodeID <- id | ||
471 : | id <<- id + 1 | ||
472 : | level <<- level + 1 | ||
473 : | feinerer | 71 | |
474 : | feinerer | 697 | if (length(object@children) > 0) { |
475 : | mapping <<- cbind(mapping, c(object@children[[1]]@NodeID, id)) | ||
476 : | left <- set_id(object@children[[1]]) | ||
477 : | if (level == 1) { | ||
478 : | left.mapping <<- mapping | ||
479 : | mapping <<- NULL | ||
480 : | } | ||
481 : | mapping <<- cbind(mapping, c(object@children[[2]]@NodeID, id)) | ||
482 : | right <- set_id(object@children[[2]]) | ||
483 : | feinerer | 71 | |
484 : | feinerer | 697 | object@children <- list(left, right) |
485 : | feinerer | 71 | } |
486 : | feinerer | 697 | level <<- level - 1 |
487 : | feinerer | 71 | |
488 : | feinerer | 697 | return(object) |
489 : | feinerer | 71 | } |
490 : | |||
491 : | feinerer | 697 | return(list(root = set_id(object), left.mapping = left.mapping, right.mapping = mapping)) |
492 : | feinerer | 71 | } |
493 : | |||
494 : | feinerer | 53 | setMethod("c", |
495 : | feinerer | 816 | signature(x = "Corpus"), |
496 : | feinerer | 689 | function(x, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) { |
497 : | args <- list(...) | ||
498 : | feinerer | 720 | if (length(args) == 0) |
499 : | feinerer | 689 | return(x) |
500 : | feinerer | 71 | |
501 : | feinerer | 816 | if (!all(sapply(args, inherits, "Corpus"))) |
502 : | feinerer | 894 | stop("not all arguments are corpora") |
503 : | feinerer | 886 | if (DBControl(x)[["useDb"]] || any(unlist(sapply(args, DBControl)["useDb", ]))) |
504 : | feinerer | 894 | stop("concatenating corpora with activated database is not supported") |
505 : | feinerer | 720 | |
506 : | feinerer | 900 | Reduce(c2, list(x, args)) |
507 : | feinerer | 689 | }) |
508 : | |||
509 : | setGeneric("c2", function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2")) | ||
510 : | setMethod("c2", | ||
511 : | feinerer | 816 | signature(x = "Corpus", y = "Corpus"), |
512 : | feinerer | 689 | function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) { |
513 : | feinerer | 71 | object <- x |
514 : | # Concatenate data slots | ||
515 : | object@.Data <- c(as(x, "list"), as(y, "list")) | ||
516 : | |||
517 : | feinerer | 720 | # Set the DBControl slot |
518 : | feinerer | 741 | object@DBControl <- list(useDb = FALSE, dbName = "", dbType = "DB1") |
519 : | feinerer | 720 | |
520 : | feinerer | 698 | # Update the CMetaData tree |
521 : | cmeta <- new("MetaDataNode", NodeID = 0, MetaData = meta, children = list(CMetaData(x), CMetaData(y))) | ||
522 : | update.struct <- update_id(cmeta) | ||
523 : | object@CMetaData <- update.struct$root | ||
524 : | feinerer | 71 | |
525 : | # Find indices to be updated for the left tree | ||
526 : | indices.mapping <- NULL | ||
527 : | for (m in levels(as.factor(DMetaData(x)$MetaID))) { | ||
528 : | indices <- (DMetaData(x)$MetaID == m) | ||
529 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
530 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
531 : | } | ||
532 : | |||
533 : | # Update the DMetaData data frames for the left tree | ||
534 : | for (i in 1:ncol(update.struct$left.mapping)) { | ||
535 : | map <- update.struct$left.mapping[,i] | ||
536 : | x@DMetaData$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
537 : | } | ||
538 : | |||
539 : | # Find indices to be updated for the right tree | ||
540 : | indices.mapping <- NULL | ||
541 : | for (m in levels(as.factor(DMetaData(y)$MetaID))) { | ||
542 : | indices <- (DMetaData(y)$MetaID == m) | ||
543 : | indices.mapping <- c(indices.mapping, list(m = indices)) | ||
544 : | names(indices.mapping)[length(indices.mapping)] <- m | ||
545 : | } | ||
546 : | |||
547 : | # Update the DMetaData data frames for the right tree | ||
548 : | for (i in 1:ncol(update.struct$right.mapping)) { | ||
549 : | map <- update.struct$right.mapping[,i] | ||
550 : | y@DMetaData$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2]) | ||
551 : | } | ||
552 : | |||
553 : | # Merge the DMetaData data frames | ||
554 : | labels <- setdiff(names(DMetaData(y)), names(DMetaData(x))) | ||
555 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels)) | ||
556 : | x.dmeta.aug <- cbind(DMetaData(x), na.matrix) | ||
557 : | labels <- setdiff(names(DMetaData(x)), names(DMetaData(y))) | ||
558 : | na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels)) | ||
559 : | y.dmeta.aug <- cbind(DMetaData(y), na.matrix) | ||
560 : | object@DMetaData <- rbind(x.dmeta.aug, y.dmeta.aug) | ||
561 : | |||
562 : | return(object) | ||
563 : | feinerer | 72 | }) |
564 : | feinerer | 689 | |
565 : | feinerer | 72 | setMethod("c", |
566 : | signature(x = "TextDocument"), | ||
567 : | function(x, ..., recursive = TRUE){ | ||
568 : | args <- list(...) | ||
569 : | if(length(args) == 0) | ||
570 : | return(x) | ||
571 : | feinerer | 54 | |
572 : | feinerer | 73 | dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE) |
573 : | feinerer | 698 | cmeta.node <- new("MetaDataNode", |
574 : | feinerer | 72 | NodeID = 0, |
575 : | feinerer | 689 | MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")), |
576 : | feinerer | 72 | children = list()) |
577 : | |||
578 : | feinerer | 816 | return(new("Corpus", |
579 : | feinerer | 720 | .Data = list(x, ...), |
580 : | DMetaData = dmeta.df, | ||
581 : | CMetaData = cmeta.node, | ||
582 : | feinerer | 741 | DBControl = list(useDb = FALSE, dbName = "", dbType = "DB1"))) |
583 : | feinerer | 72 | }) |
584 : | |||
585 : | feinerer | 54 | setMethod("length", |
586 : | feinerer | 816 | signature(x = "Corpus"), |
587 : | feinerer | 54 | function(x){ |
588 : | return(length(as(x, "list"))) | ||
589 : | }) | ||
590 : | |||
591 : | setMethod("show", | ||
592 : | feinerer | 816 | signature(object = "Corpus"), |
593 : | feinerer | 54 | function(object){ |
594 : | feinerer | 70 | cat(sprintf(ngettext(length(object), |
595 : | feinerer | 894 | "A corpus with %d text document\n", |
596 : | "A corpus with %d text documents\n"), | ||
597 : | feinerer | 70 | length(object))) |
598 : | feinerer | 54 | }) |
599 : | |||
600 : | setMethod("summary", | ||
601 : | feinerer | 816 | signature(object = "Corpus"), |
602 : | feinerer | 54 | function(object){ |
603 : | show(object) | ||
604 : | feinerer | 71 | if (length(DMetaData(object)) > 0) { |
605 : | feinerer | 698 | cat(sprintf(ngettext(length(CMetaData(object)@MetaData), |
606 : | feinerer | 77 | "\nThe metadata consists of %d tag-value pair and a data frame\n", |
607 : | "\nThe metadata consists of %d tag-value pairs and a data frame\n"), | ||
608 : | feinerer | 698 | length(CMetaData(object)@MetaData))) |
609 : | feinerer | 54 | cat("Available tags are:\n") |
610 : | feinerer | 722 | cat(strwrap(paste(names(CMetaData(object)@MetaData), collapse = " "), indent = 2, exdent = 2), "\n") |
611 : | feinerer | 77 | cat("Available variables in the data frame are:\n") |
612 : | feinerer | 722 | cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n") |
613 : | feinerer | 54 | } |
614 : | }) | ||
615 : | feinerer | 55 | |
616 : | setGeneric("inspect", function(object) standardGeneric("inspect")) | ||
617 : | setMethod("inspect", | ||
618 : | feinerer | 816 | signature("Corpus"), |
619 : | feinerer | 55 | function(object) { |
620 : | summary(object) | ||
621 : | cat("\n") | ||
622 : | feinerer | 886 | if (DBControl(object)[["useDb"]] && require("filehash")) { |
623 : | feinerer | 719 | db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]]) |
624 : | show(dbMultiFetch(db, unlist(object))) | ||
625 : | } | ||
626 : | else | ||
627 : | feinerer | 837 | print(noquote(lapply(object, identity))) |
628 : | feinerer | 55 | }) |
629 : | feinerer | 65 | |
630 : | # No metadata is checked | ||
631 : | setGeneric("%IN%", function(x, y) standardGeneric("%IN%")) | ||
632 : | setMethod("%IN%", | ||
633 : | feinerer | 816 | signature(x = "TextDocument", y = "Corpus"), |
634 : | feinerer | 65 | function(x, y) { |
635 : | feinerer | 886 | if (DBControl(y)[["useDb"]] && require("filehash")) { |
636 : | feinerer | 729 | db <- dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]]) |
637 : | feinerer | 816 | result <- any(sapply(y, function(x, z) {x %in% Content(z)}, x)) |
638 : | feinerer | 729 | } |
639 : | else | ||
640 : | result <- x %in% y | ||
641 : | return(result) | ||
642 : | feinerer | 65 | }) |
643 : | feinerer | 719 | |
644 : | setMethod("lapply", | ||
645 : | feinerer | 816 | signature(X = "Corpus"), |
646 : | feinerer | 719 | function(X, FUN, ...) { |
647 : | feinerer | 886 | if (DBControl(X)[["useDb"]] && require("filehash")) { |
648 : | feinerer | 719 | db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]]) |
649 : | result <- lapply(dbMultiFetch(db, unlist(X)), FUN, ...) | ||
650 : | } | ||
651 : | feinerer | 830 | else { |
652 : | lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus") | ||
653 : | if (!is.null(lazyTmMap)) | ||
654 : | .Call("copyCorpus", X, materialize(X)) | ||
655 : | feinerer | 719 | result <- base::lapply(X, FUN, ...) |
656 : | feinerer | 830 | } |
657 : | feinerer | 719 | return(result) |
658 : | }) | ||
659 : | |||
660 : | setMethod("sapply", | ||
661 : | feinerer | 816 | signature(X = "Corpus"), |
662 : | feinerer | 719 | function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) { |
663 : | feinerer | 886 | if (DBControl(X)[["useDb"]] && require("filehash")) { |
664 : | feinerer | 719 | db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]]) |
665 : | result <- sapply(dbMultiFetch(db, unlist(X)), FUN, ...) | ||
666 : | } | ||
667 : | feinerer | 830 | else { |
668 : | lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus") | ||
669 : | if (!is.null(lazyTmMap)) | ||
670 : | .Call("copyCorpus", X, materialize(X)) | ||
671 : | feinerer | 719 | result <- base::sapply(X, FUN, ...) |
672 : | feinerer | 830 | } |
673 : | feinerer | 719 | return(result) |
674 : | }) | ||
675 : | feinerer | 819 | |
676 : | feinerer | 836 | setAs("list", "Corpus", function(from) { |
677 : | cmeta.node <- new("MetaDataNode", | ||
678 : | NodeID = 0, | ||
679 : | MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")), | ||
680 : | children = list()) | ||
681 : | data <- list() | ||
682 : | counter <- 1 | ||
683 : | for (f in from) { | ||
684 : | doc <- new("PlainTextDocument", | ||
685 : | .Data = f, URI = NULL, Cached = TRUE, | ||
686 : | Author = "", DateTimeStamp = Sys.time(), | ||
687 : | Description = "", ID = as.character(counter), | ||
688 : | Origin = "", Heading = "", Language = "en_US") | ||
689 : | data <- c(data, list(doc)) | ||
690 : | counter <- counter + 1 | ||
691 : | } | ||
692 : | return(new("Corpus", .Data = data, | ||
693 : | DMetaData = data.frame(MetaID = rep(0, length(from)), stringsAsFactors = FALSE), | ||
694 : | CMetaData = cmeta.node, | ||
695 : | DBControl = dbControl <- list(useDb = FALSE, dbName = "", dbType = "DB1"))) | ||
696 : | }) | ||
697 : | |||
698 : | feinerer | 819 | setGeneric("writeCorpus", function(object, path = ".", filenames = NULL) standardGeneric("writeCorpus")) |
699 : | setMethod("writeCorpus", | ||
700 : | signature(object = "Corpus"), | ||
701 : | function(object, path = ".", filenames = NULL) { | ||
702 : | filenames <- file.path(path, | ||
703 : | if (is.null(filenames)) sapply(object, function(x) sprintf("%s.txt", ID(x))) | ||
704 : | else filenames) | ||
705 : | i <- 1 | ||
706 : | for (o in object) { | ||
707 : | feinerer | 859 | writeLines(asPlain(o), filenames[i]) |
708 : | feinerer | 819 | i <- i + 1 |
709 : | } | ||
710 : | }) |
R-Forge@R-project.org | ViewVC Help |
Powered by ViewVC 1.0.0 |