SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 986 - (view) (download)

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 : feinerer 958 prepareReader <- function(readerControl, defaultReader = NULL, ...) {
4 :     if (is.null(readerControl$reader))
5 :     readerControl$reader <- defaultReader
6 : feinerer 985 if (inherits(readerControl$reader, "FunctionGenerator"))
7 : feinerer 958 readerControl$reader <- readerControl$reader(...)
8 : feinerer 950 if (is.null(readerControl$language))
9 :     readerControl$language <- "eng"
10 : feinerer 958 readerControl
11 :     }
12 : feinerer 950
13 : feinerer 985 # Node ID, actual meta data, and possibly other nodes as children
14 : feinerer 986 .MetaDataNode <- function(nodeid = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {
15 :     structure(list(NodeID = nodeid, MetaData = meta, Children = children),
16 :     class = "MetaDataNode")
17 : feinerer 985 }
18 : feinerer 958
19 : feinerer 985 print.MetaDataNode <- function(x, ...)
20 : feinerer 986 print(x$MetaData)
21 : feinerer 985
22 :     .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
23 :     attr(x, "CMetaData") <- cmeta
24 :     attr(x, "DMetaData") <- dmeta
25 :     attr(x, "DBControl") <- dbcontrol
26 :     class(x) <- c("PCorpus", "Corpus", "list")
27 :     x
28 :     }
29 :    
30 :     PCorpus <- function(x,
31 :     readerControl = list(reader = x$DefaultReader, language = "eng"),
32 : feinerer 946 dbControl = list(dbName = "", dbType = "DB1"),
33 :     ...) {
34 : feinerer 985 readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
35 : feinerer 63
36 : feinerer 946 if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
37 :     stop("error in creating database")
38 :     db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
39 : feinerer 712
40 : feinerer 946 # Allocate memory in advance if length is known
41 : feinerer 985 tdl <- if (x$Length > 0)
42 :     vector("list", as.integer(x$Length))
43 : feinerer 946 else
44 :     list()
45 : feinerer 869
46 : feinerer 946 counter <- 1
47 : feinerer 985 while (!eoi(x)) {
48 :     x <- stepNext(x)
49 :     elem <- getElem(x)
50 : feinerer 946 doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
51 :     filehash::dbInsert(db, ID(doc), doc)
52 : feinerer 985 if (x$Length > 0) tdl[[counter]] <- ID(doc)
53 : feinerer 946 else tdl <- c(tdl, ID(doc))
54 :     counter <- counter + 1
55 :     }
56 : feinerer 63
57 : feinerer 946 df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
58 :     filehash::dbInsert(db, "DMetaData", df)
59 :     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
60 : feinerer 712
61 : feinerer 985 .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
62 :     }
63 : feinerer 71
64 : feinerer 985 .VCorpus <- function(x, cmeta, dmeta) {
65 :     attr(x, "CMetaData") <- cmeta
66 :     attr(x, "DMetaData") <- dmeta
67 :     class(x) <- c("VCorpus", "Corpus", "list")
68 :     x
69 : feinerer 946 }
70 : feinerer 21
71 : feinerer 946 # The "..." are additional arguments for the FunctionGenerator reader
72 : feinerer 985 VCorpus <- Corpus <- function(x,
73 :     readerControl = list(reader = x$DefaultReader, language = "eng"),
74 : feinerer 946 ...) {
75 : feinerer 985 readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
76 : feinerer 49
77 : feinerer 946 # Allocate memory in advance if length is known
78 : feinerer 985 tdl <- if (x$Length > 0)
79 :     vector("list", as.integer(x$Length))
80 : feinerer 946 else
81 :     list()
82 : feinerer 72
83 : feinerer 985 if (x$Vectorized)
84 : feinerer 986 mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
85 :     pGetElem(x),
86 :     id = as.character(seq_len(x$Length)),
87 :     SIMPLIFY = FALSE)
88 : feinerer 946 else {
89 :     counter <- 1
90 : feinerer 985 while (!eoi(x)) {
91 :     x <- stepNext(x)
92 :     elem <- getElem(x)
93 : feinerer 946 doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
94 : feinerer 985 if (x$Length > 0)
95 : feinerer 946 tdl[[counter]] <- doc
96 :     else
97 :     tdl <- c(tdl, list(doc))
98 :     counter <- counter + 1
99 :     }
100 :     }
101 : feinerer 72
102 : feinerer 946 df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
103 : feinerer 985 .VCorpus(tdl, .MetaDataNode(), df)
104 :     }
105 : feinerer 72
106 : feinerer 985 `[.PCorpus` <- function(x, i) {
107 :     if (missing(i)) return(x)
108 :     cmeta <- CMetaData(x)
109 :     index <- attr(x, "DMetaData")[[1 , "subset"]]
110 :     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111 :     dmeta <- attr(x, "DMetaData")
112 :     dbcontrol <- DBControl(x)
113 :     class(x) <- "list"
114 :     .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)
115 : feinerer 946 }
116 : feinerer 72
117 : feinerer 985 `[.VCorpus` <- function(x, i) {
118 :     if (missing(i)) return(x)
119 :     cmeta <- CMetaData(x)
120 :     dmeta <- DMetaData(x)[i, , drop = FALSE]
121 :     class(x) <- "list"
122 :     .VCorpus(x[i, drop = FALSE], cmeta, dmeta)
123 :     }
124 : feinerer 49
125 : feinerer 985 `[<-.PCorpus` <- function(x, i, value) {
126 :     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
127 :     counter <- 1
128 :     for (id in unclass(x)[i]) {
129 :     if (identical(length(value), 1)) db[[id]] <- value
130 :     else db[[id]] <- value[[counter]]
131 :     counter <- counter + 1
132 : feinerer 829 }
133 : feinerer 985 x
134 : feinerer 828 }
135 :    
136 : feinerer 985 `[[.PCorpus` <- function(x, i) {
137 :     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
138 :     class(x) <- "list"
139 :     filehash::dbFetch(db, x[[i]])
140 :     }
141 :     `[[.VCorpus` <- function(x, i) {
142 :     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
143 :     if (!is.null(lazyTmMap))
144 :     .Call("copyCorpus", x, materialize(x, i))
145 :     class(x) <- "list"
146 :     x[[i]]
147 :     }
148 : feinerer 886
149 : feinerer 985 `[[<-.PCorpus` <- function(x, i, value) {
150 :     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
151 :     index <- unclass(x)[[i]]
152 :     db[[index]] <- value
153 :     x
154 : feinerer 946 }
155 : feinerer 985 `[[<-.VCorpus` <- function(x, i, value) {
156 :     # Mark new objects as not active for lazy mapping
157 :     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
158 :     if (!is.null(lazyTmMap)) {
159 :     lazyTmMap$index[i] <- FALSE
160 :     meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
161 :     }
162 :     # Set the value
163 :     cl <- class(x)
164 :     class(x) <- "list"
165 :     x[[i]] <- value
166 :     class(x) <- cl
167 :     x
168 :     }
169 : feinerer 946
170 : feinerer 698 # Update \code{NodeID}s of a CMetaData tree
171 : feinerer 985 update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
172 : feinerer 698 # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s
173 : feinerer 985 set_id <- function(x) {
174 :     attrs <- attributes(x)
175 :     x <- id
176 :     attributes(x) <- attrs
177 : feinerer 697 id <<- id + 1
178 :     level <<- level + 1
179 : feinerer 985 if (length(attr(x, "Children")) > 0) {
180 :     mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))
181 :     left <- set_id(attr(x, "Children")[[1]])
182 : feinerer 697 if (level == 1) {
183 :     left.mapping <<- mapping
184 :     mapping <<- NULL
185 :     }
186 : feinerer 985 mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))
187 :     right <- set_id(attr(x, "Children")[[2]])
188 : feinerer 71
189 : feinerer 985 attr(x, "Children") <- list(left, right)
190 : feinerer 71 }
191 : feinerer 697 level <<- level - 1
192 : feinerer 985 x
193 : feinerer 71 }
194 :    
195 : feinerer 985 list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
196 : feinerer 71 }
197 :    
198 : feinerer 985 c2 <- function(x, y, ...) {
199 :     # Update the CMetaData tree
200 :     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
201 :     update.struct <- update_id(cmeta)
202 : feinerer 71
203 : feinerer 985 new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
204 : feinerer 720
205 : feinerer 985 # Find indices to be updated for the left tree
206 :     indices.mapping <- NULL
207 :     for (m in levels(as.factor(DMetaData(x)$MetaID))) {
208 :     indices <- (DMetaData(x)$MetaID == m)
209 :     indices.mapping <- c(indices.mapping, list(m = indices))
210 :     names(indices.mapping)[length(indices.mapping)] <- m
211 :     }
212 : feinerer 958
213 : feinerer 985 # Update the DMetaData data frames for the left tree
214 :     for (i in 1:ncol(update.struct$left.mapping)) {
215 :     map <- update.struct$left.mapping[,i]
216 :     DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
217 :     }
218 : feinerer 689
219 : feinerer 985 # Find indices to be updated for the right tree
220 :     indices.mapping <- NULL
221 :     for (m in levels(as.factor(DMetaData(y)$MetaID))) {
222 :     indices <- (DMetaData(y)$MetaID == m)
223 :     indices.mapping <- c(indices.mapping, list(m = indices))
224 :     names(indices.mapping)[length(indices.mapping)] <- m
225 :     }
226 : feinerer 71
227 : feinerer 985 # Update the DMetaData data frames for the right tree
228 :     for (i in 1:ncol(update.struct$right.mapping)) {
229 :     map <- update.struct$right.mapping[,i]
230 :     DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
231 :     }
232 : feinerer 71
233 : feinerer 985 # Merge the DMetaData data frames
234 :     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
235 :     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
236 :     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
237 :     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
238 :     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
239 :     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
240 :     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
241 : feinerer 71
242 : feinerer 985 new
243 :     }
244 : feinerer 71
245 : feinerer 985 c.Corpus <-
246 :     function(x, ..., recursive = FALSE)
247 :     {
248 :     args <- list(...)
249 : feinerer 71
250 : feinerer 985 if (identical(length(args), 0))
251 :     return(x)
252 : feinerer 71
253 : feinerer 985 if (!all(unlist(lapply(args, inherits, class(x)))))
254 :     stop("not all arguments are of the same corpus type")
255 : feinerer 71
256 : feinerer 985 if (inherits(x, "PCorpus"))
257 :     stop("concatenation of corpora with underlying databases is not supported")
258 : feinerer 689
259 : feinerer 985 Reduce(c2, base::c(list(x), args))
260 :     }
261 : feinerer 54
262 : feinerer 985 c.TextDocument <- function(x, ..., recursive = FALSE) {
263 :     args <- list(...)
264 : feinerer 72
265 : feinerer 985 if (identical(length(args), 0))
266 :     return(x)
267 : feinerer 72
268 : feinerer 985 if (!all(unlist(lapply(args, inherits, class(x)))))
269 :     stop("not all arguments are text documents")
270 : feinerer 54
271 : feinerer 985 dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
272 :     .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
273 :     }
274 : feinerer 55
275 : feinerer 985 print.Corpus <- function(x, ...) {
276 :     cat(sprintf(ngettext(length(x),
277 :     "A corpus with %d text document\n",
278 :     "A corpus with %d text documents\n"),
279 :     length(x)))
280 :     invisible(x)
281 :     }
282 :    
283 :     summary.Corpus <- function(x, ...) {
284 :     print(x)
285 :     if (length(DMetaData(x)) > 0) {
286 :     cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),
287 :     "\nThe metadata consists of %d tag-value pair and a data frame\n",
288 :     "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
289 :     length(attr(CMetaData(x), "MetaData"))))
290 :     cat("Available tags are:\n")
291 :     cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")
292 :     cat("Available variables in the data frame are:\n")
293 :     cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")
294 :     }
295 :     }
296 :    
297 : feinerer 938 inspect <- function(x) UseMethod("inspect", x)
298 : feinerer 946 inspect.PCorpus <- function(x) {
299 : feinerer 938 summary(x)
300 :     cat("\n")
301 : feinerer 946 db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
302 :     show(filehash::dbMultiFetch(db, unlist(x)))
303 : feinerer 938 }
304 : feinerer 963 inspect.VCorpus <- function(x) {
305 : feinerer 946 summary(x)
306 :     cat("\n")
307 :     print(noquote(lapply(x, identity)))
308 :     }
309 : feinerer 65
310 :     # No metadata is checked
311 : feinerer 985 `%IN%` <- function(x, y) UseMethod("%IN%", y)
312 :     `%IN%.PCorpus` <- function(x, y) {
313 :     db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])
314 :     any(unlist(lapply(y, function(x, z) {x %in% Content(z)}, x)))
315 :     }
316 :     `%IN%.VCorpus` <- function(x, y) x %in% y
317 : feinerer 719
318 : feinerer 985 lapply.PCorpus <- function(X, FUN, ...) {
319 :     db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
320 :     lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
321 :     }
322 :     lapply.VCorpus <- function(X, FUN, ...) {
323 :     lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
324 :     if (!is.null(lazyTmMap))
325 :     .Call("copyCorpus", X, materialize(X))
326 :     base::lapply(X, FUN, ...)
327 :     }
328 : feinerer 719
329 : feinerer 985 writeCorpus <- function(x, path = ".", filenames = NULL) {
330 :     filenames <- file.path(path,
331 :     if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
332 :     else filenames)
333 :     i <- 1
334 :     for (o in x) {
335 :     writeLines(as.PlainTextDocument(o), filenames[i])
336 :     i <- i + 1
337 : feinerer 836 }
338 : feinerer 985 }

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge