SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1203, Fri Jan 11 19:43:37 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <-
4    function(x, cmeta, dmeta, dbcontrol)
5    {
6      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
7      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
8      attr(x, "DBControl") <- dbcontrol      attr(x, "DBControl") <- dbcontrol
9      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
10      x      x
11  }  }
 DBControl <- function(x) attr(x, "DBControl")  
12    
13  PCorpus <- function(x,  DBControl <-
14    function(x)
15        attr(x, "DBControl")
16    
17    PCorpus <-
18    function(x,
19                      readerControl = list(reader = x$DefaultReader, language = "en"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
20                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
21                      ...) {           ...)
22    {
23      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24    
25        if (is.function(readerControl$init))
26            readerControl$init()
27    
28        if (is.function(readerControl$exit))
29            on.exit(readerControl$exit())
30    
31      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
32          stop("error in creating database")          stop("error in creating database")
33      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
# Line 44  Line 57 
57      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58  }  }
59    
60  .VCorpus <- function(x, cmeta, dmeta) {  .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
64      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
65      class(x) <- c("VCorpus", "Corpus", "list")      class(x) <- c("VCorpus", "Corpus", "list")
# Line 58  Line 73 
73  setOldClass(c("VCorpus", "Corpus", "list"))  setOldClass(c("VCorpus", "Corpus", "list"))
74    
75  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
76  VCorpus <- Corpus <- function(x,  VCorpus <-
77    Corpus <-
78    function(x,
79                                readerControl = list(reader = x$DefaultReader, language = "en"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
80                                ...) {           ...)
81    {
82      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
83    
84        if (is.function(readerControl$init))
85            readerControl$init()
86    
87        if (is.function(readerControl$exit))
88            on.exit(readerControl$exit())
89    
90      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
91      tdl <- if (x$Length > 0)      tdl <- if (x$Length > 0)
92          vector("list", as.integer(x$Length))          vector("list", as.integer(x$Length))
# Line 92  Line 116 
116      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
117  }  }
118    
119  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <-
120    function(x, i)
121    {
122      if (missing(i)) return(x)      if (missing(i)) return(x)
123      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
124      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
# Line 100  Line 126 
126      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
127  }  }
128    
129  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <-
130    function(x, i)
131    {
132      if (missing(i)) return(x)      if (missing(i)) return(x)
133      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
134  }  }
135    
136  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <-
137    function(x, i, value)
138    {
139      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
140      counter <- 1      counter <- 1
141      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
# Line 116  Line 146 
146      x      x
147  }  }
148    
149  .map_name_index <- function(x, i) {  .map_name_index <-
150    function(x, i)
151    {
152      if (is.character(i)) {      if (is.character(i)) {
153          if (is.null(names(x)))          if (is.null(names(x)))
154              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "ID", type = "local"))
# Line 126  Line 158 
158      i      i
159  }  }
160    
161  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
162    function(x, i)
163    {
164      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
165      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
166      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
167  }  }
168  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
169    function(x, i)
170    {
171      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
172      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
173      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
# Line 139  Line 175 
175      NextMethod("[[")      NextMethod("[[")
176  }  }
177    
178  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
179    function(x, i, value)
180    {
181      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
182      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
183      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
184      db[[index]] <- value      db[[index]] <- value
185      x      x
186  }  }
187  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
188    function(x, i, value)
189    {
190      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
191      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
192      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 162  Line 202 
202  }  }
203    
204  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
205  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
206    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
207    {
208      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
209      set_id <- function(x) {      set_id <- function(x) {
210          x$NodeID <- id          x$NodeID <- id
# Line 187  Line 229 
229  }  }
230    
231  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
232  .find_indices <- function(x) {  .find_indices <-
233    function(x)
234    {
235      indices.mapping <- NULL      indices.mapping <- NULL
236      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(DMetaData(x)$MetaID))) {
237          indices <- (DMetaData(x)$MetaID == m)          indices <- (DMetaData(x)$MetaID == m)
# Line 197  Line 241 
241      indices.mapping      indices.mapping
242  }  }
243    
244  c2 <- function(x, y, ...) {  c2 <-
245    function(x, y, ...)
246    {
247      # Update the CMetaData tree      # Update the CMetaData tree
248      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
249      update.struct <- .update_id(cmeta)      update.struct <- .update_id(cmeta)
# Line 224  Line 270 
270    
271      # Merge the DMetaData data frames      # Merge the DMetaData data frames
272      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
273      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      na.matrix <- matrix(NA,
274                            nrow = nrow(DMetaData(x)),
275                            ncol = length(labels),
276                            dimnames = list(row.names(DMetaData(x)), labels))
277      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
278      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
279      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      na.matrix <- matrix(NA,
280                            nrow = nrow(DMetaData(y)),
281                            ncol = length(labels),
282                            dimnames = list(row.names(DMetaData(y)), labels))
283      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
284      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
285    
# Line 235  Line 287 
287  }  }
288    
289  c.Corpus <-  c.Corpus <-
290  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
291  {  {
292      args <- list(...)      args <- list(...)
293        x <- args[[1L]]
294    
295      if (identical(length(args), 0L))      if(length(args) == 1L)
296          return(x)          return(x)
297    
298      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 248  Line 301 
301      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
302          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
303    
     l <- base::c(list(x), args)  
304      if (recursive)      if (recursive)
305          Reduce(c2, l)          Reduce(c2, args)
306      else {      else {
307          l <- do.call("c", lapply(l, unclass))          args <- do.call("c", lapply(args, unclass))
308          .VCorpus(l,          .VCorpus(args,
309                   cmeta = .MetaDataNode(),                   cmeta = .MetaDataNode(),
310                   dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))                   dmeta = data.frame(MetaID = rep(0, length(args)),
311                                        stringsAsFactors = FALSE))
312      }      }
313  }  }
314    
315  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
316    function(..., recursive = FALSE)
317    {
318      args <- list(...)      args <- list(...)
319        x <- args[[1L]]
320    
321      if (identical(length(args), 0L))      if(length(args) == 1L)
322          return(x)          return(x)
323    
324      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
325          stop("not all arguments are text documents")          stop("not all arguments are text documents")
326    
327      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      dmeta <- data.frame(MetaID = rep(0, length(args)),
328      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                          stringsAsFactors = FALSE)
329        .VCorpus(args, .MetaDataNode(), dmeta)
330  }  }
331    
332  print.Corpus <- function(x, ...) {  print.Corpus <-
333    function(x, ...)
334    {
335      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
336                           "A corpus with %d text document\n",                           "A corpus with %d text document\n",
337                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n"),
# Line 280  Line 339 
339      invisible(x)      invisible(x)
340  }  }
341    
342  summary.Corpus <- function(object, ...) {  summary.Corpus <-
343    function(object, ...)
344    {
345      print(object)      print(object)
346      if (length(DMetaData(object)) > 0) {      if (length(DMetaData(object)) > 0) {
347          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
# Line 294  Line 355 
355      }      }
356  }  }
357    
358  inspect <- function(x) UseMethod("inspect", x)  inspect <-
359  inspect.PCorpus <- function(x) {  function(x)
360        UseMethod("inspect", x)
361    inspect.PCorpus <-
362    function(x)
363    {
364      summary(x)      summary(x)
365      cat("\n")      cat("\n")
366      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
367      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
368  }  }
369  inspect.VCorpus <- function(x) {  inspect.VCorpus <-
370    function(x)
371    {
372      summary(x)      summary(x)
373      cat("\n")      cat("\n")
374      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
375  }  }
376    
377  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <-
378    function(X, FUN, ...)
379    {
380      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
381      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
382  }  }
383  lapply.VCorpus <- function(X, FUN, ...) {  lapply.VCorpus <-
384    function(X, FUN, ...)
385    {
386      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
387      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
388          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
389      base::lapply(X, FUN, ...)      base::lapply(X, FUN, ...)
390  }  }
391    
392  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
393    function(x, path = ".", filenames = NULL)
394    {
395      filenames <- file.path(path,      filenames <- file.path(path,
396                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
397                             else filenames)                             else filenames)

Legend:
Removed from v.1108  
changed lines
  Added in v.1203

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge