SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1242, Mon Aug 19 05:33:57 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <-
4    function(x, cmeta, dmeta, dbcontrol)
5    {
6      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
7      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
8      attr(x, "DBControl") <- dbcontrol      attr(x, "DBControl") <- dbcontrol
9      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
10      x      x
11  }  }
 DBControl <- function(x) attr(x, "DBControl")  
12    
13  PCorpus <- function(x,  DBControl <-
14    function(x)
15        attr(x, "DBControl")
16    
17    PCorpus <-
18    function(x,
19                      readerControl = list(reader = x$DefaultReader, language = "en"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
20                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
21                      ...) {           ...)
22    {
23      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24    
25      if (is.function(readerControl$init))      if (is.function(readerControl$init))
# Line 50  Line 57 
57      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58  }  }
59    
60  .VCorpus <- function(x, cmeta, dmeta) {  .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
64      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
65      class(x) <- c("VCorpus", "Corpus", "list")      class(x) <- c("VCorpus", "Corpus", "list")
66      x      x
67  }  }
68    
 # Register S3 corpus classes to be recognized by S4 methods. This is  
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
69  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
70  VCorpus <- Corpus <- function(x,  VCorpus <-
71    Corpus <-
72    function(x,
73                                readerControl = list(reader = x$DefaultReader, language = "en"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
74                                ...) {           ...)
75    {
76      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
77    
78      if (is.function(readerControl$init))      if (is.function(readerControl$init))
# Line 104  Line 110 
110      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
111  }  }
112    
113  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <-
114    function(x, i)
115    {
116      if (missing(i)) return(x)      if (missing(i)) return(x)
117      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
118      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
# Line 112  Line 120 
120      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
121  }  }
122    
123  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <-
124    function(x, i)
125    {
126      if (missing(i)) return(x)      if (missing(i)) return(x)
127      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
128  }  }
129    
130  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <-
131    function(x, i, value)
132    {
133      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
134      counter <- 1      counter <- 1
135      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
# Line 128  Line 140 
140      x      x
141  }  }
142    
143  .map_name_index <- function(x, i) {  .map_name_index <-
144    function(x, i)
145    {
146      if (is.character(i)) {      if (is.character(i)) {
147          if (is.null(names(x)))          if (is.null(names(x)))
148              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "ID", type = "local"))
# Line 138  Line 152 
152      i      i
153  }  }
154    
155  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
156    function(x, i)
157    {
158      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
159      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
160      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
161  }  }
162  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
163    function(x, i)
164    {
165      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
166      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
167      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
# Line 151  Line 169 
169      NextMethod("[[")      NextMethod("[[")
170  }  }
171    
172  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
173    function(x, i, value)
174    {
175      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
176      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
177      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
178      db[[index]] <- value      db[[index]] <- value
179      x      x
180  }  }
181  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
182    function(x, i, value)
183    {
184      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
185      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
186      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 174  Line 196 
196  }  }
197    
198  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
199  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
200    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
201    {
202      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
203      set_id <- function(x) {      set_id <- function(x) {
204          x$NodeID <- id          x$NodeID <- id
# Line 199  Line 223 
223  }  }
224    
225  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
226  .find_indices <- function(x) {  .find_indices <-
227    function(x)
228    {
229      indices.mapping <- NULL      indices.mapping <- NULL
230      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(DMetaData(x)$MetaID))) {
231          indices <- (DMetaData(x)$MetaID == m)          indices <- (DMetaData(x)$MetaID == m)
# Line 209  Line 235 
235      indices.mapping      indices.mapping
236  }  }
237    
238  c2 <- function(x, y, ...) {  c2 <-
239    function(x, y, ...)
240    {
241      # Update the CMetaData tree      # Update the CMetaData tree
242      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
243      update.struct <- .update_id(cmeta)      update.struct <- .update_id(cmeta)
# Line 236  Line 264 
264    
265      # Merge the DMetaData data frames      # Merge the DMetaData data frames
266      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
267      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      na.matrix <- matrix(NA,
268                            nrow = nrow(DMetaData(x)),
269                            ncol = length(labels),
270                            dimnames = list(row.names(DMetaData(x)), labels))
271      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
272      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
273      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      na.matrix <- matrix(NA,
274                            nrow = nrow(DMetaData(y)),
275                            ncol = length(labels),
276                            dimnames = list(row.names(DMetaData(y)), labels))
277      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
278      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
279    
# Line 247  Line 281 
281  }  }
282    
283  c.Corpus <-  c.Corpus <-
284  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
285  {  {
286      args <- list(...)      args <- list(...)
287        x <- args[[1L]]
288    
289      if (identical(length(args), 0L))      if(length(args) == 1L)
290          return(x)          return(x)
291    
292      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 260  Line 295 
295      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
296          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
297    
     l <- base::c(list(x), args)  
298      if (recursive)      if (recursive)
299          Reduce(c2, l)          Reduce(c2, args)
300      else {      else {
301          l <- do.call("c", lapply(l, unclass))          args <- do.call("c", lapply(args, unclass))
302          .VCorpus(l,          .VCorpus(args,
303                   cmeta = .MetaDataNode(),                   cmeta = .MetaDataNode(),
304                   dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))                   dmeta = data.frame(MetaID = rep(0, length(args)),
305                                        stringsAsFactors = FALSE))
306      }      }
307  }  }
308    
309  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
310    function(..., recursive = FALSE)
311    {
312      args <- list(...)      args <- list(...)
313        x <- args[[1L]]
314    
315      if (identical(length(args), 0L))      if(length(args) == 1L)
316          return(x)          return(x)
317    
318      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
319          stop("not all arguments are text documents")          stop("not all arguments are text documents")
320    
321      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      dmeta <- data.frame(MetaID = rep(0, length(args)),
322      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                          stringsAsFactors = FALSE)
323        .VCorpus(args, .MetaDataNode(), dmeta)
324  }  }
325    
326  print.Corpus <- function(x, ...) {  print.Corpus <-
327    function(x, ...)
328    {
329      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
330                           "A corpus with %d text document\n",                           "A corpus with %d text document\n",
331                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n"),
# Line 292  Line 333 
333      invisible(x)      invisible(x)
334  }  }
335    
336  summary.Corpus <- function(object, ...) {  summary.Corpus <-
337    function(object, ...)
338    {
339      print(object)      print(object)
340      if (length(DMetaData(object)) > 0) {      if (length(DMetaData(object)) > 0) {
341          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
# Line 306  Line 349 
349      }      }
350  }  }
351    
352  inspect <- function(x) UseMethod("inspect", x)  inspect <-
353  inspect.PCorpus <- function(x) {  function(x)
354        UseMethod("inspect", x)
355    inspect.PCorpus <-
356    function(x)
357    {
358      summary(x)      summary(x)
359      cat("\n")      cat("\n")
360      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
361      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
362  }  }
363  inspect.VCorpus <- function(x) {  inspect.VCorpus <-
364    function(x)
365    {
366      summary(x)      summary(x)
367      cat("\n")      cat("\n")
368      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
369  }  }
370    
371  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <-
372    function(X, FUN, ...)
373    {
374      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
375      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
376  }  }
377  lapply.VCorpus <- function(X, FUN, ...) {  lapply.VCorpus <-
378    function(X, FUN, ...)
379    {
380      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
381      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
382          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
383      base::lapply(X, FUN, ...)      base::lapply(X, FUN, ...)
384  }  }
385    
386  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
387    function(x, path = ".", filenames = NULL)
388    {
389      filenames <- file.path(path,      filenames <- file.path(path,
390                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
391                             else filenames)                             else filenames)

Legend:
Removed from v.1114  
changed lines
  Added in v.1242

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge