SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1306, Tue Mar 25 08:37:05 2014 UTC revision 1307, Tue Mar 25 12:15:51 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <-  .PCorpus <-
4  function(x, cmeta, dmeta, dbcontrol)  function(x, meta, dmeta, dbcontrol)
5  {      structure(list(content = as.list(x), meta = meta, dmeta = dmeta,
6      attr(x, "CMetaData") <- cmeta                     dbcontrol = dbcontrol),
7      attr(x, "DMetaData") <- dmeta                class = c("PCorpus", "Corpus"))
     attr(x, "DBControl") <- dbcontrol  
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
   
 DBControl <-  
 function(x)  
     attr(x, "DBControl")  
8    
9  PCorpus <-  PCorpus <-
10  function(x,  function(x,
# Line 57  Line 49 
49          names(tdl) <- x$names          names(tdl) <- x$names
50    
51      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
52      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "CorpusDMeta", df)
53      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))      dmeta.df <- data.frame(key = "CorpusDMeta", subset = I(list(NA)))
54    
55      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, CorpusMeta(), dmeta.df, dbControl)
56  }  }
57    
58  .VCorpus <-  .VCorpus <-
59  function(x, cmeta, dmeta)  function(x, meta, dmeta)
60  {      structure(list(content = as.list(x), meta = meta, dmeta = dmeta),
61      attr(x, "CMetaData") <- cmeta                class = c("VCorpus", "Corpus"))
     attr(x, "DMetaData") <- dmeta  
     class(x) <- c("VCorpus", "Corpus", "list")  
     x  
 }  
62    
63  VCorpus <-  VCorpus <-
64  Corpus <-  Corpus <-
# Line 93  Line 81 
81          list()          list()
82    
83      if (x$vectorized)      if (x$vectorized)
84          tdl <- mapply(function(elem, id) readerControl$reader(elem, readerControl$language, id),          tdl <- mapply(function(elem, id)
85                              readerControl$reader(elem, readerControl$language, id),
86                        pGetElem(x),                        pGetElem(x),
87                        id = if (is.null(x$names) || is.na(x$names)) as.character(seq_len(x$length)) else x$names,                        id = if (is.null(x$names) || is.na(x$names))
88                              as.character(seq_len(x$length))
89                          else x$names,
90                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
91      else {      else {
92          counter <- 1          counter <- 1
# Line 117  Line 108 
108      if (!is.null(x$names) && !is.na(x$names))      if (!is.null(x$names) && !is.na(x$names))
109          names(tdl) <- x$names          names(tdl) <- x$names
110      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
111      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, CorpusMeta(), df)
112  }  }
113    
114  `[.PCorpus` <-  `[.PCorpus` <-
115  function(x, i)  function(x, i)
116  {  {
117      if (missing(i)) return(x)      if (!missing(i)) {
118      index <- attr(x, "DMetaData")[[1 , "subset"]]          x$content <- x$content[i]
119      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i          index <- x$dmeta[[1 , "subset"]]
120      dmeta <- attr(x, "DMetaData")          x$dmeta[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
121      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      }
122        x
123  }  }
124    
125  `[.VCorpus` <-  `[.VCorpus` <-
126  function(x, i)  function(x, i)
127  {  {
128      if (missing(i)) return(x)      if (!missing(i)) {
129      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])          x$content <- x$content[i]
130            x$dmeta <- x$dmeta[i, , drop = FALSE]
131        }
132        x
133  }  }
134    
135  `[<-.PCorpus` <-  `[<-.PCorpus` <-
136  function(x, i, value)  function(x, i, value)
137  {  {
138      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
139      counter <- 1      counter <- 1
140      for (id in unclass(x)[i]) {      for (id in x$content[i]) {
141          if (identical(length(value), 1L)) db[[id]] <- value          db[[id]] <- if (identical(length(value), 1L))
142          else db[[id]] <- value[[counter]]              value
143            else
144                value[[counter]]
145          counter <- counter + 1          counter <- counter + 1
146      }      }
147      x      x
# Line 153  Line 150 
150  .map_name_index <-  .map_name_index <-
151  function(x, i)  function(x, i)
152  {  {
153      if (is.character(i)) {      if (is.character(i))
154          if (is.null(names(x)))          match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
             match(i, meta(x, "id", type = "local"))  
155          else          else
             match(i, names(x))  
     }  
156      i      i
157  }  }
158    
# Line 166  Line 160 
160  function(x, i)  function(x, i)
161  {  {
162      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
163      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
164      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
165  }  }
166  `[[.VCorpus` <-  `[[.VCorpus` <-
167  function(x, i)  function(x, i)
# Line 176  Line 170 
170      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
171      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
172          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
173      NextMethod("[[")      x$content[[i]]
174  }  }
175    
176  `[[<-.PCorpus` <-  `[[<-.PCorpus` <-
177  function(x, i, value)  function(x, i, value)
178  {  {
179      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
180      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
181      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
182      x      x
183  }  }
184  `[[<-.VCorpus` <-  `[[<-.VCorpus` <-
# Line 198  Line 191 
191          lazyTmMap$index[i] <- FALSE          lazyTmMap$index[i] <- FALSE
192          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
193      }      }
194      # Set the value      x$content[[i]] <- value
195      cl <- class(x)      x
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
196  }  }
197    
198  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
# Line 237  Line 227 
227  function(x)  function(x)
228  {  {
229      indices.mapping <- NULL      indices.mapping <- NULL
230      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(CorpusDMeta(x)$MetaID))) {
231          indices <- (DMetaData(x)$MetaID == m)          indices <- (CorpusDMeta(x)$MetaID == m)
232          indices.mapping <- c(indices.mapping, list(m = indices))          indices.mapping <- c(indices.mapping, list(m = indices))
233          names(indices.mapping)[length(indices.mapping)] <- m          names(indices.mapping)[length(indices.mapping)] <- m
234      }      }
235      indices.mapping      indices.mapping
236  }  }
237    
238  c2 <-  #c2 <-
239  function(x, y, ...)  #function(x, y, ...)
240  {  #{
241      # Update the CMetaData tree  #    # Update the CMetaData tree
242      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  #    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
243      update.struct <- .update_id(cmeta)  #    update.struct <- .update_id(cmeta)
244    #
245      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  #    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
246    #
247      # Find indices to be updated for the left tree  #    # Find indices to be updated for the left tree
248      indices.mapping <- .find_indices(x)  #    indices.mapping <- .find_indices(x)
249    #
250      # Update the DMetaData data frames for the left tree  #    # Update the CorpusDMeta data frames for the left tree
251      for (i in 1:ncol(update.struct$left.mapping)) {  #    for (i in 1:ncol(update.struct$left.mapping)) {
252          map <- update.struct$left.mapping[,i]  #        map <- update.struct$left.mapping[,i]
253          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
254      }  #    }
255    #
256      # Find indices to be updated for the right tree  #    # Find indices to be updated for the right tree
257      indices.mapping <- .find_indices(y)  #    indices.mapping <- .find_indices(y)
258    #
259      # Update the DMetaData data frames for the right tree  #    # Update the CorpusDMeta data frames for the right tree
260      for (i in 1:ncol(update.struct$right.mapping)) {  #    for (i in 1:ncol(update.struct$right.mapping)) {
261          map <- update.struct$right.mapping[,i]  #        map <- update.struct$right.mapping[,i]
262          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
263      }  #    }
264    #
265      # Merge the DMetaData data frames  #    # Merge the CorpusDMeta data frames
266      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  #    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
267      na.matrix <- matrix(NA,  #    na.matrix <- matrix(NA,
268                          nrow = nrow(DMetaData(x)),  #                        nrow = nrow(DMetaData(x)),
269                          ncol = length(labels),  #                        ncol = length(labels),
270                          dimnames = list(row.names(DMetaData(x)), labels))  #                        dimnames = list(row.names(DMetaData(x)), labels))
271      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  #    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
272      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  #    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
273      na.matrix <- matrix(NA,  #    na.matrix <- matrix(NA,
274                          nrow = nrow(DMetaData(y)),  #                        nrow = nrow(DMetaData(y)),
275                          ncol = length(labels),  #                        ncol = length(labels),
276                          dimnames = list(row.names(DMetaData(y)), labels))  #                        dimnames = list(row.names(DMetaData(y)), labels))
277      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  #    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
278      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  #    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
279    #
280      new  #    new
281  }  #}
282    
283  c.Corpus <-  c.Corpus <-
284  function(..., recursive = FALSE)  function(..., recursive = FALSE)
# Line 310  Line 300 
300      else {      else {
301          args <- do.call("c", lapply(args, unclass))          args <- do.call("c", lapply(args, unclass))
302          .VCorpus(args,          .VCorpus(args,
303                   cmeta = .MetaDataNode(),                   CorpusMeta(),
304                   dmeta = data.frame(MetaID = rep(0, length(args)),                   data.frame(MetaID = rep(0, length(args)),
305                                      stringsAsFactors = FALSE))                                      stringsAsFactors = FALSE))
306      }      }
307  }  }
# Line 330  Line 320 
320    
321      dmeta <- data.frame(MetaID = rep(0, length(args)),      dmeta <- data.frame(MetaID = rep(0, length(args)),
322                          stringsAsFactors = FALSE)                          stringsAsFactors = FALSE)
323      .VCorpus(args, .MetaDataNode(), dmeta)      .VCorpus(args, CorpusMeta(), dmeta)
324  }  }
325    
326    content.Corpus <-
327    function(x)
328        x$content
329    
330    `content<-.Corpus` <-
331    function(x, value)
332    {
333        x$content <- value
334        x
335    }
336    
337    length.Corpus <-
338    function(x)
339        length(content(x))
340    
341  print.Corpus <-  print.Corpus <-
342  function(x, ...)  function(x, ...)
343  {  {
344      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
345                           "A corpus with %d text document\n",                           "A corpus with %d text document\n\n",
346                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n\n"),
347                  length(x)))                  length(x)))
     invisible(x)  
 }  
348    
349  summary.Corpus <-      meta <- meta(x, type = "corpus")$value
350  function(object, ...)      dmeta <- meta(x, type = "indexed")
351  {  
352      print(object)      cat("Metadata:\n")
353      if (length(DMetaData(object))) {      cat(sprintf("  Tag-value pairs. Tags: %s\n",
354          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),                  paste(names(meta), collapse = " ")))
355                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      cat("  Data frame. Variables:", colnames(dmeta), "\n")
356                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
357                      length(CMetaData(object)$MetaData)))      invisible(x)
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
358  }  }
359    
360  inspect <-  inspect <-
# Line 365  Line 363 
363  inspect.PCorpus <-  inspect.PCorpus <-
364  function(x)  function(x)
365  {  {
366      summary(x)      print(x)
367      cat("\n")      cat("\n")
368      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
369      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
370        invisible(x)
371  }  }
372  inspect.VCorpus <-  inspect.VCorpus <-
373  function(x)  function(x)
374  {  {
375      summary(x)      print(x)
376      cat("\n")      cat("\n")
377      print(noquote(lapply(x, identity)))      print(noquote(content(x)))
378        invisible(x)
379  }  }
380    
381  lapply.PCorpus <-  lapply.PCorpus <-
382  function(X, FUN, ...)  function(X, FUN, ...)
383  {  {
384      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(X$dbcontrol[["dbName"]], X$dbcontrol[["dbType"]])
385      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(content(X))), FUN, ...)
386  }  }
387  lapply.VCorpus <-  lapply.VCorpus <-
388  function(X, FUN, ...)  function(X, FUN, ...)
# Line 390  Line 390 
390      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
391      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
392          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
393      base::lapply(X, FUN, ...)      lapply(content(X), FUN, ...)
394  }  }
395    
396  writeCorpus <-  writeCorpus <-

Legend:
Removed from v.1306  
changed lines
  Added in v.1307

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge