SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1312, Sat Mar 29 09:35:44 2014 UTC revision 1333, Fri Apr 18 10:38:46 2014 UTC
# Line 20  Line 20 
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
# Line 49  Line 46 
46                class = c("PCorpus", "Corpus"))                class = c("PCorpus", "Corpus"))
47  }  }
48    
49  VCorpus <- Corpus <-  VCorpus <-
50  function(x, readerControl = list(reader = x$defaultreader, language = "en"))  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
51  {  {
52      stopifnot(inherits(x, "Source"))      stopifnot(inherits(x, "Source"))
# Line 63  Line 60 
60          on.exit(readerControl$exit())          on.exit(readerControl$exit())
61    
62      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
63      tdl <- if (x$length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$length))  
     else  
         list()  
64    
65      if (x$vectorized)      if (x$vectorized)
66          tdl <- mapply(function(elem, id)          tdl <- mapply(function(elem, id)
# Line 102  Line 96 
96                class = c("VCorpus", "Corpus"))                class = c("VCorpus", "Corpus"))
97  }  }
98    
99  `[.PCorpus` <- `[.VCorpus` <-  `[.PCorpus` <-
100  function(x, i)  function(x, i)
101  {  {
102      if (!missing(i)) {      if (!missing(i)) {
# Line 112  Line 106 
106      x      x
107  }  }
108    
109    `[.VCorpus` <-
110    function(x, i)
111    {
112        if (!missing(i)) {
113            x$content <- x$content[i]
114            x$dmeta <- x$dmeta[i, , drop = FALSE]
115            if (!is.null(x$lazy))
116                x$lazy$index <- x$lazy$index[i]
117        }
118        x
119    }
120    
121  .map_name_index <-  .map_name_index <-
122  function(x, i)  function(x, i)
123  {  {
# Line 132  Line 138 
138  function(x, i)  function(x, i)
139  {  {
140      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
141      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
142      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
         .Call("copyCorpus", x, materialize(x, i))  
143      x$content[[i]]      x$content[[i]]
144  }  }
145    
# Line 150  Line 155 
155  function(x, i, value)  function(x, i, value)
156  {  {
157      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
158      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
159      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
160      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
161      x$content[[i]] <- value      x$content[[i]] <- value
162      x      x
163  }  }
164    
165  # Update NodeIDs of a CMetaData tree  outer_union <-
166  .update_id <-  function(x, y, ...)
 function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)  
 {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children)) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
         x  
     }  
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <-  
 function(x)  
167  {  {
168      indices.mapping <- NULL      if (nrow(x) > 0L)
169      for (m in levels(as.factor(CorpusDMeta(x)$MetaID))) {          x[, setdiff(names(y), names(x))] <- NA
170          indices <- (CorpusDMeta(x)$MetaID == m)      if (nrow(y) > 0L)
171          indices.mapping <- c(indices.mapping, list(m = indices))          y[, setdiff(names(x), names(y))] <- NA
172          names(indices.mapping)[length(indices.mapping)] <- m      res <- rbind(x, y)
173      }      if (ncol(res) == 0L)
174      indices.mapping          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
175  }      res
176    }
 #c2 <-  
 #function(x, y, ...)  
 #{  
 #    # Update the CMetaData tree  
 #    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
 #    update.struct <- .update_id(cmeta)  
 #  
 #    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
 #  
 #    # Find indices to be updated for the left tree  
 #    indices.mapping <- .find_indices(x)  
 #  
 #    # Update the CorpusDMeta data frames for the left tree  
 #    for (i in 1:ncol(update.struct$left.mapping)) {  
 #        map <- update.struct$left.mapping[,i]  
 #        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
 #    }  
 #  
 #    # Find indices to be updated for the right tree  
 #    indices.mapping <- .find_indices(y)  
 #  
 #    # Update the CorpusDMeta data frames for the right tree  
 #    for (i in 1:ncol(update.struct$right.mapping)) {  
 #        map <- update.struct$right.mapping[,i]  
 #        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
 #    }  
 #  
 #    # Merge the CorpusDMeta data frames  
 #    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
 #    na.matrix <- matrix(NA,  
 #                        nrow = nrow(DMetaData(x)),  
 #                        ncol = length(labels),  
 #                        dimnames = list(row.names(DMetaData(x)), labels))  
 #    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
 #    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
 #    na.matrix <- matrix(NA,  
 #                        nrow = nrow(DMetaData(y)),  
 #                        ncol = length(labels),  
 #                        dimnames = list(row.names(DMetaData(y)), labels))  
 #    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
 #    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
 #  
 #    new  
 #}  
177    
178  c.Corpus <-  c.VCorpus <-
179  function(..., recursive = FALSE)  function(..., recursive = FALSE)
180  {  {
181      args <- list(...)      args <- list(...)
# Line 257  Line 187 
187      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
188          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
189    
190      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
191          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
192                         lapply(args, function(a) meta(a, type = "corpus"))),
193      if (recursive)                                      class = "CorpusMeta"),
194          Reduce(c2, args)                     dmeta = Reduce(outer_union, lapply(args, meta))),
195      else {                class = c("VCorpus", "Corpus"))
         args <- do.call("c", lapply(args, content))  
         .VCorpus(args,  
                  CorpusMeta(),  
                  data.frame(MetaID = rep(0, length(args)),  
                             stringsAsFactors = FALSE))  
     }  
196  }  }
197    
198  c.TextDocument <-  as.list.PCorpus <- as.list.VCorpus <-
199  function(..., recursive = FALSE)  function(x, ...)
200  {      content(x)
     args <- list(...)  
     x <- args[[1L]]  
   
     if(length(args) == 1L)  
         return(x)  
   
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
   
     .VCorpus(args,  
              CorpusMeta(),  
              data.frame(MetaID = rep(0, length(args)),  
                         stringsAsFactors = FALSE))  
 }  
201    
202  content.Corpus <-  content.VCorpus <-
203  function(x)  function(x)
204    {
205        if (!is.null(x$lazy))
206            .Call(copyCorpus, x, materialize(x))
207      x$content      x$content
208    }
209    
210  `content<-.Corpus` <-  content.PCorpus <-
211  function(x, value)  function(x)
212  {  {
213      x$content <- value      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
214      x      filehash::dbMultiFetch(db, unlist(x$content))
215  }  }
216    
217  length.Corpus <-  length.PCorpus <- length.VCorpus <-
218  function(x)  function(x)
219      length(content(x))      length(x$content)
220    
221  print.Corpus <-  print.PCorpus <- print.VCorpus <-
222  function(x, ...)  function(x, ...)
223  {  {
224      cat(sprintf(ngettext(length(x),      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
225                           "A corpus with %d text document\n\n",                         class(x)[1],
226                           "A corpus with %d text documents\n\n"),                         length(x),
227                  length(x)))                         length(meta(x, type = "corpus")),
228                           ncol(meta(x, type = "indexed"))))
     meta <- meta(x, type = "corpus")  
     dmeta <- meta(x, type = "indexed")  
   
     cat("Metadata:\n")  
     cat(sprintf("  Tag-value pairs. Tags: %s\n",  
                 paste(names(meta), collapse = " ")))  
     cat("  Data frame. Variables:", colnames(dmeta), "\n")  
   
229      invisible(x)      invisible(x)
230  }  }
231    
232  inspect <-  inspect <-
233  function(x)  function(x)
234      UseMethod("inspect", x)      UseMethod("inspect", x)
235  inspect.PCorpus <-  inspect.PCorpus <- inspect.VCorpus <-
 function(x)  
 {  
     print(x)  
     cat("\n")  
     db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])  
     show(filehash::dbMultiFetch(db, unlist(content(x))))  
     invisible(x)  
 }  
 inspect.VCorpus <-  
236  function(x)  function(x)
237  {  {
238      print(x)      print(x)
# Line 344  Line 241 
241      invisible(x)      invisible(x)
242  }  }
243    
 # TODO: lapply() is not generic but as.list() is  
 #  
 #lapply.PCorpus <-  
 #function(X, FUN, ...)  
 #{  
 #    db <- filehash::dbInit(X$dbcontrol[["dbName"]], X$dbcontrol[["dbType"]])  
 #    lapply(filehash::dbMultiFetch(db, unlist(content(X))), FUN, ...)  
 #}  
 #lapply.VCorpus <-  
 #function(X, FUN, ...)  
 #{  
 #    lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  
 #    if (!is.null(lazyTmMap))  
 #        .Call("copyCorpus", X, materialize(X))  
 #    lapply(content(X), FUN, ...)  
 #}  
   
244  writeCorpus <-  writeCorpus <-
245  function(x, path = ".", filenames = NULL)  function(x, path = ".", filenames = NULL)
246  {  {

Legend:
Removed from v.1312  
changed lines
  Added in v.1333

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge