SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1313, Sun Mar 30 09:28:00 2014 UTC revision 1333, Fri Apr 18 10:38:46 2014 UTC
# Line 20  Line 20 
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
# Line 49  Line 46 
46                class = c("PCorpus", "Corpus"))                class = c("PCorpus", "Corpus"))
47  }  }
48    
49  VCorpus <- Corpus <-  VCorpus <-
50  function(x, readerControl = list(reader = x$defaultreader, language = "en"))  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
51  {  {
52      stopifnot(inherits(x, "Source"))      stopifnot(inherits(x, "Source"))
# Line 63  Line 60 
60          on.exit(readerControl$exit())          on.exit(readerControl$exit())
61    
62      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
63      tdl <- if (x$length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$length))  
     else  
         list()  
64    
65      if (x$vectorized)      if (x$vectorized)
66          tdl <- mapply(function(elem, id)          tdl <- mapply(function(elem, id)
# Line 102  Line 96 
96                class = c("VCorpus", "Corpus"))                class = c("VCorpus", "Corpus"))
97  }  }
98    
99  `[.PCorpus` <- `[.VCorpus` <-  `[.PCorpus` <-
100    function(x, i)
101    {
102        if (!missing(i)) {
103            x$content <- x$content[i]
104            x$dmeta <- x$dmeta[i, , drop = FALSE]
105        }
106        x
107    }
108    
109    `[.VCorpus` <-
110  function(x, i)  function(x, i)
111  {  {
112      if (!missing(i)) {      if (!missing(i)) {
113          x$content <- x$content[i]          x$content <- x$content[i]
114          x$dmeta <- x$dmeta[i, , drop = FALSE]          x$dmeta <- x$dmeta[i, , drop = FALSE]
115            if (!is.null(x$lazy))
116                x$lazy$index <- x$lazy$index[i]
117      }      }
118      x      x
119  }  }
# Line 132  Line 138 
138  function(x, i)  function(x, i)
139  {  {
140      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
141      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
142      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
         .Call("copyCorpus", x, materialize(x, i))  
143      x$content[[i]]      x$content[[i]]
144  }  }
145    
# Line 150  Line 155 
155  function(x, i, value)  function(x, i, value)
156  {  {
157      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
158      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
159      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
160      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
161      x$content[[i]] <- value      x$content[[i]] <- value
162      x      x
163  }  }
164    
165  # Update NodeIDs of a CMetaData tree  outer_union <-
166  .update_id <-  function(x, y, ...)
 function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)  
 {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children)) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
         x  
     }  
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <-  
 function(x)  
167  {  {
168      indices.mapping <- NULL      if (nrow(x) > 0L)
169      for (m in levels(as.factor(CorpusDMeta(x)$MetaID))) {          x[, setdiff(names(y), names(x))] <- NA
170          indices <- (CorpusDMeta(x)$MetaID == m)      if (nrow(y) > 0L)
171          indices.mapping <- c(indices.mapping, list(m = indices))          y[, setdiff(names(x), names(y))] <- NA
172          names(indices.mapping)[length(indices.mapping)] <- m      res <- rbind(x, y)
173      }      if (ncol(res) == 0L)
174      indices.mapping          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
175  }      res
176    }
 #c2 <-  
 #function(x, y, ...)  
 #{  
 #    # Update the CMetaData tree  
 #    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
 #    update.struct <- .update_id(cmeta)  
 #  
 #    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
 #  
 #    # Find indices to be updated for the left tree  
 #    indices.mapping <- .find_indices(x)  
 #  
 #    # Update the CorpusDMeta data frames for the left tree  
 #    for (i in 1:ncol(update.struct$left.mapping)) {  
 #        map <- update.struct$left.mapping[,i]  
 #        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
 #    }  
 #  
 #    # Find indices to be updated for the right tree  
 #    indices.mapping <- .find_indices(y)  
 #  
 #    # Update the CorpusDMeta data frames for the right tree  
 #    for (i in 1:ncol(update.struct$right.mapping)) {  
 #        map <- update.struct$right.mapping[,i]  
 #        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
 #    }  
 #  
 #    # Merge the CorpusDMeta data frames  
 #    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
 #    na.matrix <- matrix(NA,  
 #                        nrow = nrow(DMetaData(x)),  
 #                        ncol = length(labels),  
 #                        dimnames = list(row.names(DMetaData(x)), labels))  
 #    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
 #    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
 #    na.matrix <- matrix(NA,  
 #                        nrow = nrow(DMetaData(y)),  
 #                        ncol = length(labels),  
 #                        dimnames = list(row.names(DMetaData(y)), labels))  
 #    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
 #    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
 #  
 #    new  
 #}  
177    
178  c.VCorpus <-  c.VCorpus <-
179  function(..., recursive = FALSE)  function(..., recursive = FALSE)
# Line 257  Line 187 
187      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
188          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
189    
190      if (recursive)      structure(list(content = do.call("c", lapply(args, content)),
191          Reduce(c2, args)                     meta = structure(do.call("c",
192      else {                       lapply(args, function(a) meta(a, type = "corpus"))),
193          args <- do.call("c", lapply(args, content))                                      class = "CorpusMeta"),
194          structure(list(content = args,                     dmeta = Reduce(outer_union, lapply(args, meta))),
                        meta = CorpusMeta(),  
                        dmeta = data.frame(row.names = seq_along(args))),  
                   class = c("VCorpus", "Corpus"))  
     }  
 }  
   
 c.TextDocument <-  
 function(..., recursive = FALSE)  
 {  
     args <- list(...)  
     x <- args[[1L]]  
   
     if (length(args) == 1L)  
         return(x)  
   
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
   
     structure(list(content = args,  
                    meta = CorpusMeta(),  
                    dmeta = data.frame(row.names = seq_along(args))),  
195                class = c("VCorpus", "Corpus"))                class = c("VCorpus", "Corpus"))
196  }  }
197    
# Line 293  Line 202 
202  content.VCorpus <-  content.VCorpus <-
203  function(x)  function(x)
204  {  {
205      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
206      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x))
         .Call("copyCorpus", x, materialize(x))  
207      x$content      x$content
208  }  }
209    
# Line 313  Line 221 
221  print.PCorpus <- print.VCorpus <-  print.PCorpus <- print.VCorpus <-
222  function(x, ...)  function(x, ...)
223  {  {
224      cat(sprintf(ngettext(length(x),      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
225                           "A corpus with %d text document\n\n",                         class(x)[1],
226                           "A corpus with %d text documents\n\n"),                         length(x),
227                  length(x)))                         length(meta(x, type = "corpus")),
228                           ncol(meta(x, type = "indexed"))))
     meta <- meta(x, type = "corpus")  
     dmeta <- meta(x, type = "indexed")  
   
     cat("Metadata:\n")  
     cat(sprintf("  Tag-value pairs. Tags: %s\n",  
                 paste(names(meta), collapse = " ")))  
     cat("  Data frame. Variables:", colnames(dmeta), "\n")  
   
229      invisible(x)      invisible(x)
230  }  }
231    

Legend:
Removed from v.1313  
changed lines
  Added in v.1333

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge