SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 995, Mon Sep 7 07:54:08 2009 UTC revision 1108, Fri Oct 22 18:32:47 2010 UTC
# Line 10  Line 10 
10  DBControl <- function(x) attr(x, "DBControl")  DBControl <- function(x) attr(x, "DBControl")
11    
12  PCorpus <- function(x,  PCorpus <- function(x,
13                      readerControl = list(reader = x$DefaultReader, language = "eng"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
15                      ...) {                      ...) {
16      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
# Line 29  Line 29 
29      while (!eoi(x)) {      while (!eoi(x)) {
30          x <- stepNext(x)          x <- stepNext(x)
31          elem <- getElem(x)          elem <- getElem(x)
32          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33          filehash::dbInsert(db, ID(doc), doc)          filehash::dbInsert(db, ID(doc), doc)
34          if (x$Length > 0) tdl[[counter]] <- ID(doc)          if (x$Length > 0) tdl[[counter]] <- ID(doc)
35          else tdl <- c(tdl, ID(doc))          else tdl <- c(tdl, ID(doc))
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38        names(tdl) <- x$Names
39    
40      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 50  Line 51 
51      x      x
52  }  }
53    
54    # Register S3 corpus classes to be recognized by S4 methods. This is
55    # mainly a fix to be compatible with packages which were originally
56    # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
61  VCorpus <- Corpus <- function(x,  VCorpus <- Corpus <- function(x,
62                      readerControl = list(reader = x$DefaultReader, language = "eng"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
63                      ...) {                      ...) {
64      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
# Line 65  Line 72 
72      if (x$Vectorized)      if (x$Vectorized)
73          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74                        pGetElem(x),                        pGetElem(x),
75                        id = as.character(seq_len(x$Length)),                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
77      else {      else {
78          counter <- 1          counter <- 1
79          while (!eoi(x)) {          while (!eoi(x)) {
80              x <- stepNext(x)              x <- stepNext(x)
81              elem <- getElem(x)              elem <- getElem(x)
82              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83              if (x$Length > 0)              if (x$Length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
# Line 80  Line 87 
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90        names(tdl) <- x$Names
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
93  }  }
# Line 102  Line 109 
109      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110      counter <- 1      counter <- 1
111      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
112          if (identical(length(value), 1)) db[[id]] <- value          if (identical(length(value), 1L)) db[[id]] <- value
113          else db[[id]] <- value[[counter]]          else db[[id]] <- value[[counter]]
114          counter <- counter + 1          counter <- counter + 1
115      }      }
116      x      x
117  }  }
118    
119    .map_name_index <- function(x, i) {
120        if (is.character(i)) {
121            if (is.null(names(x)))
122                match(i, meta(x, "ID", type = "local"))
123            else
124                match(i, names(x))
125        }
126        i
127    }
128    
129  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-  function(x, i) {
130        i <- .map_name_index(x, i)
131      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
133  }  }
134  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-  function(x, i) {
135        i <- .map_name_index(x, i)
136      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
137      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
138          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
# Line 121  Line 140 
140  }  }
141    
142  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-  function(x, i, value) {
143        i <- .map_name_index(x, i)
144      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
145      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
146      db[[index]] <- value      db[[index]] <- value
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-  function(x, i, value) {
150        i <- .map_name_index(x, i)
151      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
152      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
153      if (!is.null(lazyTmMap)) {      if (!is.null(lazyTmMap)) {
# Line 140  Line 161 
161      y      y
162  }  }
163    
164  # Update \code{NodeID}s of a CMetaData tree  # Update NodeIDs of a CMetaData tree
165  update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
166      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s      # Traversal of (binary) CMetaData tree with setup of NodeIDs
167      set_id <- function(x) {      set_id <- function(x) {
168          x$NodeID <- id          x$NodeID <- id
169          id <<- id + 1          id <<- id + 1
# Line 165  Line 186 
186      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
187  }  }
188    
189    # Find indices to be updated for a CMetaData tree
190    .find_indices <- function(x) {
191        indices.mapping <- NULL
192        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
193            indices <- (DMetaData(x)$MetaID == m)
194            indices.mapping <- c(indices.mapping, list(m = indices))
195            names(indices.mapping)[length(indices.mapping)] <- m
196        }
197        indices.mapping
198    }
199    
200  c2 <- function(x, y, ...) {  c2 <- function(x, y, ...) {
201      # Update the CMetaData tree      # Update the CMetaData tree
202      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
203      update.struct <- update_id(cmeta)      update.struct <- .update_id(cmeta)
204    
205      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
206    
207      # Find indices to be updated for the left tree      # Find indices to be updated for the left tree
208      indices.mapping <- NULL      indices.mapping <- .find_indices(x)
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
209    
210      # Update the DMetaData data frames for the left tree      # Update the DMetaData data frames for the left tree
211      for (i in 1:ncol(update.struct$left.mapping)) {      for (i in 1:ncol(update.struct$left.mapping)) {
# Line 187  Line 214 
214      }      }
215    
216      # Find indices to be updated for the right tree      # Find indices to be updated for the right tree
217      indices.mapping <- NULL      indices.mapping <- .find_indices(y)
     for (m in levels(as.factor(DMetaData(y)$MetaID))) {  
         indices <- (DMetaData(y)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
218    
219      # Update the DMetaData data frames for the right tree      # Update the DMetaData data frames for the right tree
220      for (i in 1:ncol(update.struct$right.mapping)) {      for (i in 1:ncol(update.struct$right.mapping)) {
# Line 217  Line 239 
239  {  {
240      args <- list(...)      args <- list(...)
241    
242      if (identical(length(args), 0))      if (identical(length(args), 0L))
243          return(x)          return(x)
244    
245      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 226  Line 248 
248      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
249          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
250    
251      Reduce(c2, base::c(list(x), args))      l <- base::c(list(x), args)
252        if (recursive)
253            Reduce(c2, l)
254        else {
255            l <- do.call("c", lapply(l, unclass))
256            .VCorpus(l,
257                     cmeta = .MetaDataNode(),
258                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
259        }
260  }  }
261    
262  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <- function(x, ..., recursive = FALSE) {
263      args <- list(...)      args <- list(...)
264    
265      if (identical(length(args), 0))      if (identical(length(args), 0L))
266          return(x)          return(x)
267    
268      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))

Legend:
Removed from v.995  
changed lines
  Added in v.1108

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge