SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 988, Fri Sep 4 12:27:12 2009 UTC revision 1108, Fri Oct 22 18:32:47 2010 UTC
# Line 10  Line 10 
10  DBControl <- function(x) attr(x, "DBControl")  DBControl <- function(x) attr(x, "DBControl")
11    
12  PCorpus <- function(x,  PCorpus <- function(x,
13                      readerControl = list(reader = x$DefaultReader, language = "eng"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
15                      ...) {                      ...) {
16      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
# Line 29  Line 29 
29      while (!eoi(x)) {      while (!eoi(x)) {
30          x <- stepNext(x)          x <- stepNext(x)
31          elem <- getElem(x)          elem <- getElem(x)
32          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33          filehash::dbInsert(db, ID(doc), doc)          filehash::dbInsert(db, ID(doc), doc)
34          if (x$Length > 0) tdl[[counter]] <- ID(doc)          if (x$Length > 0) tdl[[counter]] <- ID(doc)
35          else tdl <- c(tdl, ID(doc))          else tdl <- c(tdl, ID(doc))
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38        names(tdl) <- x$Names
39    
40      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 50  Line 51 
51      x      x
52  }  }
53    
54    # Register S3 corpus classes to be recognized by S4 methods. This is
55    # mainly a fix to be compatible with packages which were originally
56    # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
61  VCorpus <- Corpus <- function(x,  VCorpus <- Corpus <- function(x,
62                      readerControl = list(reader = x$DefaultReader, language = "eng"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
63                      ...) {                      ...) {
64      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
# Line 65  Line 72 
72      if (x$Vectorized)      if (x$Vectorized)
73          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74                        pGetElem(x),                        pGetElem(x),
75                        id = as.character(seq_len(x$Length)),                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
77      else {      else {
78          counter <- 1          counter <- 1
79          while (!eoi(x)) {          while (!eoi(x)) {
80              x <- stepNext(x)              x <- stepNext(x)
81              elem <- getElem(x)              elem <- getElem(x)
82              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83              if (x$Length > 0)              if (x$Length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
# Line 80  Line 87 
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90        names(tdl) <- x$Names
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
93  }  }
94    
95  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <- function(x, i) {
96      if (missing(i)) return(x)      if (missing(i)) return(x)
     cmeta <- CMetaData(x)  
97      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
98      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99      dmeta <- attr(x, "DMetaData")      dmeta <- attr(x, "DMetaData")
100      dbcontrol <- DBControl(x)      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
     class(x) <- "list"  
     .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)  
101  }  }
102    
103  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <- function(x, i) {
104      if (missing(i)) return(x)      if (missing(i)) return(x)
105      cmeta <- CMetaData(x)      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
     dmeta <- DMetaData(x)[i, , drop = FALSE]  
     class(x) <- "list"  
     .VCorpus(x[i, drop = FALSE], cmeta, dmeta)  
106  }  }
107    
108  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <- function(x, i, value) {
109      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110      counter <- 1      counter <- 1
111      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
112          if (identical(length(value), 1)) db[[id]] <- value          if (identical(length(value), 1L)) db[[id]] <- value
113          else db[[id]] <- value[[counter]]          else db[[id]] <- value[[counter]]
114          counter <- counter + 1          counter <- counter + 1
115      }      }
116      x      x
117  }  }
118    
119    .map_name_index <- function(x, i) {
120        if (is.character(i)) {
121            if (is.null(names(x)))
122                match(i, meta(x, "ID", type = "local"))
123            else
124                match(i, names(x))
125        }
126        i
127    }
128    
129  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-  function(x, i) {
130        i <- .map_name_index(x, i)
131      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132      class(x) <- "list"      filehash::dbFetch(db, NextMethod("[["))
     filehash::dbFetch(db, x[[i]])  
133  }  }
134  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-  function(x, i) {
135        i <- .map_name_index(x, i)
136      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
137      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
138          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
139      class(x) <- "list"      NextMethod("[[")
     x[[i]]  
140  }  }
141    
142  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-  function(x, i, value) {
143        i <- .map_name_index(x, i)
144      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
145      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
146      db[[index]] <- value      db[[index]] <- value
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-  function(x, i, value) {
150        i <- .map_name_index(x, i)
151      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
152      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
153      if (!is.null(lazyTmMap)) {      if (!is.null(lazyTmMap)) {
# Line 143  Line 156 
156      }      }
157      # Set the value      # Set the value
158      cl <- class(x)      cl <- class(x)
159      class(x) <- "list"      y <- NextMethod("[[<-")
160      x[[i]] <- value      class(y) <- cl
161      class(x) <- cl      y
     x  
162  }  }
163    
164  # Update \code{NodeID}s of a CMetaData tree  # Update NodeIDs of a CMetaData tree
165  update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
166      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s      # Traversal of (binary) CMetaData tree with setup of NodeIDs
167      set_id <- function(x) {      set_id <- function(x) {
168          x$NodeID <- id          x$NodeID <- id
169          id <<- id + 1          id <<- id + 1
# Line 174  Line 186 
186      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
187  }  }
188    
189    # Find indices to be updated for a CMetaData tree
190    .find_indices <- function(x) {
191        indices.mapping <- NULL
192        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
193            indices <- (DMetaData(x)$MetaID == m)
194            indices.mapping <- c(indices.mapping, list(m = indices))
195            names(indices.mapping)[length(indices.mapping)] <- m
196        }
197        indices.mapping
198    }
199    
200  c2 <- function(x, y, ...) {  c2 <- function(x, y, ...) {
201      # Update the CMetaData tree      # Update the CMetaData tree
202      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
203      update.struct <- update_id(cmeta)      update.struct <- .update_id(cmeta)
204    
205      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
206    
207      # Find indices to be updated for the left tree      # Find indices to be updated for the left tree
208      indices.mapping <- NULL      indices.mapping <- .find_indices(x)
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
209    
210      # Update the DMetaData data frames for the left tree      # Update the DMetaData data frames for the left tree
211      for (i in 1:ncol(update.struct$left.mapping)) {      for (i in 1:ncol(update.struct$left.mapping)) {
# Line 196  Line 214 
214      }      }
215    
216      # Find indices to be updated for the right tree      # Find indices to be updated for the right tree
217      indices.mapping <- NULL      indices.mapping <- .find_indices(y)
     for (m in levels(as.factor(DMetaData(y)$MetaID))) {  
         indices <- (DMetaData(y)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
218    
219      # Update the DMetaData data frames for the right tree      # Update the DMetaData data frames for the right tree
220      for (i in 1:ncol(update.struct$right.mapping)) {      for (i in 1:ncol(update.struct$right.mapping)) {
# Line 226  Line 239 
239  {  {
240      args <- list(...)      args <- list(...)
241    
242      if (identical(length(args), 0))      if (identical(length(args), 0L))
243          return(x)          return(x)
244    
245      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 235  Line 248 
248      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
249          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
250    
251      Reduce(c2, base::c(list(x), args))      l <- base::c(list(x), args)
252        if (recursive)
253            Reduce(c2, l)
254        else {
255            l <- do.call("c", lapply(l, unclass))
256            .VCorpus(l,
257                     cmeta = .MetaDataNode(),
258                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
259        }
260  }  }
261    
262  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <- function(x, ..., recursive = FALSE) {
263      args <- list(...)      args <- list(...)
264    
265      if (identical(length(args), 0))      if (identical(length(args), 0L))
266          return(x)          return(x)
267    
268      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))

Legend:
Removed from v.988  
changed lines
  Added in v.1108

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge