SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 987, Wed Sep 2 17:54:45 2009 UTC revision 1108, Fri Oct 22 18:32:47 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
 prepareReader <- function(readerControl, defaultReader = NULL, ...) {  
     if (is.null(readerControl$reader))  
         readerControl$reader <- defaultReader  
     if (inherits(readerControl$reader, "FunctionGenerator"))  
         readerControl$reader <- readerControl$reader(...)  
     if (is.null(readerControl$language))  
         readerControl$language <- "eng"  
     readerControl  
 }  
   
 # Node ID, actual meta data, and possibly other nodes as children  
 .MetaDataNode <- function(nodeid = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {  
     structure(list(NodeID = nodeid, MetaData = meta, Children = children),  
               class = "MetaDataNode")  
 }  
   
 print.MetaDataNode <- function(x, ...)  
     print(x$MetaData)  
   
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
5      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
# Line 26  Line 7 
7      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
8      x      x
9  }  }
10    DBControl <- function(x) attr(x, "DBControl")
11    
12  PCorpus <- function(x,  PCorpus <- function(x,
13                      readerControl = list(reader = x$DefaultReader, language = "eng"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
15                      ...) {                      ...) {
16      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
# Line 47  Line 29 
29      while (!eoi(x)) {      while (!eoi(x)) {
30          x <- stepNext(x)          x <- stepNext(x)
31          elem <- getElem(x)          elem <- getElem(x)
32          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33          filehash::dbInsert(db, ID(doc), doc)          filehash::dbInsert(db, ID(doc), doc)
34          if (x$Length > 0) tdl[[counter]] <- ID(doc)          if (x$Length > 0) tdl[[counter]] <- ID(doc)
35          else tdl <- c(tdl, ID(doc))          else tdl <- c(tdl, ID(doc))
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38        names(tdl) <- x$Names
39    
40      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 68  Line 51 
51      x      x
52  }  }
53    
54    # Register S3 corpus classes to be recognized by S4 methods. This is
55    # mainly a fix to be compatible with packages which were originally
56    # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
61  VCorpus <- Corpus <- function(x,  VCorpus <- Corpus <- function(x,
62                      readerControl = list(reader = x$DefaultReader, language = "eng"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
63                      ...) {                      ...) {
64      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
# Line 83  Line 72 
72      if (x$Vectorized)      if (x$Vectorized)
73          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74                        pGetElem(x),                        pGetElem(x),
75                        id = as.character(seq_len(x$Length)),                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
77      else {      else {
78          counter <- 1          counter <- 1
79          while (!eoi(x)) {          while (!eoi(x)) {
80              x <- stepNext(x)              x <- stepNext(x)
81              elem <- getElem(x)              elem <- getElem(x)
82              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83              if (x$Length > 0)              if (x$Length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
# Line 98  Line 87 
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90        names(tdl) <- x$Names
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
93  }  }
94    
95  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <- function(x, i) {
96      if (missing(i)) return(x)      if (missing(i)) return(x)
     cmeta <- CMetaData(x)  
97      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
98      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99      dmeta <- attr(x, "DMetaData")      dmeta <- attr(x, "DMetaData")
100      dbcontrol <- DBControl(x)      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
     class(x) <- "list"  
     .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)  
101  }  }
102    
103  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <- function(x, i) {
104      if (missing(i)) return(x)      if (missing(i)) return(x)
105      cmeta <- CMetaData(x)      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
     dmeta <- DMetaData(x)[i, , drop = FALSE]  
     class(x) <- "list"  
     .VCorpus(x[i, drop = FALSE], cmeta, dmeta)  
106  }  }
107    
108  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <- function(x, i, value) {
109      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110      counter <- 1      counter <- 1
111      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
112          if (identical(length(value), 1)) db[[id]] <- value          if (identical(length(value), 1L)) db[[id]] <- value
113          else db[[id]] <- value[[counter]]          else db[[id]] <- value[[counter]]
114          counter <- counter + 1          counter <- counter + 1
115      }      }
116      x      x
117  }  }
118    
119    .map_name_index <- function(x, i) {
120        if (is.character(i)) {
121            if (is.null(names(x)))
122                match(i, meta(x, "ID", type = "local"))
123            else
124                match(i, names(x))
125        }
126        i
127    }
128    
129  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-  function(x, i) {
130        i <- .map_name_index(x, i)
131      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132      class(x) <- "list"      filehash::dbFetch(db, NextMethod("[["))
     filehash::dbFetch(db, x[[i]])  
133  }  }
134  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-  function(x, i) {
135        i <- .map_name_index(x, i)
136      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
137      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
138          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
139      class(x) <- "list"      NextMethod("[[")
     x[[i]]  
140  }  }
141    
142  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-  function(x, i, value) {
143        i <- .map_name_index(x, i)
144      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
145      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
146      db[[index]] <- value      db[[index]] <- value
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-  function(x, i, value) {
150        i <- .map_name_index(x, i)
151      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
152      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
153      if (!is.null(lazyTmMap)) {      if (!is.null(lazyTmMap)) {
# Line 161  Line 156 
156      }      }
157      # Set the value      # Set the value
158      cl <- class(x)      cl <- class(x)
159      class(x) <- "list"      y <- NextMethod("[[<-")
160      x[[i]] <- value      class(y) <- cl
161      class(x) <- cl      y
     x  
162  }  }
163    
164  # Update \code{NodeID}s of a CMetaData tree  # Update NodeIDs of a CMetaData tree
165  update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
166      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s      # Traversal of (binary) CMetaData tree with setup of NodeIDs
167      set_id <- function(x) {      set_id <- function(x) {
168          attrs <- attributes(x)          x$NodeID <- id
         x <- id  
         attributes(x) <- attrs  
169          id <<- id + 1          id <<- id + 1
170          level <<- level + 1          level <<- level + 1
171          if (length(attr(x, "Children")) > 0) {          if (length(x$Children) > 0) {
172              mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
173              left <- set_id(attr(x, "Children")[[1]])              left <- set_id(x$Children[[1]])
174              if (level == 1) {              if (level == 1) {
175                  left.mapping <<- mapping                  left.mapping <<- mapping
176                  mapping <<- NULL                  mapping <<- NULL
177              }              }
178              mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))              mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
179              right <- set_id(attr(x, "Children")[[2]])              right <- set_id(x$Children[[2]])
180    
181              attr(x, "Children") <- list(left, right)              x$Children <- list(left, right)
182          }          }
183          level <<- level - 1          level <<- level - 1
184          x          x
185      }      }
   
186      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
187  }  }
188    
189    # Find indices to be updated for a CMetaData tree
190    .find_indices <- function(x) {
191        indices.mapping <- NULL
192        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
193            indices <- (DMetaData(x)$MetaID == m)
194            indices.mapping <- c(indices.mapping, list(m = indices))
195            names(indices.mapping)[length(indices.mapping)] <- m
196        }
197        indices.mapping
198    }
199    
200  c2 <- function(x, y, ...) {  c2 <- function(x, y, ...) {
201      # Update the CMetaData tree      # Update the CMetaData tree
202      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
203      update.struct <- update_id(cmeta)      update.struct <- .update_id(cmeta)
204    
205      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
206    
207      # Find indices to be updated for the left tree      # Find indices to be updated for the left tree
208      indices.mapping <- NULL      indices.mapping <- .find_indices(x)
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
209    
210      # Update the DMetaData data frames for the left tree      # Update the DMetaData data frames for the left tree
211      for (i in 1:ncol(update.struct$left.mapping)) {      for (i in 1:ncol(update.struct$left.mapping)) {
# Line 217  Line 214 
214      }      }
215    
216      # Find indices to be updated for the right tree      # Find indices to be updated for the right tree
217      indices.mapping <- NULL      indices.mapping <- .find_indices(y)
     for (m in levels(as.factor(DMetaData(y)$MetaID))) {  
         indices <- (DMetaData(y)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
218    
219      # Update the DMetaData data frames for the right tree      # Update the DMetaData data frames for the right tree
220      for (i in 1:ncol(update.struct$right.mapping)) {      for (i in 1:ncol(update.struct$right.mapping)) {
# Line 247  Line 239 
239  {  {
240      args <- list(...)      args <- list(...)
241    
242      if (identical(length(args), 0))      if (identical(length(args), 0L))
243          return(x)          return(x)
244    
245      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 256  Line 248 
248      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
249          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
250    
251      Reduce(c2, base::c(list(x), args))      l <- base::c(list(x), args)
252        if (recursive)
253            Reduce(c2, l)
254        else {
255            l <- do.call("c", lapply(l, unclass))
256            .VCorpus(l,
257                     cmeta = .MetaDataNode(),
258                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
259        }
260  }  }
261    
262  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <- function(x, ..., recursive = FALSE) {
263      args <- list(...)      args <- list(...)
264    
265      if (identical(length(args), 0))      if (identical(length(args), 0L))
266          return(x)          return(x)
267    
268      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 280  Line 280 
280      invisible(x)      invisible(x)
281  }  }
282    
283  summary.Corpus <- function(x, ...) {  summary.Corpus <- function(object, ...) {
284      print(x)      print(object)
285      if (length(DMetaData(x)) > 0) {      if (length(DMetaData(object)) > 0) {
286          cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
287                               "\nThe metadata consists of %d tag-value pair and a data frame\n",                               "\nThe metadata consists of %d tag-value pair and a data frame\n",
288                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
289                      length(attr(CMetaData(x), "MetaData"))))                      length(CMetaData(object)$MetaData)))
290          cat("Available tags are:\n")          cat("Available tags are:\n")
291          cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")          cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
292          cat("Available variables in the data frame are:\n")          cat("Available variables in the data frame are:\n")
293          cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")          cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
294      }      }
295  }  }
296    

Legend:
Removed from v.987  
changed lines
  Added in v.1108

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge