SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 986, Tue Sep 1 15:33:30 2009 UTC revision 1114, Fri Nov 26 14:05:54 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
 prepareReader <- function(readerControl, defaultReader = NULL, ...) {  
     if (is.null(readerControl$reader))  
         readerControl$reader <- defaultReader  
     if (inherits(readerControl$reader, "FunctionGenerator"))  
         readerControl$reader <- readerControl$reader(...)  
     if (is.null(readerControl$language))  
         readerControl$language <- "eng"  
     readerControl  
 }  
   
 # Node ID, actual meta data, and possibly other nodes as children  
 .MetaDataNode <- function(nodeid = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {  
     structure(list(NodeID = nodeid, MetaData = meta, Children = children),  
               class = "MetaDataNode")  
 }  
   
 print.MetaDataNode <- function(x, ...)  
     print(x$MetaData)  
   
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
5      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
# Line 26  Line 7 
7      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
8      x      x
9  }  }
10    DBControl <- function(x) attr(x, "DBControl")
11    
12  PCorpus <- function(x,  PCorpus <- function(x,
13                      readerControl = list(reader = x$DefaultReader, language = "eng"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
15                      ...) {                      ...) {
16      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (is.function(readerControl$init))
19            readerControl$init()
20    
21        if (is.function(readerControl$exit))
22            on.exit(readerControl$exit())
23    
24      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
25          stop("error in creating database")          stop("error in creating database")
26      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
# Line 47  Line 35 
35      while (!eoi(x)) {      while (!eoi(x)) {
36          x <- stepNext(x)          x <- stepNext(x)
37          elem <- getElem(x)          elem <- getElem(x)
38          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
39          filehash::dbInsert(db, ID(doc), doc)          filehash::dbInsert(db, ID(doc), doc)
40          if (x$Length > 0) tdl[[counter]] <- ID(doc)          if (x$Length > 0) tdl[[counter]] <- ID(doc)
41          else tdl <- c(tdl, ID(doc))          else tdl <- c(tdl, ID(doc))
42          counter <- counter + 1          counter <- counter + 1
43      }      }
44        names(tdl) <- x$Names
45    
46      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
47      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 68  Line 57 
57      x      x
58  }  }
59    
60    # Register S3 corpus classes to be recognized by S4 methods. This is
61    # mainly a fix to be compatible with packages which were originally
62    # developed to cooperate with corresponding S4 tm classes. Necessary
63    # since tm's class architecture was changed to S3 since tm version 0.5.
64    setOldClass(c("VCorpus", "Corpus", "list"))
65    
66  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
67  VCorpus <- Corpus <- function(x,  VCorpus <- Corpus <- function(x,
68                      readerControl = list(reader = x$DefaultReader, language = "eng"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
69                      ...) {                      ...) {
70      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
71    
72        if (is.function(readerControl$init))
73            readerControl$init()
74    
75        if (is.function(readerControl$exit))
76            on.exit(readerControl$exit())
77    
78      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
79      tdl <- if (x$Length > 0)      tdl <- if (x$Length > 0)
80          vector("list", as.integer(x$Length))          vector("list", as.integer(x$Length))
# Line 81  Line 82 
82          list()          list()
83    
84      if (x$Vectorized)      if (x$Vectorized)
85          mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
86                 pGetElem(x),                 pGetElem(x),
87                 id = as.character(seq_len(x$Length)),                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
88                 SIMPLIFY = FALSE)                 SIMPLIFY = FALSE)
89      else {      else {
90          counter <- 1          counter <- 1
91          while (!eoi(x)) {          while (!eoi(x)) {
92              x <- stepNext(x)              x <- stepNext(x)
93              elem <- getElem(x)              elem <- getElem(x)
94              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
95              if (x$Length > 0)              if (x$Length > 0)
96                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
97              else              else
# Line 98  Line 99 
99              counter <- counter + 1              counter <- counter + 1
100          }          }
101      }      }
102        names(tdl) <- x$Names
103      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
104      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
105  }  }
106    
107  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <- function(x, i) {
108      if (missing(i)) return(x)      if (missing(i)) return(x)
     cmeta <- CMetaData(x)  
109      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
110      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111      dmeta <- attr(x, "DMetaData")      dmeta <- attr(x, "DMetaData")
112      dbcontrol <- DBControl(x)      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
     class(x) <- "list"  
     .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)  
113  }  }
114    
115  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <- function(x, i) {
116      if (missing(i)) return(x)      if (missing(i)) return(x)
117      cmeta <- CMetaData(x)      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
     dmeta <- DMetaData(x)[i, , drop = FALSE]  
     class(x) <- "list"  
     .VCorpus(x[i, drop = FALSE], cmeta, dmeta)  
118  }  }
119    
120  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <- function(x, i, value) {
121      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
122      counter <- 1      counter <- 1
123      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
124          if (identical(length(value), 1)) db[[id]] <- value          if (identical(length(value), 1L)) db[[id]] <- value
125          else db[[id]] <- value[[counter]]          else db[[id]] <- value[[counter]]
126          counter <- counter + 1          counter <- counter + 1
127      }      }
128      x      x
129  }  }
130    
131    .map_name_index <- function(x, i) {
132        if (is.character(i)) {
133            if (is.null(names(x)))
134                match(i, meta(x, "ID", type = "local"))
135            else
136                match(i, names(x))
137        }
138        i
139    }
140    
141  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-  function(x, i) {
142        i <- .map_name_index(x, i)
143      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144      class(x) <- "list"      filehash::dbFetch(db, NextMethod("[["))
     filehash::dbFetch(db, x[[i]])  
145  }  }
146  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-  function(x, i) {
147        i <- .map_name_index(x, i)
148      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
149      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
150          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
151      class(x) <- "list"      NextMethod("[[")
     x[[i]]  
152  }  }
153    
154  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-  function(x, i, value) {
155        i <- .map_name_index(x, i)
156      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
157      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
158      db[[index]] <- value      db[[index]] <- value
159      x      x
160  }  }
161  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-  function(x, i, value) {
162        i <- .map_name_index(x, i)
163      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
164      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
165      if (!is.null(lazyTmMap)) {      if (!is.null(lazyTmMap)) {
# Line 161  Line 168 
168      }      }
169      # Set the value      # Set the value
170      cl <- class(x)      cl <- class(x)
171      class(x) <- "list"      y <- NextMethod("[[<-")
172      x[[i]] <- value      class(y) <- cl
173      class(x) <- cl      y
     x  
174  }  }
175    
176  # Update \code{NodeID}s of a CMetaData tree  # Update NodeIDs of a CMetaData tree
177  update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
178      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s      # Traversal of (binary) CMetaData tree with setup of NodeIDs
179      set_id <- function(x) {      set_id <- function(x) {
180          attrs <- attributes(x)          x$NodeID <- id
         x <- id  
         attributes(x) <- attrs  
181          id <<- id + 1          id <<- id + 1
182          level <<- level + 1          level <<- level + 1
183          if (length(attr(x, "Children")) > 0) {          if (length(x$Children) > 0) {
184              mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
185              left <- set_id(attr(x, "Children")[[1]])              left <- set_id(x$Children[[1]])
186              if (level == 1) {              if (level == 1) {
187                  left.mapping <<- mapping                  left.mapping <<- mapping
188                  mapping <<- NULL                  mapping <<- NULL
189              }              }
190              mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))              mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
191              right <- set_id(attr(x, "Children")[[2]])              right <- set_id(x$Children[[2]])
192    
193              attr(x, "Children") <- list(left, right)              x$Children <- list(left, right)
194          }          }
195          level <<- level - 1          level <<- level - 1
196          x          x
197      }      }
   
198      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
199  }  }
200    
201    # Find indices to be updated for a CMetaData tree
202    .find_indices <- function(x) {
203        indices.mapping <- NULL
204        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
205            indices <- (DMetaData(x)$MetaID == m)
206            indices.mapping <- c(indices.mapping, list(m = indices))
207            names(indices.mapping)[length(indices.mapping)] <- m
208        }
209        indices.mapping
210    }
211    
212  c2 <- function(x, y, ...) {  c2 <- function(x, y, ...) {
213      # Update the CMetaData tree      # Update the CMetaData tree
214      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
215      update.struct <- update_id(cmeta)      update.struct <- .update_id(cmeta)
216    
217      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
218    
219      # Find indices to be updated for the left tree      # Find indices to be updated for the left tree
220      indices.mapping <- NULL      indices.mapping <- .find_indices(x)
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
221    
222      # Update the DMetaData data frames for the left tree      # Update the DMetaData data frames for the left tree
223      for (i in 1:ncol(update.struct$left.mapping)) {      for (i in 1:ncol(update.struct$left.mapping)) {
# Line 217  Line 226 
226      }      }
227    
228      # Find indices to be updated for the right tree      # Find indices to be updated for the right tree
229      indices.mapping <- NULL      indices.mapping <- .find_indices(y)
     for (m in levels(as.factor(DMetaData(y)$MetaID))) {  
         indices <- (DMetaData(y)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
230    
231      # Update the DMetaData data frames for the right tree      # Update the DMetaData data frames for the right tree
232      for (i in 1:ncol(update.struct$right.mapping)) {      for (i in 1:ncol(update.struct$right.mapping)) {
# Line 247  Line 251 
251  {  {
252      args <- list(...)      args <- list(...)
253    
254      if (identical(length(args), 0))      if (identical(length(args), 0L))
255          return(x)          return(x)
256    
257      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 256  Line 260 
260      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
261          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
262    
263      Reduce(c2, base::c(list(x), args))      l <- base::c(list(x), args)
264        if (recursive)
265            Reduce(c2, l)
266        else {
267            l <- do.call("c", lapply(l, unclass))
268            .VCorpus(l,
269                     cmeta = .MetaDataNode(),
270                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
271        }
272  }  }
273    
274  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <- function(x, ..., recursive = FALSE) {
275      args <- list(...)      args <- list(...)
276    
277      if (identical(length(args), 0))      if (identical(length(args), 0L))
278          return(x)          return(x)
279    
280      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 280  Line 292 
292      invisible(x)      invisible(x)
293  }  }
294    
295  summary.Corpus <- function(x, ...) {  summary.Corpus <- function(object, ...) {
296      print(x)      print(object)
297      if (length(DMetaData(x)) > 0) {      if (length(DMetaData(object)) > 0) {
298          cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
299                               "\nThe metadata consists of %d tag-value pair and a data frame\n",                               "\nThe metadata consists of %d tag-value pair and a data frame\n",
300                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
301                      length(attr(CMetaData(x), "MetaData"))))                      length(CMetaData(object)$MetaData)))
302          cat("Available tags are:\n")          cat("Available tags are:\n")
303          cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")          cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
304          cat("Available variables in the data frame are:\n")          cat("Available variables in the data frame are:\n")
305          cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")          cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
306      }      }
307  }  }
308    
# Line 307  Line 319 
319      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
320  }  }
321    
 # No metadata is checked  
 `%IN%` <- function(x, y) UseMethod("%IN%", y)  
 `%IN%.PCorpus` <- function(x, y) {  
     db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])  
     any(unlist(lapply(y, function(x, z) {x %in% Content(z)}, x)))  
 }  
 `%IN%.VCorpus` <- function(x, y) x %in% y  
   
322  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <- function(X, FUN, ...) {
323      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
324      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)

Legend:
Removed from v.986  
changed lines
  Added in v.1114

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge