SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1258, Fri Sep 20 12:15:42 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <-
4    function(x, cmeta, dmeta, dbcontrol)
5    {
6      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
7      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
8      attr(x, "DBControl") <- dbcontrol      attr(x, "DBControl") <- dbcontrol
9      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
10      x      x
11  }  }
 DBControl <- function(x) attr(x, "DBControl")  
12    
13  PCorpus <- function(x,  DBControl <-
14    function(x)
15        attr(x, "DBControl")
16    
17    PCorpus <-
18    function(x,
19                      readerControl = list(reader = x$DefaultReader, language = "en"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
20                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
21                      ...) {           ...)
22    {
23      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24    
25        if (is.function(readerControl$init))
26            readerControl$init()
27    
28        if (is.function(readerControl$exit))
29            on.exit(readerControl$exit())
30    
31      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
32          stop("error in creating database")          stop("error in creating database")
33      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
# Line 44  Line 57 
57      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58  }  }
59    
60  .VCorpus <- function(x, cmeta, dmeta) {  .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
64      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
65      class(x) <- c("VCorpus", "Corpus", "list")      class(x) <- c("VCorpus", "Corpus", "list")
66      x      x
67  }  }
68    
 # Register S3 corpus classes to be recognized by S4 methods. This is  
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
69  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
70  VCorpus <- Corpus <- function(x,  VCorpus <-
71    Corpus <-
72    function(x,
73                                readerControl = list(reader = x$DefaultReader, language = "en"),                                readerControl = list(reader = x$DefaultReader, language = "en"),
74                                ...) {           ...)
75    {
76      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
77    
78        if (is.function(readerControl$init))
79            readerControl$init()
80    
81        if (is.function(readerControl$exit))
82            on.exit(readerControl$exit())
83    
84      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
85      tdl <- if (x$Length > 0)      tdl <- if (x$Length > 0)
86          vector("list", as.integer(x$Length))          vector("list", as.integer(x$Length))
# Line 72  Line 90 
90      if (x$Vectorized)      if (x$Vectorized)
91          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
92                        pGetElem(x),                        pGetElem(x),
93                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,
94                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
95      else {      else {
96          counter <- 1          counter <- 1
97          while (!eoi(x)) {          while (!eoi(x)) {
98              x <- stepNext(x)              x <- stepNext(x)
99              elem <- getElem(x)              elem <- getElem(x)
100              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$Names) || is.na(x$Names))
101                    as.character(counter)
102                else
103                    x$Names[counter]
104                doc <- readerControl$reader(elem, readerControl$language, id)
105              if (x$Length > 0)              if (x$Length > 0)
106                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
107              else              else
# Line 92  Line 114 
114      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
115  }  }
116    
117  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <-
118    function(x, i)
119    {
120      if (missing(i)) return(x)      if (missing(i)) return(x)
121      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
122      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
# Line 100  Line 124 
124      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
125  }  }
126    
127  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <-
128    function(x, i)
129    {
130      if (missing(i)) return(x)      if (missing(i)) return(x)
131      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
132  }  }
133    
134  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <-
135    function(x, i, value)
136    {
137      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
138      counter <- 1      counter <- 1
139      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
# Line 116  Line 144 
144      x      x
145  }  }
146    
147  .map_name_index <- function(x, i) {  .map_name_index <-
148    function(x, i)
149    {
150      if (is.character(i)) {      if (is.character(i)) {
151          if (is.null(names(x)))          if (is.null(names(x)))
152              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "ID", type = "local"))
# Line 126  Line 156 
156      i      i
157  }  }
158    
159  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
160    function(x, i)
161    {
162      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
163      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
164      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
165  }  }
166  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
167    function(x, i)
168    {
169      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
170      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
171      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
# Line 139  Line 173 
173      NextMethod("[[")      NextMethod("[[")
174  }  }
175    
176  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
177    function(x, i, value)
178    {
179      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
180      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
181      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
182      db[[index]] <- value      db[[index]] <- value
183      x      x
184  }  }
185  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
186    function(x, i, value)
187    {
188      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
189      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
190      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 162  Line 200 
200  }  }
201    
202  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
203  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
204    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
205    {
206      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
207      set_id <- function(x) {      set_id <- function(x) {
208          x$NodeID <- id          x$NodeID <- id
# Line 187  Line 227 
227  }  }
228    
229  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
230  .find_indices <- function(x) {  .find_indices <-
231    function(x)
232    {
233      indices.mapping <- NULL      indices.mapping <- NULL
234      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(DMetaData(x)$MetaID))) {
235          indices <- (DMetaData(x)$MetaID == m)          indices <- (DMetaData(x)$MetaID == m)
# Line 197  Line 239 
239      indices.mapping      indices.mapping
240  }  }
241    
242  c2 <- function(x, y, ...) {  c2 <-
243    function(x, y, ...)
244    {
245      # Update the CMetaData tree      # Update the CMetaData tree
246      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
247      update.struct <- .update_id(cmeta)      update.struct <- .update_id(cmeta)
# Line 224  Line 268 
268    
269      # Merge the DMetaData data frames      # Merge the DMetaData data frames
270      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
271      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      na.matrix <- matrix(NA,
272                            nrow = nrow(DMetaData(x)),
273                            ncol = length(labels),
274                            dimnames = list(row.names(DMetaData(x)), labels))
275      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
276      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
277      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      na.matrix <- matrix(NA,
278                            nrow = nrow(DMetaData(y)),
279                            ncol = length(labels),
280                            dimnames = list(row.names(DMetaData(y)), labels))
281      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
282      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
283    
# Line 235  Line 285 
285  }  }
286    
287  c.Corpus <-  c.Corpus <-
288  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
289  {  {
290      args <- list(...)      args <- list(...)
291        x <- args[[1L]]
292    
293      if (identical(length(args), 0L))      if(length(args) == 1L)
294          return(x)          return(x)
295    
296      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 248  Line 299 
299      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
300          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
301    
     l <- base::c(list(x), args)  
302      if (recursive)      if (recursive)
303          Reduce(c2, l)          Reduce(c2, args)
304      else {      else {
305          l <- do.call("c", lapply(l, unclass))          args <- do.call("c", lapply(args, unclass))
306          .VCorpus(l,          .VCorpus(args,
307                   cmeta = .MetaDataNode(),                   cmeta = .MetaDataNode(),
308                   dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))                   dmeta = data.frame(MetaID = rep(0, length(args)),
309                                        stringsAsFactors = FALSE))
310      }      }
311  }  }
312    
313  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
314    function(..., recursive = FALSE)
315    {
316      args <- list(...)      args <- list(...)
317        x <- args[[1L]]
318    
319      if (identical(length(args), 0L))      if(length(args) == 1L)
320          return(x)          return(x)
321    
322      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
323          stop("not all arguments are text documents")          stop("not all arguments are text documents")
324    
325      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      dmeta <- data.frame(MetaID = rep(0, length(args)),
326      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                          stringsAsFactors = FALSE)
327        .VCorpus(args, .MetaDataNode(), dmeta)
328  }  }
329    
330  print.Corpus <- function(x, ...) {  print.Corpus <-
331    function(x, ...)
332    {
333      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
334                           "A corpus with %d text document\n",                           "A corpus with %d text document\n",
335                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n"),
# Line 280  Line 337 
337      invisible(x)      invisible(x)
338  }  }
339    
340  summary.Corpus <- function(object, ...) {  summary.Corpus <-
341    function(object, ...)
342    {
343      print(object)      print(object)
344      if (length(DMetaData(object)) > 0) {      if (length(DMetaData(object)) > 0) {
345          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
# Line 294  Line 353 
353      }      }
354  }  }
355    
356  inspect <- function(x) UseMethod("inspect", x)  inspect <-
357  inspect.PCorpus <- function(x) {  function(x)
358        UseMethod("inspect", x)
359    inspect.PCorpus <-
360    function(x)
361    {
362      summary(x)      summary(x)
363      cat("\n")      cat("\n")
364      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
365      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
366  }  }
367  inspect.VCorpus <- function(x) {  inspect.VCorpus <-
368    function(x)
369    {
370      summary(x)      summary(x)
371      cat("\n")      cat("\n")
372      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
373  }  }
374    
375  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <-
376    function(X, FUN, ...)
377    {
378      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
379      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
380  }  }
381  lapply.VCorpus <- function(X, FUN, ...) {  lapply.VCorpus <-
382    function(X, FUN, ...)
383    {
384      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
385      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
386          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
387      base::lapply(X, FUN, ...)      base::lapply(X, FUN, ...)
388  }  }
389    
390  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
391    function(x, path = ".", filenames = NULL)
392    {
393      filenames <- file.path(path,      filenames <- file.path(path,
394                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
395                             else filenames)                             else filenames)

Legend:
Removed from v.1108  
changed lines
  Added in v.1258

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge