SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1300, Fri Mar 21 14:30:05 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <-
4    function(x, cmeta, dmeta, dbcontrol)
5    {
6      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
7      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
8      attr(x, "DBControl") <- dbcontrol      attr(x, "DBControl") <- dbcontrol
9      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
10      x      x
11  }  }
 DBControl <- function(x) attr(x, "DBControl")  
12    
13  PCorpus <- function(x,  DBControl <-
14    function(x)
15        attr(x, "DBControl")
16    
17    PCorpus <-
18    function(x,
19                      readerControl = list(reader = x$DefaultReader, language = "en"),                      readerControl = list(reader = x$DefaultReader, language = "en"),
20                      dbControl = list(dbName = "", dbType = "DB1"),           dbControl = list(dbName = "", dbType = "DB1"))
21                      ...) {  {
22      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      stopifnot(inherits(x, "Source"))
23    
24        readerControl <- prepareReader(readerControl, x$DefaultReader)
25    
26        if (is.function(readerControl$init))
27            readerControl$init()
28    
29        if (is.function(readerControl$exit))
30            on.exit(readerControl$exit())
31    
32      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
33          stop("error in creating database")          stop("error in creating database")
# Line 29  Line 43 
43      while (!eoi(x)) {      while (!eoi(x)) {
44          x <- stepNext(x)          x <- stepNext(x)
45          elem <- getElem(x)          elem <- getElem(x)
46          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$Names) || is.na(x$Names))
47          filehash::dbInsert(db, ID(doc), doc)                  as.character(counter)
48          if (x$Length > 0) tdl[[counter]] <- ID(doc)              else
49          else tdl <- c(tdl, ID(doc))                  x$Names[counter]
50            doc <- readerControl$reader(elem, readerControl$language, id)
51            filehash::dbInsert(db, meta(doc, "ID"), doc)
52            if (x$Length > 0) tdl[[counter]] <- meta(doc, "ID")
53            else tdl <- c(tdl, meta(doc, "ID"))
54          counter <- counter + 1          counter <- counter + 1
55      }      }
56        if (!is.null(x$Names) && !is.na(x$Names))
57      names(tdl) <- x$Names      names(tdl) <- x$Names
58    
59      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
# Line 44  Line 63 
63      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
64  }  }
65    
66  .VCorpus <- function(x, cmeta, dmeta) {  .VCorpus <-
67    function(x, cmeta, dmeta)
68    {
69      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
70      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
71      class(x) <- c("VCorpus", "Corpus", "list")      class(x) <- c("VCorpus", "Corpus", "list")
72      x      x
73  }  }
74    
75  # Register S3 corpus classes to be recognized by S4 methods. This is  VCorpus <-
76  # mainly a fix to be compatible with packages which were originally  Corpus <-
77  # developed to cooperate with corresponding S4 tm classes. Necessary  function(x, readerControl = list(reader = x$DefaultReader, language = "en"))
78  # since tm's class architecture was changed to S3 since tm version 0.5.  {
79  setOldClass(c("VCorpus", "Corpus", "list"))      stopifnot(inherits(x, "Source"))
80    
81  # The "..." are additional arguments for the FunctionGenerator reader      readerControl <- prepareReader(readerControl, x$DefaultReader)
82  VCorpus <- Corpus <- function(x,  
83                                readerControl = list(reader = x$DefaultReader, language = "en"),      if (is.function(readerControl$init))
84                                ...) {          readerControl$init()
85      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
86        if (is.function(readerControl$exit))
87            on.exit(readerControl$exit())
88    
89      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
90      tdl <- if (x$Length > 0)      tdl <- if (x$Length > 0)
# Line 72  Line 95 
95      if (x$Vectorized)      if (x$Vectorized)
96          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
97                        pGetElem(x),                        pGetElem(x),
98                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,
99                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
100      else {      else {
101          counter <- 1          counter <- 1
102          while (!eoi(x)) {          while (!eoi(x)) {
103              x <- stepNext(x)              x <- stepNext(x)
104              elem <- getElem(x)              elem <- getElem(x)
105              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$Names) || is.na(x$Names))
106                    as.character(counter)
107                else
108                    x$Names[counter]
109                doc <- readerControl$reader(elem, readerControl$language, id)
110              if (x$Length > 0)              if (x$Length > 0)
111                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
112              else              else
# Line 87  Line 114 
114              counter <- counter + 1              counter <- counter + 1
115          }          }
116      }      }
117        if (!is.null(x$Names) && !is.na(x$Names))
118      names(tdl) <- x$Names      names(tdl) <- x$Names
119      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
120      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
121  }  }
122    
123  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <-
124    function(x, i)
125    {
126      if (missing(i)) return(x)      if (missing(i)) return(x)
127      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
128      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
# Line 100  Line 130 
130      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
131  }  }
132    
133  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <-
134    function(x, i)
135    {
136      if (missing(i)) return(x)      if (missing(i)) return(x)
137      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
138  }  }
139    
140  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <-
141    function(x, i, value)
142    {
143      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144      counter <- 1      counter <- 1
145      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
# Line 116  Line 150 
150      x      x
151  }  }
152    
153  .map_name_index <- function(x, i) {  .map_name_index <-
154    function(x, i)
155    {
156      if (is.character(i)) {      if (is.character(i)) {
157          if (is.null(names(x)))          if (is.null(names(x)))
158              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "ID", type = "local"))
# Line 126  Line 162 
162      i      i
163  }  }
164    
165  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
166    function(x, i)
167    {
168      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
169      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
170      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
171  }  }
172  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
173    function(x, i)
174    {
175      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
176      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
177      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
# Line 139  Line 179 
179      NextMethod("[[")      NextMethod("[[")
180  }  }
181    
182  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
183    function(x, i, value)
184    {
185      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
186      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
187      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
188      db[[index]] <- value      db[[index]] <- value
189      x      x
190  }  }
191  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
192    function(x, i, value)
193    {
194      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
195      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
196      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 162  Line 206 
206  }  }
207    
208  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
209  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
210    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
211    {
212      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
213      set_id <- function(x) {      set_id <- function(x) {
214          x$NodeID <- id          x$NodeID <- id
215          id <<- id + 1          id <<- id + 1
216          level <<- level + 1          level <<- level + 1
217          if (length(x$Children) > 0) {          if (length(x$Children)) {
218              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
219              left <- set_id(x$Children[[1]])              left <- set_id(x$Children[[1]])
220              if (level == 1) {              if (level == 1) {
# Line 187  Line 233 
233  }  }
234    
235  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
236  .find_indices <- function(x) {  .find_indices <-
237    function(x)
238    {
239      indices.mapping <- NULL      indices.mapping <- NULL
240      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(DMetaData(x)$MetaID))) {
241          indices <- (DMetaData(x)$MetaID == m)          indices <- (DMetaData(x)$MetaID == m)
# Line 197  Line 245 
245      indices.mapping      indices.mapping
246  }  }
247    
248  c2 <- function(x, y, ...) {  c2 <-
249    function(x, y, ...)
250    {
251      # Update the CMetaData tree      # Update the CMetaData tree
252      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
253      update.struct <- .update_id(cmeta)      update.struct <- .update_id(cmeta)
# Line 224  Line 274 
274    
275      # Merge the DMetaData data frames      # Merge the DMetaData data frames
276      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
277      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      na.matrix <- matrix(NA,
278                            nrow = nrow(DMetaData(x)),
279                            ncol = length(labels),
280                            dimnames = list(row.names(DMetaData(x)), labels))
281      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
282      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
283      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      na.matrix <- matrix(NA,
284                            nrow = nrow(DMetaData(y)),
285                            ncol = length(labels),
286                            dimnames = list(row.names(DMetaData(y)), labels))
287      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
288      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
289    
# Line 235  Line 291 
291  }  }
292    
293  c.Corpus <-  c.Corpus <-
294  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
295  {  {
296      args <- list(...)      args <- list(...)
297        x <- args[[1L]]
298    
299      if (identical(length(args), 0L))      if(length(args) == 1L)
300          return(x)          return(x)
301    
302      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 248  Line 305 
305      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
306          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
307    
     l <- base::c(list(x), args)  
308      if (recursive)      if (recursive)
309          Reduce(c2, l)          Reduce(c2, args)
310      else {      else {
311          l <- do.call("c", lapply(l, unclass))          args <- do.call("c", lapply(args, unclass))
312          .VCorpus(l,          .VCorpus(args,
313                   cmeta = .MetaDataNode(),                   cmeta = .MetaDataNode(),
314                   dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))                   dmeta = data.frame(MetaID = rep(0, length(args)),
315                                        stringsAsFactors = FALSE))
316      }      }
317  }  }
318    
319  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
320    function(..., recursive = FALSE)
321    {
322      args <- list(...)      args <- list(...)
323        x <- args[[1L]]
324    
325      if (identical(length(args), 0L))      if(length(args) == 1L)
326          return(x)          return(x)
327    
328      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
329          stop("not all arguments are text documents")          stop("not all arguments are text documents")
330    
331      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      dmeta <- data.frame(MetaID = rep(0, length(args)),
332      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                          stringsAsFactors = FALSE)
333        .VCorpus(args, .MetaDataNode(), dmeta)
334  }  }
335    
336  print.Corpus <- function(x, ...) {  print.Corpus <-
337    function(x, ...)
338    {
339      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
340                           "A corpus with %d text document\n",                           "A corpus with %d text document\n",
341                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n"),
# Line 280  Line 343 
343      invisible(x)      invisible(x)
344  }  }
345    
346  summary.Corpus <- function(object, ...) {  summary.Corpus <-
347    function(object, ...)
348    {
349      print(object)      print(object)
350      if (length(DMetaData(object)) > 0) {      if (length(DMetaData(object))) {
351          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
352                               "\nThe metadata consists of %d tag-value pair and a data frame\n",                               "\nThe metadata consists of %d tag-value pair and a data frame\n",
353                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
# Line 294  Line 359 
359      }      }
360  }  }
361    
362  inspect <- function(x) UseMethod("inspect", x)  inspect <-
363  inspect.PCorpus <- function(x) {  function(x)
364        UseMethod("inspect", x)
365    inspect.PCorpus <-
366    function(x)
367    {
368      summary(x)      summary(x)
369      cat("\n")      cat("\n")
370      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
371      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
372  }  }
373  inspect.VCorpus <- function(x) {  inspect.VCorpus <-
374    function(x)
375    {
376      summary(x)      summary(x)
377      cat("\n")      cat("\n")
378      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
379  }  }
380    
381  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <-
382    function(X, FUN, ...)
383    {
384      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
385      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
386  }  }
387  lapply.VCorpus <- function(X, FUN, ...) {  lapply.VCorpus <-
388    function(X, FUN, ...)
389    {
390      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
391      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
392          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
393      base::lapply(X, FUN, ...)      base::lapply(X, FUN, ...)
394  }  }
395    
396  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
397    function(x, path = ".", filenames = NULL)
398    {
399      filenames <- file.path(path,      filenames <- file.path(path,
400                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", meta(x, "ID"))))
401                             else filenames)                             else filenames)
402      i <- 1      i <- 1
403      for (o in x) {      for (o in x) {

Legend:
Removed from v.1108  
changed lines
  Added in v.1300

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge