SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1306, Tue Mar 25 08:37:05 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  .PCorpus <-
4    function(x, cmeta, dmeta, dbcontrol)
5    {
6      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
7      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
8      attr(x, "DBControl") <- dbcontrol      attr(x, "DBControl") <- dbcontrol
9      class(x) <- c("PCorpus", "Corpus", "list")      class(x) <- c("PCorpus", "Corpus", "list")
10      x      x
11  }  }
 DBControl <- function(x) attr(x, "DBControl")  
12    
13  PCorpus <- function(x,  DBControl <-
14                      readerControl = list(reader = x$DefaultReader, language = "en"),  function(x)
15                      dbControl = list(dbName = "", dbType = "DB1"),      attr(x, "DBControl")
16                      ...) {  
17      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  PCorpus <-
18    function(x,
19             readerControl = list(reader = x$defaultreader, language = "en"),
20             dbControl = list(dbName = "", dbType = "DB1"))
21    {
22        stopifnot(inherits(x, "Source"))
23    
24        readerControl <- prepareReader(readerControl, x$defaultreader)
25    
26      if (is.function(readerControl$init))      if (is.function(readerControl$init))
27          readerControl$init()          readerControl$init()
# Line 26  Line 34 
34      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
35    
36      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
37      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
38          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
39      else      else
40          list()          list()
41    
# Line 35  Line 43 
43      while (!eoi(x)) {      while (!eoi(x)) {
44          x <- stepNext(x)          x <- stepNext(x)
45          elem <- getElem(x)          elem <- getElem(x)
46          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$names) || is.na(x$names))
47          filehash::dbInsert(db, ID(doc), doc)                  as.character(counter)
48          if (x$Length > 0) tdl[[counter]] <- ID(doc)              else
49          else tdl <- c(tdl, ID(doc))                  x$names[counter]
50            doc <- readerControl$reader(elem, readerControl$language, id)
51            filehash::dbInsert(db, meta(doc, "id"), doc)
52            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
53            else tdl <- c(tdl, meta(doc, "id"))
54          counter <- counter + 1          counter <- counter + 1
55      }      }
56      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
57            names(tdl) <- x$names
58    
59      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
60      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 50  Line 63 
63      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
64  }  }
65    
66  .VCorpus <- function(x, cmeta, dmeta) {  .VCorpus <-
67    function(x, cmeta, dmeta)
68    {
69      attr(x, "CMetaData") <- cmeta      attr(x, "CMetaData") <- cmeta
70      attr(x, "DMetaData") <- dmeta      attr(x, "DMetaData") <- dmeta
71      class(x) <- c("VCorpus", "Corpus", "list")      class(x) <- c("VCorpus", "Corpus", "list")
72      x      x
73  }  }
74    
75  # Register S3 corpus classes to be recognized by S4 methods. This is  VCorpus <-
76  # mainly a fix to be compatible with packages which were originally  Corpus <-
77  # developed to cooperate with corresponding S4 tm classes. Necessary  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
78  # since tm's class architecture was changed to S3 since tm version 0.5.  {
79  setOldClass(c("VCorpus", "Corpus", "list"))      stopifnot(inherits(x, "Source"))
80    
81  # The "..." are additional arguments for the FunctionGenerator reader      readerControl <- prepareReader(readerControl, x$defaultreader)
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
82    
83      if (is.function(readerControl$init))      if (is.function(readerControl$init))
84          readerControl$init()          readerControl$init()
# Line 76  Line 87 
87          on.exit(readerControl$exit())          on.exit(readerControl$exit())
88    
89      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
90      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
91          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
92      else      else
93          list()          list()
94    
95      if (x$Vectorized)      if (x$vectorized)
96          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id) readerControl$reader(elem, readerControl$language, id),
97                        pGetElem(x),                        pGetElem(x),
98                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names)) as.character(seq_len(x$length)) else x$names,
99                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
100      else {      else {
101          counter <- 1          counter <- 1
102          while (!eoi(x)) {          while (!eoi(x)) {
103              x <- stepNext(x)              x <- stepNext(x)
104              elem <- getElem(x)              elem <- getElem(x)
105              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$names) || is.na(x$names))
106              if (x$Length > 0)                  as.character(counter)
107                else
108                    x$names[counter]
109                doc <- readerControl$reader(elem, readerControl$language, id)
110                if (x$length > 0)
111                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
112              else              else
113                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
114              counter <- counter + 1              counter <- counter + 1
115          }          }
116      }      }
117      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
118            names(tdl) <- x$names
119      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
120      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
121  }  }
122    
123  `[.PCorpus` <- function(x, i) {  `[.PCorpus` <-
124    function(x, i)
125    {
126      if (missing(i)) return(x)      if (missing(i)) return(x)
127      index <- attr(x, "DMetaData")[[1 , "subset"]]      index <- attr(x, "DMetaData")[[1 , "subset"]]
128      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
# Line 112  Line 130 
130      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
131  }  }
132    
133  `[.VCorpus` <- function(x, i) {  `[.VCorpus` <-
134    function(x, i)
135    {
136      if (missing(i)) return(x)      if (missing(i)) return(x)
137      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
138  }  }
139    
140  `[<-.PCorpus` <- function(x, i, value) {  `[<-.PCorpus` <-
141    function(x, i, value)
142    {
143      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144      counter <- 1      counter <- 1
145      for (id in unclass(x)[i]) {      for (id in unclass(x)[i]) {
# Line 128  Line 150 
150      x      x
151  }  }
152    
153  .map_name_index <- function(x, i) {  .map_name_index <-
154    function(x, i)
155    {
156      if (is.character(i)) {      if (is.character(i)) {
157          if (is.null(names(x)))          if (is.null(names(x)))
158              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "id", type = "local"))
159          else          else
160              match(i, names(x))              match(i, names(x))
161      }      }
162      i      i
163  }  }
164    
165  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
166    function(x, i)
167    {
168      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
169      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
170      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, NextMethod("[["))
171  }  }
172  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
173    function(x, i)
174    {
175      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
176      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
177      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
# Line 151  Line 179 
179      NextMethod("[[")      NextMethod("[[")
180  }  }
181    
182  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
183    function(x, i, value)
184    {
185      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
186      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
187      index <- unclass(x)[[i]]      index <- unclass(x)[[i]]
188      db[[index]] <- value      db[[index]] <- value
189      x      x
190  }  }
191  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
192    function(x, i, value)
193    {
194      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
195      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
196      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 174  Line 206 
206  }  }
207    
208  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
209  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
210    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
211    {
212      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
213      set_id <- function(x) {      set_id <- function(x) {
214          x$NodeID <- id          x$NodeID <- id
215          id <<- id + 1          id <<- id + 1
216          level <<- level + 1          level <<- level + 1
217          if (length(x$Children) > 0) {          if (length(x$Children)) {
218              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
219              left <- set_id(x$Children[[1]])              left <- set_id(x$Children[[1]])
220              if (level == 1) {              if (level == 1) {
# Line 199  Line 233 
233  }  }
234    
235  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
236  .find_indices <- function(x) {  .find_indices <-
237    function(x)
238    {
239      indices.mapping <- NULL      indices.mapping <- NULL
240      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(DMetaData(x)$MetaID))) {
241          indices <- (DMetaData(x)$MetaID == m)          indices <- (DMetaData(x)$MetaID == m)
# Line 209  Line 245 
245      indices.mapping      indices.mapping
246  }  }
247    
248  c2 <- function(x, y, ...) {  c2 <-
249    function(x, y, ...)
250    {
251      # Update the CMetaData tree      # Update the CMetaData tree
252      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
253      update.struct <- .update_id(cmeta)      update.struct <- .update_id(cmeta)
# Line 236  Line 274 
274    
275      # Merge the DMetaData data frames      # Merge the DMetaData data frames
276      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
277      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      na.matrix <- matrix(NA,
278                            nrow = nrow(DMetaData(x)),
279                            ncol = length(labels),
280                            dimnames = list(row.names(DMetaData(x)), labels))
281      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
282      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
283      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      na.matrix <- matrix(NA,
284                            nrow = nrow(DMetaData(y)),
285                            ncol = length(labels),
286                            dimnames = list(row.names(DMetaData(y)), labels))
287      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
288      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
289    
# Line 247  Line 291 
291  }  }
292    
293  c.Corpus <-  c.Corpus <-
294  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
295  {  {
296      args <- list(...)      args <- list(...)
297        x <- args[[1L]]
298    
299      if (identical(length(args), 0L))      if(length(args) == 1L)
300          return(x)          return(x)
301    
302      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
# Line 260  Line 305 
305      if (inherits(x, "PCorpus"))      if (inherits(x, "PCorpus"))
306          stop("concatenation of corpora with underlying databases is not supported")          stop("concatenation of corpora with underlying databases is not supported")
307    
     l <- base::c(list(x), args)  
308      if (recursive)      if (recursive)
309          Reduce(c2, l)          Reduce(c2, args)
310      else {      else {
311          l <- do.call("c", lapply(l, unclass))          args <- do.call("c", lapply(args, unclass))
312          .VCorpus(l,          .VCorpus(args,
313                   cmeta = .MetaDataNode(),                   cmeta = .MetaDataNode(),
314                   dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))                   dmeta = data.frame(MetaID = rep(0, length(args)),
315                                        stringsAsFactors = FALSE))
316      }      }
317  }  }
318    
319  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
320    function(..., recursive = FALSE)
321    {
322      args <- list(...)      args <- list(...)
323        x <- args[[1L]]
324    
325      if (identical(length(args), 0L))      if(length(args) == 1L)
326          return(x)          return(x)
327    
328      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
329          stop("not all arguments are text documents")          stop("not all arguments are text documents")
330    
331      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      dmeta <- data.frame(MetaID = rep(0, length(args)),
332      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                          stringsAsFactors = FALSE)
333        .VCorpus(args, .MetaDataNode(), dmeta)
334  }  }
335    
336  print.Corpus <- function(x, ...) {  print.Corpus <-
337    function(x, ...)
338    {
339      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
340                           "A corpus with %d text document\n",                           "A corpus with %d text document\n",
341                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n"),
# Line 292  Line 343 
343      invisible(x)      invisible(x)
344  }  }
345    
346  summary.Corpus <- function(object, ...) {  summary.Corpus <-
347    function(object, ...)
348    {
349      print(object)      print(object)
350      if (length(DMetaData(object)) > 0) {      if (length(DMetaData(object))) {
351          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
352                               "\nThe metadata consists of %d tag-value pair and a data frame\n",                               "\nThe metadata consists of %d tag-value pair and a data frame\n",
353                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
# Line 306  Line 359 
359      }      }
360  }  }
361    
362  inspect <- function(x) UseMethod("inspect", x)  inspect <-
363  inspect.PCorpus <- function(x) {  function(x)
364        UseMethod("inspect", x)
365    inspect.PCorpus <-
366    function(x)
367    {
368      summary(x)      summary(x)
369      cat("\n")      cat("\n")
370      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
371      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
372  }  }
373  inspect.VCorpus <- function(x) {  inspect.VCorpus <-
374    function(x)
375    {
376      summary(x)      summary(x)
377      cat("\n")      cat("\n")
378      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
379  }  }
380    
381  lapply.PCorpus <- function(X, FUN, ...) {  lapply.PCorpus <-
382    function(X, FUN, ...)
383    {
384      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
385      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
386  }  }
387  lapply.VCorpus <- function(X, FUN, ...) {  lapply.VCorpus <-
388    function(X, FUN, ...)
389    {
390      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
391      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
392          .Call("copyCorpus", X, materialize(X))          .Call("copyCorpus", X, materialize(X))
393      base::lapply(X, FUN, ...)      base::lapply(X, FUN, ...)
394  }  }
395    
396  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
397    function(x, path = ".", filenames = NULL)
398    {
399      filenames <- file.path(path,      filenames <- file.path(path,
400                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
401              sprintf("%s.txt", as.character(meta(x, "id", "local")))
402                             else filenames)                             else filenames)
403      i <- 1  
404      for (o in x) {      stopifnot(length(x) == length(filenames))
405          writeLines(as.PlainTextDocument(o), filenames[i])  
406          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
407      }  
408        invisible(x)
409  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1306

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge