SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1366, Mon Apr 28 14:48:37 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (is.function(readerControl$init))      if (is.function(readerControl$init))
13          readerControl$init()          readerControl$init()
# Line 25  Line 19 
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
23      counter <- 1      counter <- 1
24      while (!eoi(x)) {      while (!eoi(x)) {
25          x <- stepNext(x)          x <- stepNext(x)
26          elem <- getElem(x)          elem <- getElem(x)
27          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(names(x)))
28          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
29          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
30          else tdl <- c(tdl, ID(doc))              names(x)[counter]
31            doc <- readerControl$reader(elem, readerControl$language, id)
32            filehash::dbInsert(db, meta(doc, "id"), doc)
33            tdl[[counter]] <- meta(doc, "id")
34          counter <- counter + 1          counter <- counter + 1
35      }      }
36      names(tdl) <- x$Names      if (!is.null(names(x)))
37            names(tdl) <- names(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
38    
39      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      structure(list(content = tdl,
40                       meta = CorpusMeta(),
41                       dmeta = data.frame(row.names = seq_along(tdl)),
42                       dbcontrol = dbControl),
43                  class = c("PCorpus", "Corpus"))
44  }  }
45    
46  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
47      attr(x, "CMetaData") <- cmeta  VCorpus <-
48      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
49      class(x) <- c("VCorpus", "Corpus", "list")  {
50      x      stopifnot(inherits(x, "Source"))
 }  
51    
52  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
53    
54      if (is.function(readerControl$init))      if (is.function(readerControl$init))
55          readerControl$init()          readerControl$init()
# Line 75  Line 57 
57      if (is.function(readerControl$exit))      if (is.function(readerControl$exit))
58          on.exit(readerControl$exit())          on.exit(readerControl$exit())
59    
60      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
61      tdl <- if (x$Length > 0)      # Check for parallel element access
62          vector("list", as.integer(x$Length))      if (is.function(getS3method("pGetElem", class(x), TRUE)))
63      else          tdl <- mapply(function(elem, id)
64          list()                            readerControl$reader(elem, readerControl$language, id),
   
     if (x$Vectorized)  
         tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),  
65                        pGetElem(x),                        pGetElem(x),
66                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(names(x)))
67                              as.character(seq_len(length(x)))
68                          else names(x),
69                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
70      else {      else {
71          counter <- 1          counter <- 1
72          while (!eoi(x)) {          while (!eoi(x)) {
73              x <- stepNext(x)              x <- stepNext(x)
74              elem <- getElem(x)              elem <- getElem(x)
75              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(names(x)))
76              if (x$Length > 0)                  as.character(counter)
                 tdl[[counter]] <- doc  
77              else              else
78                  tdl <- c(tdl, list(doc))                  names(x)[counter]
79                doc <- readerControl$reader(elem, readerControl$language, id)
80                tdl[[counter]] <- doc
81              counter <- counter + 1              counter <- counter + 1
82          }          }
83      }      }
84      names(tdl) <- x$Names      if (!is.null(names(x)))
85      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- names(x)
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
86    
87  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
88      if (missing(i)) return(x)                     meta = CorpusMeta(),
89      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
90      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
91  }  }
92    
93  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
94      if (missing(i)) return(x)  function(x, i)
95      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
96        if (!missing(i)) {
97            x$content <- x$content[i]
98            x$dmeta <- x$dmeta[i, , drop = FALSE]
99        }
100        x
101  }  }
102    
103  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
104      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
105      counter <- 1  {
106      for (id in unclass(x)[i]) {      if (!missing(i)) {
107          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
108          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
109          counter <- counter + 1          if (!is.null(x$lazy))
110                x$lazy$index <- x$lazy$index[i]
111      }      }
112      x      x
113  }  }
114    
115  .map_name_index <- function(x, i) {  .map_name_index <-
116    function(x, i)
117    {
118      if (is.character(i)) {      if (is.character(i)) {
119          if (is.null(names(x)))          n <- names(x$content)
120              match(i, meta(x, "ID", type = "local"))          match(i, if (is.null(n)) meta(x, "id", "local") else n)
121          else      } else
             match(i, names(x))  
     }  
122      i      i
123  }  }
124    
125  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
126    function(x, i)
127    {
128      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
129      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
130      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
131  }  }
132  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
133    function(x, i)
134    {
135      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
136      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
137      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
138          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
139  }  }
140    
141  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
142    function(x, i, value)
143    {
144      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
145      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
146      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
150    function(x, i, value)
151    {
152      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
153      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
154      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
155      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
156          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
157          x          x
158      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
159    
160      # Find indices to be updated for the left tree  as.list.PCorpus <- as.list.VCorpus <-
161      indices.mapping <- .find_indices(x)  function(x, ...)
162        content(x)
163    
164    as.VCorpus <-
165    function(x)
166        UseMethod("as.VCorpus")
167    as.VCorpus.VCorpus <- identity
168    
169      # Update the DMetaData data frames for the left tree  outer_union <-
170      for (i in 1:ncol(update.struct$left.mapping)) {  function(x, y, ...)
171          map <- update.struct$left.mapping[,i]  {
172          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      if (nrow(x) > 0L)
173      }          x[, setdiff(names(y), names(x))] <- NA
174        if (nrow(y) > 0L)
175      # Find indices to be updated for the right tree          y[, setdiff(names(x), names(y))] <- NA
176      indices.mapping <- .find_indices(y)      res <- rbind(x, y)
177        if (ncol(res) == 0L)
178      # Update the DMetaData data frames for the right tree          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
179      for (i in 1:ncol(update.struct$right.mapping)) {      res
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
180  }  }
181    
182  c.Corpus <-  c.VCorpus <-
183  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
184  {  {
185      args <- list(...)      args <- list(...)
186        x <- args[[1L]]
187    
188      if (identical(length(args), 0L))      if (length(args) == 1L)
189          return(x)          return(x)
190    
191      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
192          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
193    
194      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
195          stop("concatenation of corpora with underlying databases is not supported")                     meta = CorpusMeta(meta = do.call("c",
196                         lapply(args, function(a) meta(a, type = "corpus")))),
197      l <- base::c(list(x), args)                     dmeta = Reduce(outer_union, lapply(args, meta))),
198      if (recursive)                class = c("VCorpus", "Corpus"))
         Reduce(c2, l)  
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
199      }      }
 }  
   
 c.TextDocument <- function(x, ..., recursive = FALSE) {  
     args <- list(...)  
   
     if (identical(length(args), 0L))  
         return(x)  
200    
201      if (!all(unlist(lapply(args, inherits, class(x)))))  content.VCorpus <-
202          stop("not all arguments are text documents")  function(x)
203    {
204      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      if (!is.null(x$lazy))
205      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)          .Call(copyCorpus, x, materialize(x))
206        x$content
207  }  }
208    
209  print.Corpus <- function(x, ...) {  content.PCorpus <-
210      cat(sprintf(ngettext(length(x),  function(x)
211                           "A corpus with %d text document\n",  {
212                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
213                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
214  }  }
215    
216  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
217      print(object)  function(x)
218      if (length(DMetaData(object)) > 0) {      length(x$content)
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
219    
220  inspect <- function(x) UseMethod("inspect", x)  print.PCorpus <- print.VCorpus <-
221  inspect.PCorpus <- function(x) {  function(x, ...)
222      summary(x)  {
223      cat("\n")      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
224      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                         class(x)[1],
225      show(filehash::dbMultiFetch(db, unlist(x)))                         length(x),
226  }                         length(meta(x, type = "corpus")),
227  inspect.VCorpus <- function(x) {                         ncol(meta(x, type = "indexed"))))
228      summary(x)      invisible(x)
     cat("\n")  
     print(noquote(lapply(x, identity)))  
229  }  }
230    
231  lapply.PCorpus <- function(X, FUN, ...) {  inspect <-
232      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
233      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      UseMethod("inspect", x)
234  }  inspect.PCorpus <- inspect.VCorpus <-
235  lapply.VCorpus <- function(X, FUN, ...) {  function(x)
236      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  {
237      if (!is.null(lazyTmMap))      print(x)
238          .Call("copyCorpus", X, materialize(X))      cat("\n")
239      base::lapply(X, FUN, ...)      print(noquote(content(x)))
240        invisible(x)
241  }  }
242    
243  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
244    function(x, path = ".", filenames = NULL)
245    {
246      filenames <- file.path(path,      filenames <- file.path(path,
247                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
248              sprintf("%s.txt", as.character(meta(x, "id", "local")))
249                             else filenames)                             else filenames)
250      i <- 1  
251      for (o in x) {      stopifnot(length(x) == length(filenames))
252          writeLines(as.PlainTextDocument(o), filenames[i])  
253          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
254      }  
255        invisible(x)
256  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1366

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge