SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1327, Mon Apr 14 15:35:38 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = x$defaultreader, language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, x$defaultreader)
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "en"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$names) || is.na(x$names))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              x$names[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
36            else tdl <- c(tdl, meta(doc, "id"))
37          counter <- counter + 1          counter <- counter + 1
38      }      }
39      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
40            names(tdl) <- x$names
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
41    
42      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      structure(list(content = tdl,
43                       meta = CorpusMeta(),
44                       dmeta = data.frame(row.names = seq_along(tdl)),
45                       dbcontrol = dbControl),
46                  class = c("PCorpus", "Corpus"))
47  }  }
48    
49  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <- Corpus <-
50      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
51      attr(x, "DMetaData") <- dmeta  {
52      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
53      x  
54  }      readerControl <- prepareReader(readerControl, x$defaultreader)
55    
56        if (is.function(readerControl$init))
57            readerControl$init()
58    
59  # Register S3 corpus classes to be recognized by S4 methods. This is      if (is.function(readerControl$exit))
60  # mainly a fix to be compatible with packages which were originally          on.exit(readerControl$exit())
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
61    
62      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
63      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
64    
65      if (x$Vectorized)      if (x$vectorized)
66          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names))
70                              as.character(seq_len(x$length))
71                          else x$names,
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$names) || is.na(x$names))
79              if (x$Length > 0)                  as.character(counter)
80                else
81                    x$names[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                if (x$length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
86                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- x$names
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
92    
93  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
94      if (missing(i)) return(x)                     meta = CorpusMeta(),
95      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
96      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
97  }  }
98    
99  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
100      if (missing(i)) return(x)  function(x, i)
101      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
102        if (!missing(i)) {
103            x$content <- x$content[i]
104            x$dmeta <- x$dmeta[i, , drop = FALSE]
105        }
106        x
107  }  }
108    
109  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
110      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
111      counter <- 1  {
112      for (id in unclass(x)[i]) {      if (!missing(i)) {
113          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
114          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
115          counter <- counter + 1          if (!is.null(x$lazy))
116                x$lazy$index <- x$lazy$index[i]
117      }      }
118      x      x
119  }  }
120    
121  .map_name_index <- function(x, i) {  .map_name_index <-
122      if (is.character(i)) {  function(x, i)
123          if (is.null(names(x)))  {
124              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
125            match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
126          else          else
             match(i, names(x))  
     }  
127      i      i
128  }  }
129    
130  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
131    function(x, i)
132    {
133      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
134      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
136  }  }
137  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
138    function(x, i)
139    {
140      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
141      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
142      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
143          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
144  }  }
145    
146  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
147    function(x, i, value)
148    {
149      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
150      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
151      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
152      x      x
153  }  }
154  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
155    function(x, i, value)
156    {
157      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
158      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
159      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
160      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
161          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
162          x          x
163      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
164    
165  c2 <- function(x, y, ...) {  outer_union <-
166      # Update the CMetaData tree  function(x, y, ...)
167      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  {
168      update.struct <- .update_id(cmeta)      if (nrow(x) > 0L)
169            x[, setdiff(names(y), names(x))] <- NA
170      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      if (nrow(y) > 0L)
171            y[, setdiff(names(x), names(y))] <- NA
172      # Find indices to be updated for the left tree      res <- rbind(x, y)
173      indices.mapping <- .find_indices(x)      if (ncol(res) == 0L)
174            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
175      # Update the DMetaData data frames for the left tree      res
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
176  }  }
177    
178  c.Corpus <-  c.VCorpus <-
179  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
180  {  {
181      args <- list(...)      args <- list(...)
182        x <- args[[1L]]
183    
184      if (identical(length(args), 0L))      if (length(args) == 1L)
185          return(x)          return(x)
186    
187      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
188          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
189    
190      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
191          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
192                         lapply(args, function(a) meta(a, type = "corpus"))),
193      l <- base::c(list(x), args)                                      class = "CorpusMeta"),
194      if (recursive)                     dmeta = Reduce(outer_union, lapply(args, meta))),
195          Reduce(c2, l)                class = c("VCorpus", "Corpus"))
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
196  }  }
197    
198  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
199    function(..., recursive = FALSE)
200    {
201      args <- list(...)      args <- list(...)
202        x <- args[[1L]]
203    
204      if (identical(length(args), 0L))      if (length(args) == 1L)
205          return(x)          return(x)
206    
207      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
208          stop("not all arguments are text documents")          stop("not all arguments are text documents")
209    
210      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      structure(list(content = args,
211      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                     meta = CorpusMeta(),
212                       dmeta = data.frame(row.names = seq_along(args))),
213                  class = c("VCorpus", "Corpus"))
214  }  }
215    
216  print.Corpus <- function(x, ...) {  as.list.PCorpus <- as.list.VCorpus <-
217      cat(sprintf(ngettext(length(x),  function(x, ...)
218                           "A corpus with %d text document\n",      content(x)
                          "A corpus with %d text documents\n"),  
                 length(x)))  
     invisible(x)  
 }  
219    
220  summary.Corpus <- function(object, ...) {  content.VCorpus <-
221      print(object)  function(x)
222      if (length(DMetaData(object)) > 0) {  {
223          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),      if (!is.null(x$lazy))
224                               "\nThe metadata consists of %d tag-value pair and a data frame\n",          .Call(copyCorpus, x, materialize(x))
225                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),      x$content
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
226  }  }
227    
228  inspect <- function(x) UseMethod("inspect", x)  content.PCorpus <-
229  inspect.PCorpus <- function(x) {  function(x)
230      summary(x)  {
231      cat("\n")      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
232      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      filehash::dbMultiFetch(db, unlist(x$content))
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
233  }  }
234    
235  lapply.PCorpus <- function(X, FUN, ...) {  length.PCorpus <- length.VCorpus <-
236      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
237      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      length(x$content)
238    
239    print.PCorpus <- print.VCorpus <-
240    function(x, ...)
241    {
242        cat(sprintf(ngettext(length(x),
243                             "A corpus with %d text document\n\n",
244                             "A corpus with %d text documents\n\n"),
245                    length(x)))
246    
247        meta <- meta(x, type = "corpus")
248        dmeta <- meta(x, type = "indexed")
249    
250        cat("Metadata:\n")
251        cat(sprintf("  Tag-value pairs. Tags: %s\n",
252                    paste(names(meta), collapse = " ")))
253        cat("  Data frame. Variables:", colnames(dmeta), "\n")
254    
255        invisible(x)
256  }  }
257  lapply.VCorpus <- function(X, FUN, ...) {  
258      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  inspect <-
259      if (!is.null(lazyTmMap))  function(x)
260          .Call("copyCorpus", X, materialize(X))      UseMethod("inspect", x)
261      base::lapply(X, FUN, ...)  inspect.PCorpus <- inspect.VCorpus <-
262    function(x)
263    {
264        print(x)
265        cat("\n")
266        print(noquote(content(x)))
267        invisible(x)
268  }  }
269    
270  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
271    function(x, path = ".", filenames = NULL)
272    {
273      filenames <- file.path(path,      filenames <- file.path(path,
274                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
275              sprintf("%s.txt", as.character(meta(x, "id", "local")))
276                             else filenames)                             else filenames)
277      i <- 1  
278      for (o in x) {      stopifnot(length(x) == length(filenames))
279          writeLines(as.PlainTextDocument(o), filenames[i])  
280          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
281      }  
282        invisible(x)
283  }  }

Legend:
Removed from v.1108  
changed lines
  Added in v.1327

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge