SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1404, Tue Feb 17 18:04:22 2015 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      x <- open(x)
17      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
18      counter <- 1      counter <- 1
19      while (!eoi(x)) {      while (!eoi(x)) {
20          x <- stepNext(x)          x <- stepNext(x)
21          elem <- getElem(x)          elem <- getElem(x)
22          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
23          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
24          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
25          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
26            tdl[[counter]] <- meta(doc, "id")
27          counter <- counter + 1          counter <- counter + 1
28      }      }
29      names(tdl) <- x$Names      x <- close(x)
30    
31      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      p <- list(content = tdl,
32      filehash::dbInsert(db, "DMetaData", df)                meta = CorpusMeta(),
33      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                dmeta = data.frame(row.names = seq_along(tdl)),
34                  dbcontrol = dbControl)
35      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      class(p) <- c("PCorpus", "Corpus")
36        p
37  }  }
38    
39  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
40      attr(x, "CMetaData") <- cmeta  VCorpus <-
41      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
42      class(x) <- c("VCorpus", "Corpus", "list")  {
43      x      stopifnot(inherits(x, "Source"))
 }  
44    
45  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
46    
47      if (x$Vectorized)      x <- open(x)
48          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      tdl <- vector("list", length(x))
49        # Check for parallel element access
50        if (is.function(getS3method("pGetElem", class(x), TRUE)))
51            tdl <- mapply(function(elem, id)
52                              readerControl$reader(elem, readerControl$language, id),
53                        pGetElem(x),                        pGetElem(x),
54                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
55                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
56      else {      else {
57          counter <- 1          counter <- 1
58          while (!eoi(x)) {          while (!eoi(x)) {
59              x <- stepNext(x)              x <- stepNext(x)
60              elem <- getElem(x)              elem <- getElem(x)
61              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
62              if (x$Length > 0)                                          readerControl$language,
63                                            as.character(counter))
64                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
65              counter <- counter + 1              counter <- counter + 1
66          }          }
67      }      }
68      names(tdl) <- x$Names      x <- close(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
69    
70  `[.PCorpus` <- function(x, i) {      v <- list(content = tdl,
71      if (missing(i)) return(x)                meta = CorpusMeta(),
72      index <- attr(x, "DMetaData")[[1 , "subset"]]                dmeta = data.frame(row.names = seq_along(tdl)))
73      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      class(v) <- c("VCorpus", "Corpus")
74      dmeta <- attr(x, "DMetaData")      v
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
75  }  }
76    
77  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
78      if (missing(i)) return(x)  function(x, i)
79      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
80        if (!missing(i)) {
81            x$content <- x$content[i]
82            x$dmeta <- x$dmeta[i, , drop = FALSE]
83        }
84        x
85  }  }
86    
87  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
88      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
89      counter <- 1  {
90      for (id in unclass(x)[i]) {      if (!missing(i)) {
91          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
92          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
93          counter <- counter + 1          if (!is.null(x$lazy))
94                x$lazy$index <- x$lazy$index[i]
95      }      }
96      x      x
97  }  }
98    
99  .map_name_index <- function(x, i) {  .map_name_index <-
100      if (is.character(i)) {  function(x, i)
101          if (is.null(names(x)))  {
102              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
103            match(i, meta(x, "id", "local"))
104          else          else
             match(i, names(x))  
     }  
105      i      i
106  }  }
107    
108  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
109    function(x, i)
110    {
111      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
112      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
113      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
114  }  }
115  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
116    function(x, i)
117    {
118      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
119      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
120      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
121          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
122  }  }
123    
124  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
125    function(x, i, value)
126    {
127      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
128      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
129      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
130      x      x
131  }  }
132  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
133    function(x, i, value)
134    {
135      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
136      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
137      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
138      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
139          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
140          x          x
141      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
142    
143  c2 <- function(x, y, ...) {  as.list.PCorpus <- as.list.VCorpus <-
144      # Update the CMetaData tree  function(x, ...)
145      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      content(x)
146      update.struct <- .update_id(cmeta)  
147    as.VCorpus <-
148      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  function(x)
149        UseMethod("as.VCorpus")
150      # Find indices to be updated for the left tree  as.VCorpus.VCorpus <- identity
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
151    
152      # Find indices to be updated for the right tree  outer_union <-
153      indices.mapping <- .find_indices(y)  function(x, y, ...)
154    {
155      # Update the DMetaData data frames for the right tree      if (nrow(x) > 0L)
156      for (i in 1:ncol(update.struct$right.mapping)) {          x[, setdiff(names(y), names(x))] <- NA
157          map <- update.struct$right.mapping[,i]      if (nrow(y) > 0L)
158          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])          y[, setdiff(names(x), names(y))] <- NA
159      }      res <- rbind(x, y)
160        if (ncol(res) == 0L)
161      # Merge the DMetaData data frames          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
162      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))      res
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
163  }  }
164    
165  c.Corpus <-  c.VCorpus <-
166  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
167  {  {
168      args <- list(...)      args <- list(...)
169        x <- args[[1L]]
170    
171      if (identical(length(args), 0L))      if (length(args) == 1L)
172          return(x)          return(x)
173    
174      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
175          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
176    
177      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
178          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
179                    lapply(args, function(a) meta(a, type = "corpus")))),
180      l <- base::c(list(x), args)                dmeta = Reduce(outer_union, lapply(args, meta)))
181      if (recursive)      class(v) <- c("VCorpus", "Corpus")
182          Reduce(c2, l)      v
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
183  }  }
184    
185  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
186      args <- list(...)  function(x)
187    {
188      if (identical(length(args), 0L))      if (!is.null(x$lazy))
189          return(x)          .Call(copyCorpus, x, materialize(x))
190        x$content
191      if (!all(unlist(lapply(args, inherits, class(x)))))  }
         stop("not all arguments are text documents")  
192    
193      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
194      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
195    {
196        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
197        filehash::dbMultiFetch(db, unlist(x$content))
198  }  }
199    
200  print.Corpus <- function(x, ...) {  inspect <-
201      cat(sprintf(ngettext(length(x),  function(x)
202                           "A corpus with %d text document\n",      UseMethod("inspect", x)
203                           "A corpus with %d text documents\n"),  inspect.PCorpus <- inspect.VCorpus <-
204                  length(x)))  function(x)
205    {
206        print(x)
207        cat("\n")
208        print(noquote(content(x)))
209      invisible(x)      invisible(x)
210  }  }
211    
212  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
213      print(object)  function(x)
214      if (length(DMetaData(object)) > 0) {      length(x$content)
215          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
216                               "\nThe metadata consists of %d tag-value pair and a data frame\n",  names.PCorpus <- names.VCorpus <-
217                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  function(x)
218                      length(CMetaData(object)$MetaData)))      as.character(meta(x, "id", "local"))
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
219    
220  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
221  inspect.PCorpus <- function(x) {  function(x, value)
222      summary(x)  {
223      cat("\n")      meta(x, "id", "local") <- as.character(value)
224      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
225  }  }
226    
227  lapply.PCorpus <- function(X, FUN, ...) {  print.PCorpus <- print.VCorpus <-
228      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x, ...)
229      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  {
230  }      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
231  lapply.VCorpus <- function(X, FUN, ...) {                         class(x)[1],
232      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                         length(x),
233      if (!is.null(lazyTmMap))                         length(meta(x, type = "corpus")),
234          .Call("copyCorpus", X, materialize(X))                         ncol(meta(x, type = "indexed"))))
235      base::lapply(X, FUN, ...)      invisible(x)
236  }  }
237    
238  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
239    function(x, path = ".", filenames = NULL)
240    {
241      filenames <- file.path(path,      filenames <- file.path(path,
242                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
243              sprintf("%s.txt", as.character(meta(x, "id", "local")))
244                             else filenames)                             else filenames)
245      i <- 1  
246      for (o in x) {      stopifnot(length(x) == length(filenames))
247          writeLines(as.PlainTextDocument(o), filenames[i])  
248          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
249      }  
250        invisible(x)
251  }  }

Legend:
Removed from v.1108  
changed lines
  Added in v.1404

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge