SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1095, Wed Aug 25 19:05:38 2010 UTC revision 1419, Sat May 2 17:23:47 2015 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      x <- open(x)
17      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
18      counter <- 1      counter <- 1
19      while (!eoi(x)) {      while (!eoi(x)) {
20          x <- stepNext(x)          x <- stepNext(x)
21          elem <- getElem(x)          elem <- getElem(x)
22          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
23          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
24          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
25          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
26            tdl[[counter]] <- meta(doc, "id")
27          counter <- counter + 1          counter <- counter + 1
28      }      }
29      names(tdl) <- x$Names      x <- close(x)
30    
31      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      p <- list(content = tdl,
32      filehash::dbInsert(db, "DMetaData", df)                meta = CorpusMeta(),
33      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                dmeta = data.frame(row.names = seq_along(tdl)),
34                  dbcontrol = dbControl)
35      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      class(p) <- c("PCorpus", "Corpus")
36        p
37  }  }
38    
39  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
40      attr(x, "CMetaData") <- cmeta  VCorpus <-
41      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
42      class(x) <- c("VCorpus", "Corpus", "list")  {
43      x      stopifnot(inherits(x, "Source"))
 }  
44    
45  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
46    
47      if (x$Vectorized)      x <- open(x)
48          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      tdl <- vector("list", length(x))
49        # Check for parallel element access
50        if (is.function(getS3method("pGetElem", class(x), TRUE)))
51            tdl <- mapply(function(elem, id)
52                              readerControl$reader(elem, readerControl$language, id),
53                        pGetElem(x),                        pGetElem(x),
54                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
55                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
56      else {      else {
57          counter <- 1          counter <- 1
58          while (!eoi(x)) {          while (!eoi(x)) {
59              x <- stepNext(x)              x <- stepNext(x)
60              elem <- getElem(x)              elem <- getElem(x)
61              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
62              if (x$Length > 0)                                          readerControl$language,
63                                            as.character(counter))
64                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
65              counter <- counter + 1              counter <- counter + 1
66          }          }
67      }      }
68      names(tdl) <- x$Names      x <- close(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
   
 `[.PCorpus` <- function(x, i) {  
     if (missing(i)) return(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
 }  
69    
70  `[.VCorpus` <- function(x, i) {      as.VCorpus(tdl)
     if (missing(i)) return(x)  
     .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  
71  }  }
72    
73  `[<-.PCorpus` <- function(x, i, value) {  `[.PCorpus` <-
74      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
75      counter <- 1  {
76      for (id in unclass(x)[i]) {      if (!missing(i)) {
77          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
78          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
         counter <- counter + 1  
79      }      }
80      x      x
81  }  }
82    
83  `[[.PCorpus` <-  function(x, i) {  `[.VCorpus` <-
84      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
85      filehash::dbFetch(db, NextMethod("[["))  {
86  }      if (!missing(i)) {
87  `[[.VCorpus` <-  function(x, i) {          x$content <- x$content[i]
88      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")          x$dmeta <- x$dmeta[i, , drop = FALSE]
89      if (!is.null(lazyTmMap))          if (!is.null(x$lazy))
90          .Call("copyCorpus", x, materialize(x, i))              x$lazy$index <- x$lazy$index[i]
     NextMethod("[[")  
91  }  }
   
 `[[<-.PCorpus` <-  function(x, i, value) {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     index <- unclass(x)[[i]]  
     db[[index]] <- value  
92      x      x
93  }  }
 `[[<-.VCorpus` <-  function(x, i, value) {  
     # Mark new objects as not active for lazy mapping  
     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")  
     if (!is.null(lazyTmMap)) {  
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
94    
95              x$Children <- list(left, right)  .map_name_index <-
96          }  function(x, i)
97          level <<- level - 1  {
98          x      if (is.character(i))
99      }          match(i, meta(x, "id", "local"))
100      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      else
101            i
102  }  }
103    
104  # Find indices to be updated for a CMetaData tree  `[[.PCorpus` <-
105  .find_indices <- function(x) {  function(x, i)
106      indices.mapping <- NULL  {
107      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      i <- .map_name_index(x, i)
108          indices <- (DMetaData(x)$MetaID == m)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
109          indices.mapping <- c(indices.mapping, list(m = indices))      filehash::dbFetch(db, x$content[[i]])
         names(indices.mapping)[length(indices.mapping)] <- m  
110      }      }
111      indices.mapping  `[[.VCorpus` <-
112    function(x, i)
113    {
114        i <- .map_name_index(x, i)
115        if (!is.null(x$lazy))
116            .Call(copyCorpus, x, materialize(x, i))
117        x$content[[i]]
118  }  }
119    
120  c2 <- function(x, y, ...) {  `[[<-.PCorpus` <-
121      # Update the CMetaData tree  function(x, i, value)
122      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  {
123      update.struct <- .update_id(cmeta)      i <- .map_name_index(x, i)
124        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
125      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      db[[x$content[[i]]]] <- value
126        x
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
127      }      }
128    `[[<-.VCorpus` <-
129      # Find indices to be updated for the right tree  function(x, i, value)
130      indices.mapping <- .find_indices(y)  {
131        i <- .map_name_index(x, i)
132      # Update the DMetaData data frames for the right tree      # Mark new objects as inactive for lazy mapping
133      for (i in 1:ncol(update.struct$right.mapping)) {      if (!is.null(x$lazy))
134          map <- update.struct$right.mapping[,i]          x$lazy$index[i] <- FALSE
135          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      x$content[[i]] <- value
136        x
137      }      }
138    
139      # Merge the DMetaData data frames  as.list.PCorpus <- as.list.VCorpus <-
140      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, ...)
141      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      setNames(content(x), as.character(lapply(content(x), meta, "id")))
142      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
143      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  as.VCorpus <-
144      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  function(x)
145      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      UseMethod("as.VCorpus")
146      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  as.VCorpus.VCorpus <- identity
147    as.VCorpus.list <-
148    function(x)
149    {
150        v <- list(content = x,
151                  meta = CorpusMeta(),
152                  dmeta = data.frame(row.names = seq_along(x)))
153        class(v) <- c("VCorpus", "Corpus")
154        v
155    }
156    
157      new  outer_union <-
158    function(x, y, ...)
159    {
160        if (nrow(x) > 0L)
161            x[, setdiff(names(y), names(x))] <- NA
162        if (nrow(y) > 0L)
163            y[, setdiff(names(x), names(y))] <- NA
164        res <- rbind(x, y)
165        if (ncol(res) == 0L)
166            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
167        res
168  }  }
169    
170  c.Corpus <-  c.VCorpus <-
171  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
172  {  {
173      args <- list(...)      args <- list(...)
174        x <- args[[1L]]
175    
176      if (identical(length(args), 0L))      if (length(args) == 1L)
177          return(x)          return(x)
178    
179      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
180          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
181    
182      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
183          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
184                    lapply(args, function(a) meta(a, type = "corpus")))),
185      l <- base::c(list(x), args)                dmeta = Reduce(outer_union, lapply(args, meta)))
186      if (recursive)      class(v) <- c("VCorpus", "Corpus")
187          Reduce(c2, l)      v
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
188  }  }
189    
190  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
191      args <- list(...)  function(x)
192    {
193      if (identical(length(args), 0L))      if (!is.null(x$lazy))
194          return(x)          .Call(copyCorpus, x, materialize(x))
195        x$content
196      if (!all(unlist(lapply(args, inherits, class(x)))))  }
         stop("not all arguments are text documents")  
197    
198      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
199      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
200    {
201        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
202        filehash::dbMultiFetch(db, unlist(x$content))
203  }  }
204    
205  print.Corpus <- function(x, ...) {  inspect <-
206      cat(sprintf(ngettext(length(x),  function(x)
207                           "A corpus with %d text document\n",      UseMethod("inspect", x)
208                           "A corpus with %d text documents\n"),  inspect.PCorpus <- inspect.VCorpus <-
209                  length(x)))  function(x)
210    {
211        print(x)
212        cat("\n")
213        print(noquote(content(x)))
214      invisible(x)      invisible(x)
215  }  }
216    
217  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
218      print(object)  function(x)
219      if (length(DMetaData(object)) > 0) {      length(x$content)
220          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
221                               "\nThe metadata consists of %d tag-value pair and a data frame\n",  names.PCorpus <- names.VCorpus <-
222                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  function(x)
223                      length(CMetaData(object)$MetaData)))      as.character(meta(x, "id", "local"))
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
224    
225  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
226  inspect.PCorpus <- function(x) {  function(x, value)
227      summary(x)  {
228      cat("\n")      meta(x, "id", "local") <- as.character(value)
229      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
230  }  }
231    
232  lapply.PCorpus <- function(X, FUN, ...) {  format.PCorpus <- format.VCorpus <-
233      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x, ...)
234      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  {
235  }      c(sprintf("<<%s>>", class(x)[1L]),
236  lapply.VCorpus <- function(X, FUN, ...) {        sprintf("Metadata:  corpus specific: %d, document level (indexed): %d",
237      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                length(meta(x, type = "corpus")),
238      if (!is.null(lazyTmMap))                ncol(meta(x, type = "indexed"))),
239          .Call("copyCorpus", X, materialize(X))        sprintf("Content:  documents: %d", length(x)))
     base::lapply(X, FUN, ...)  
240  }  }
241    
242  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
243    function(x, path = ".", filenames = NULL)
244    {
245      filenames <- file.path(path,      filenames <- file.path(path,
246                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
247              sprintf("%s.txt", as.character(meta(x, "id", "local")))
248                             else filenames)                             else filenames)
249      i <- 1  
250      for (o in x) {      stopifnot(length(x) == length(filenames))
251          writeLines(as.PlainTextDocument(o), filenames[i])  
252          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
253      }  
254        invisible(x)
255  }  }

Legend:
Removed from v.1095  
changed lines
  Added in v.1419

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge