SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1073, Fri May 28 12:32:46 2010 UTC revision 1377, Wed May 21 17:15:56 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, reader(x))
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "en"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
     tdl <- if (x$Length > 0)x  
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
23      counter <- 1      counter <- 1
24      while (!eoi(x)) {      while (!eoi(x)) {
25          x <- stepNext(x)          x <- stepNext(x)
26          elem <- getElem(x)          elem <- getElem(x)
27          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
28          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
29          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
30          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
31            tdl[[counter]] <- meta(doc, "id")
32          counter <- counter + 1          counter <- counter + 1
33      }      }
     names(tdl) <- x$Names  
34    
35      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
36      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
37      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
38                       dbcontrol = dbControl),
39      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
40  }  }
41    
42  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
43      attr(x, "CMetaData") <- cmeta  VCorpus <-
44      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
45      class(x) <- c("VCorpus", "Corpus", "list")  {
46      x      stopifnot(inherits(x, "Source"))
 }  
47    
48  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
49  # mainly a fix to be compatible with packages which were originally  
50  # developed to cooperate with corresponding S4 tm classes. Necessary      if (is.function(readerControl$init))
51  # since tm's class architecture was changed to S3 since tm version 0.5.          readerControl$init()
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
52    
53      if (x$Vectorized)      if (is.function(readerControl$exit))
54          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          on.exit(readerControl$exit())
55    
56        tdl <- vector("list", length(x))
57        # Check for parallel element access
58        if (is.function(getS3method("pGetElem", class(x), TRUE)))
59            tdl <- mapply(function(elem, id)
60                              readerControl$reader(elem, readerControl$language, id),
61                        pGetElem(x),                        pGetElem(x),
62                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
63                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
64      else {      else {
65          counter <- 1          counter <- 1
66          while (!eoi(x)) {          while (!eoi(x)) {
67              x <- stepNext(x)              x <- stepNext(x)
68              elem <- getElem(x)              elem <- getElem(x)
69              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
70              if (x$Length > 0)                                          readerControl$language,
71                                            as.character(counter))
72                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
73              counter <- counter + 1              counter <- counter + 1
74          }          }
75      }      }
     names(tdl) <- x$Names  
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
   
 `[.PCorpus` <- function(x, i) {  
     if (missing(i)) return(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
 }  
76    
77  `[.VCorpus` <- function(x, i) {      structure(list(content = tdl,
78      if (missing(i)) return(x)                     meta = CorpusMeta(),
79      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])                     dmeta = data.frame(row.names = seq_along(tdl))),
80                  class = c("VCorpus", "Corpus"))
81  }  }
82    
83  `[<-.PCorpus` <- function(x, i, value) {  `[.PCorpus` <-
84      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
85      counter <- 1  {
86      for (id in unclass(x)[i]) {      if (!missing(i)) {
87          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
88          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
         counter <- counter + 1  
89      }      }
90      x      x
91  }  }
92    
93  `[[.PCorpus` <-  function(x, i) {  `[.VCorpus` <-
94      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
95      filehash::dbFetch(db, NextMethod("[["))  {
96  }      if (!missing(i)) {
97  `[[.VCorpus` <-  function(x, i) {          x$content <- x$content[i]
98      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")          x$dmeta <- x$dmeta[i, , drop = FALSE]
99      if (!is.null(lazyTmMap))          if (!is.null(x$lazy))
100          .Call("copyCorpus", x, materialize(x, i))              x$lazy$index <- x$lazy$index[i]
     NextMethod("[[")  
101  }  }
   
 `[[<-.PCorpus` <-  function(x, i, value) {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     index <- unclass(x)[[i]]  
     db[[index]] <- value  
102      x      x
103  }  }
 `[[<-.VCorpus` <-  function(x, i, value) {  
     # Mark new objects as not active for lazy mapping  
     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")  
     if (!is.null(lazyTmMap)) {  
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
104    
105              x$Children <- list(left, right)  .map_name_index <-
106          }  function(x, i)
107          level <<- level - 1  {
108          x      if (is.character(i))
109      }          match(i, meta(x, "id", "local"))
110      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      else
111            i
112  }  }
113    
114  # Find indices to be updated for a CMetaData tree  `[[.PCorpus` <-
115  .find_indices <- function(x) {  function(x, i)
116      indices.mapping <- NULL  {
117      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      i <- .map_name_index(x, i)
118          indices <- (DMetaData(x)$MetaID == m)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
119          indices.mapping <- c(indices.mapping, list(m = indices))      filehash::dbFetch(db, x$content[[i]])
         names(indices.mapping)[length(indices.mapping)] <- m  
120      }      }
121      indices.mapping  `[[.VCorpus` <-
122    function(x, i)
123    {
124        i <- .map_name_index(x, i)
125        if (!is.null(x$lazy))
126            .Call(copyCorpus, x, materialize(x, i))
127        x$content[[i]]
128  }  }
129    
130  c2 <- function(x, y, ...) {  `[[<-.PCorpus` <-
131      # Update the CMetaData tree  function(x, i, value)
132      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  {
133      update.struct <- .update_id(cmeta)      i <- .map_name_index(x, i)
134        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      db[[x$content[[i]]]] <- value
136        x
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
137      }      }
138    `[[<-.VCorpus` <-
139      # Find indices to be updated for the right tree  function(x, i, value)
140      indices.mapping <- .find_indices(y)  {
141        i <- .map_name_index(x, i)
142      # Update the DMetaData data frames for the right tree      # Mark new objects as inactive for lazy mapping
143      for (i in 1:ncol(update.struct$right.mapping)) {      if (!is.null(x$lazy))
144          map <- update.struct$right.mapping[,i]          x$lazy$index[i] <- FALSE
145          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      x$content[[i]] <- value
146        x
147      }      }
148    
149      # Merge the DMetaData data frames  as.list.PCorpus <- as.list.VCorpus <-
150      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, ...)
151      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      content(x)
152      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
153      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  as.VCorpus <-
154      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  function(x)
155      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      UseMethod("as.VCorpus")
156      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  as.VCorpus.VCorpus <- identity
157    
158      new  outer_union <-
159    function(x, y, ...)
160    {
161        if (nrow(x) > 0L)
162            x[, setdiff(names(y), names(x))] <- NA
163        if (nrow(y) > 0L)
164            y[, setdiff(names(x), names(y))] <- NA
165        res <- rbind(x, y)
166        if (ncol(res) == 0L)
167            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
168        res
169  }  }
170    
171  c.Corpus <-  c.VCorpus <-
172  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
173  {  {
174      args <- list(...)      args <- list(...)
175        x <- args[[1L]]
176    
177      if (identical(length(args), 0L))      if (length(args) == 1L)
178          return(x)          return(x)
179    
180      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
181          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
182    
183      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
184          stop("concatenation of corpora with underlying databases is not supported")                     meta = CorpusMeta(meta = do.call("c",
185                         lapply(args, function(a) meta(a, type = "corpus")))),
186      Reduce(c2, base::c(list(x), args))                     dmeta = Reduce(outer_union, lapply(args, meta))),
187                  class = c("VCorpus", "Corpus"))
188  }  }
189    
190  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
191      args <- list(...)  function(x)
192    {
193      if (identical(length(args), 0L))      if (!is.null(x$lazy))
194          return(x)          .Call(copyCorpus, x, materialize(x))
195        x$content
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
   
     dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  
     .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  
 }  
   
 print.Corpus <- function(x, ...) {  
     cat(sprintf(ngettext(length(x),  
                          "A corpus with %d text document\n",  
                          "A corpus with %d text documents\n"),  
                 length(x)))  
     invisible(x)  
196  }  }
197    
198  summary.Corpus <- function(object, ...) {  content.PCorpus <-
199      print(object)  function(x)
200      if (length(DMetaData(object)) > 0) {  {
201          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
202                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      filehash::dbMultiFetch(db, unlist(x$content))
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
203  }  }
204    
205  inspect <- function(x) UseMethod("inspect", x)  inspect <-
206  inspect.PCorpus <- function(x) {  function(x)
207      summary(x)      UseMethod("inspect", x)
208      cat("\n")  inspect.PCorpus <- inspect.VCorpus <-
209      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x)
210      show(filehash::dbMultiFetch(db, unlist(x)))  {
211  }      print(x)
 inspect.VCorpus <- function(x) {  
     summary(x)  
212      cat("\n")      cat("\n")
213      print(noquote(lapply(x, identity)))      print(noquote(content(x)))
214        invisible(x)
215  }  }
216    
217  lapply.PCorpus <- function(X, FUN, ...) {  length.PCorpus <- length.VCorpus <-
218      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
219      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      length(x$content)
220  }  
221  lapply.VCorpus <- function(X, FUN, ...) {  names.PCorpus <- names.VCorpus <-
222      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  function(x)
223      if (!is.null(lazyTmMap))      as.character(meta(x, "id", "local"))
224          .Call("copyCorpus", X, materialize(X))  
225      base::lapply(X, FUN, ...)  print.PCorpus <- print.VCorpus <-
226    function(x, ...)
227    {
228        writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
229                           class(x)[1],
230                           length(x),
231                           length(meta(x, type = "corpus")),
232                           ncol(meta(x, type = "indexed"))))
233        invisible(x)
234  }  }
235    
236  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
237    function(x, path = ".", filenames = NULL)
238    {
239      filenames <- file.path(path,      filenames <- file.path(path,
240                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
241              sprintf("%s.txt", as.character(meta(x, "id", "local")))
242                             else filenames)                             else filenames)
243      i <- 1  
244      for (o in x) {      stopifnot(length(x) == length(filenames))
245          writeLines(as.PlainTextDocument(o), filenames[i])  
246          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
247      }  
248        invisible(x)
249  }  }

Legend:
Removed from v.1073  
changed lines
  Added in v.1377

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge