SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1350, Tue Apr 22 07:41:14 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, reader(x))
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "en"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(names(x)) || is.na(names(x)))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              names(x)[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            tdl[[counter]] <- meta(doc, "id")
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
39            names(tdl) <- names(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
40    
41      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      structure(list(content = tdl,
42                       meta = CorpusMeta(),
43                       dmeta = data.frame(row.names = seq_along(tdl)),
44                       dbcontrol = dbControl),
45                  class = c("PCorpus", "Corpus"))
46  }  }
47    
48  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <-
49      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
50      attr(x, "DMetaData") <- dmeta  {
51      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
52      x  
53  }      readerControl <- prepareReader(readerControl, reader(x))
54    
55        if (is.function(readerControl$init))
56            readerControl$init()
57    
58  # Register S3 corpus classes to be recognized by S4 methods. This is      if (is.function(readerControl$exit))
59  # mainly a fix to be compatible with packages which were originally          on.exit(readerControl$exit())
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
60    
61      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
62      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
63    
64      if (x$Vectorized)      # Check for parallel element access
65          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      if (is.function(getS3method("pGetElem", class(x), TRUE)))
66            tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(names(x)) || is.na(names(x)))
70                              as.character(seq_len(length(x)))
71                          else names(x),
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(names(x)) || is.na(names(x)))
79              if (x$Length > 0)                  as.character(counter)
                 tdl[[counter]] <- doc  
80              else              else
81                  tdl <- c(tdl, list(doc))                  names(x)[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                tdl[[counter]] <- doc
84              counter <- counter + 1              counter <- counter + 1
85          }          }
86      }      }
87      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
88      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- names(x)
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
89    
90  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
91      if (missing(i)) return(x)                     meta = CorpusMeta(),
92      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
93      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
94  }  }
95    
96  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
97      if (missing(i)) return(x)  function(x, i)
98      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
99        if (!missing(i)) {
100            x$content <- x$content[i]
101            x$dmeta <- x$dmeta[i, , drop = FALSE]
102        }
103        x
104  }  }
105    
106  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
107      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
108      counter <- 1  {
109      for (id in unclass(x)[i]) {      if (!missing(i)) {
110          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
111          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
112          counter <- counter + 1          if (!is.null(x$lazy))
113                x$lazy$index <- x$lazy$index[i]
114      }      }
115      x      x
116  }  }
117    
118  .map_name_index <- function(x, i) {  .map_name_index <-
119    function(x, i)
120    {
121      if (is.character(i)) {      if (is.character(i)) {
122          if (is.null(names(x)))          n <- names(x$content)
123              match(i, meta(x, "ID", type = "local"))          match(i, if (is.null(n)) meta(x, "id", "local") else n)
124          else      } else
             match(i, names(x))  
     }  
125      i      i
126  }  }
127    
128  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
129    function(x, i)
130    {
131      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
132      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
133      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
134  }  }
135  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
136    function(x, i)
137    {
138      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
139      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
140      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
141          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
142  }  }
143    
144  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
145    function(x, i, value)
146    {
147      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
148      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
149      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
150      x      x
151  }  }
152  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
153    function(x, i, value)
154    {
155      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
156      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
157      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
158      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
159          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
160          x          x
161      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
162    
163      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  as.list.PCorpus <- as.list.VCorpus <-
164    function(x, ...)
165        content(x)
166    
167    as.VCorpus <-
168    function(x)
169        UseMethod("as.VCorpus")
170    as.VCorpus.VCorpus <- identity
171    
172      # Find indices to be updated for the left tree  outer_union <-
173      indices.mapping <- .find_indices(x)  function(x, y, ...)
174    {
175      # Update the DMetaData data frames for the left tree      if (nrow(x) > 0L)
176      for (i in 1:ncol(update.struct$left.mapping)) {          x[, setdiff(names(y), names(x))] <- NA
177          map <- update.struct$left.mapping[,i]      if (nrow(y) > 0L)
178          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])          y[, setdiff(names(x), names(y))] <- NA
179      }      res <- rbind(x, y)
180        if (ncol(res) == 0L)
181      # Find indices to be updated for the right tree          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
182      indices.mapping <- .find_indices(y)      res
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
183  }  }
184    
185  c.Corpus <-  c.VCorpus <-
186  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
187  {  {
188      args <- list(...)      args <- list(...)
189        x <- args[[1L]]
190    
191      if (identical(length(args), 0L))      if (length(args) == 1L)
192          return(x)          return(x)
193    
194      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
195          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
196    
197      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
198          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
199                         lapply(args, function(a) meta(a, type = "corpus"))),
200      l <- base::c(list(x), args)                                      class = "CorpusMeta"),
201      if (recursive)                     dmeta = Reduce(outer_union, lapply(args, meta))),
202          Reduce(c2, l)                class = c("VCorpus", "Corpus"))
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
203      }      }
 }  
   
 c.TextDocument <- function(x, ..., recursive = FALSE) {  
     args <- list(...)  
   
     if (identical(length(args), 0L))  
         return(x)  
204    
205      if (!all(unlist(lapply(args, inherits, class(x)))))  content.VCorpus <-
206          stop("not all arguments are text documents")  function(x)
207    {
208      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      if (!is.null(x$lazy))
209      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)          .Call(copyCorpus, x, materialize(x))
210        x$content
211  }  }
212    
213  print.Corpus <- function(x, ...) {  content.PCorpus <-
214      cat(sprintf(ngettext(length(x),  function(x)
215                           "A corpus with %d text document\n",  {
216                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
217                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
218  }  }
219    
220  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
221      print(object)  function(x)
222      if (length(DMetaData(object)) > 0) {      length(x$content)
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
223    
224  inspect <- function(x) UseMethod("inspect", x)  print.PCorpus <- print.VCorpus <-
225  inspect.PCorpus <- function(x) {  function(x, ...)
226      summary(x)  {
227      cat("\n")      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
228      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                         class(x)[1],
229      show(filehash::dbMultiFetch(db, unlist(x)))                         length(x),
230  }                         length(meta(x, type = "corpus")),
231  inspect.VCorpus <- function(x) {                         ncol(meta(x, type = "indexed"))))
232      summary(x)      invisible(x)
     cat("\n")  
     print(noquote(lapply(x, identity)))  
233  }  }
234    
235  lapply.PCorpus <- function(X, FUN, ...) {  inspect <-
236      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
237      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      UseMethod("inspect", x)
238  }  inspect.PCorpus <- inspect.VCorpus <-
239  lapply.VCorpus <- function(X, FUN, ...) {  function(x)
240      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  {
241      if (!is.null(lazyTmMap))      print(x)
242          .Call("copyCorpus", X, materialize(X))      cat("\n")
243      base::lapply(X, FUN, ...)      print(noquote(content(x)))
244        invisible(x)
245  }  }
246    
247  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
248    function(x, path = ".", filenames = NULL)
249    {
250      filenames <- file.path(path,      filenames <- file.path(path,
251                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
252              sprintf("%s.txt", as.character(meta(x, "id", "local")))
253                             else filenames)                             else filenames)
254      i <- 1  
255      for (o in x) {      stopifnot(length(x) == length(filenames))
256          writeLines(as.PlainTextDocument(o), filenames[i])  
257          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
258      }  
259        invisible(x)
260  }  }

Legend:
Removed from v.1108  
changed lines
  Added in v.1350

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge