SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1342, Sat Apr 19 17:06:45 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (is.function(readerControl$init))      if (is.function(readerControl$init))
13          readerControl$init()          readerControl$init()
# Line 26  Line 20 
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(names(x)) || is.na(names(x)))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              names(x)[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            tdl[[counter]] <- meta(doc, "id")
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
39            names(tdl) <- names(x)
40    
41      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
42      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
43      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
44                       dbcontrol = dbControl),
45      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
46  }  }
47    
48  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <-
49      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
50      attr(x, "DMetaData") <- dmeta  {
51      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
     x  
 }  
52    
53  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
54    
55      if (is.function(readerControl$init))      if (is.function(readerControl$init))
56          readerControl$init()          readerControl$init()
# Line 76  Line 59 
59          on.exit(readerControl$exit())          on.exit(readerControl$exit())
60    
61      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
62      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
63    
64      if (x$Vectorized)      # Check for parallel element access
65          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      if (is.function(getS3method("pGetElem", class(x), TRUE)))
66            tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(names(x)) || is.na(names(x)))
70                              as.character(seq_len(length(x)))
71                          else names(x),
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(names(x)) || is.na(names(x)))
79              if (x$Length > 0)                  as.character(counter)
                 tdl[[counter]] <- doc  
80              else              else
81                  tdl <- c(tdl, list(doc))                  names(x)[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                tdl[[counter]] <- doc
84              counter <- counter + 1              counter <- counter + 1
85          }          }
86      }      }
87      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
88      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- names(x)
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
89    
90  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
91      if (missing(i)) return(x)                     meta = CorpusMeta(),
92      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
93      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
94  }  }
95    
96  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
97      if (missing(i)) return(x)  function(x, i)
98      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
99        if (!missing(i)) {
100            x$content <- x$content[i]
101            x$dmeta <- x$dmeta[i, , drop = FALSE]
102        }
103        x
104  }  }
105    
106  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
107      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
108      counter <- 1  {
109      for (id in unclass(x)[i]) {      if (!missing(i)) {
110          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
111          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
112          counter <- counter + 1          if (!is.null(x$lazy))
113                x$lazy$index <- x$lazy$index[i]
114      }      }
115      x      x
116  }  }
117    
118  .map_name_index <- function(x, i) {  .map_name_index <-
119    function(x, i)
120    {
121      if (is.character(i)) {      if (is.character(i)) {
122          if (is.null(names(x)))          n <- names(x$content)
123              match(i, meta(x, "ID", type = "local"))          match(i, if (is.null(n)) meta(x, "id", "local") else n)
124          else      } else
             match(i, names(x))  
     }  
125      i      i
126  }  }
127    
128  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
129    function(x, i)
130    {
131      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
132      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
133      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
134  }  }
135  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
136    function(x, i)
137    {
138      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
139      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
140      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
141          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
142  }  }
143    
144  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
145    function(x, i, value)
146    {
147      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
148      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
149      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
150      x      x
151  }  }
152  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
153    function(x, i, value)
154    {
155      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
156      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
157      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
158      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
159          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
160          x          x
161      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
162    
163      # Update the DMetaData data frames for the left tree  outer_union <-
164      for (i in 1:ncol(update.struct$left.mapping)) {  function(x, y, ...)
165          map <- update.struct$left.mapping[,i]  {
166          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      if (nrow(x) > 0L)
167      }          x[, setdiff(names(y), names(x))] <- NA
168        if (nrow(y) > 0L)
169      # Find indices to be updated for the right tree          y[, setdiff(names(x), names(y))] <- NA
170      indices.mapping <- .find_indices(y)      res <- rbind(x, y)
171        if (ncol(res) == 0L)
172      # Update the DMetaData data frames for the right tree          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
173      for (i in 1:ncol(update.struct$right.mapping)) {      res
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
174  }  }
175    
176  c.Corpus <-  c.VCorpus <-
177  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
178  {  {
179      args <- list(...)      args <- list(...)
180        x <- args[[1L]]
181    
182      if (identical(length(args), 0L))      if (length(args) == 1L)
183          return(x)          return(x)
184    
185      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
186          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
187    
188      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
189          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
190                         lapply(args, function(a) meta(a, type = "corpus"))),
191      l <- base::c(list(x), args)                                      class = "CorpusMeta"),
192      if (recursive)                     dmeta = Reduce(outer_union, lapply(args, meta))),
193          Reduce(c2, l)                class = c("VCorpus", "Corpus"))
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
194  }  }
195    
196  c.TextDocument <- function(x, ..., recursive = FALSE) {  as.list.PCorpus <- as.list.VCorpus <-
197      args <- list(...)  function(x, ...)
198        content(x)
199    
200      if (identical(length(args), 0L))  content.VCorpus <-
201          return(x)  function(x)
202    {
203      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!is.null(x$lazy))
204          stop("not all arguments are text documents")          .Call(copyCorpus, x, materialize(x))
205        x$content
     dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  
     .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  
206  }  }
207    
208  print.Corpus <- function(x, ...) {  content.PCorpus <-
209      cat(sprintf(ngettext(length(x),  function(x)
210                           "A corpus with %d text document\n",  {
211                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
212                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
213  }  }
214    
215  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
216      print(object)  function(x)
217      if (length(DMetaData(object)) > 0) {      length(x$content)
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
218    
219  inspect <- function(x) UseMethod("inspect", x)  print.PCorpus <- print.VCorpus <-
220  inspect.PCorpus <- function(x) {  function(x, ...)
221      summary(x)  {
222      cat("\n")      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
223      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                         class(x)[1],
224      show(filehash::dbMultiFetch(db, unlist(x)))                         length(x),
225  }                         length(meta(x, type = "corpus")),
226  inspect.VCorpus <- function(x) {                         ncol(meta(x, type = "indexed"))))
227      summary(x)      invisible(x)
     cat("\n")  
     print(noquote(lapply(x, identity)))  
228  }  }
229    
230  lapply.PCorpus <- function(X, FUN, ...) {  inspect <-
231      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
232      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      UseMethod("inspect", x)
233  }  inspect.PCorpus <- inspect.VCorpus <-
234  lapply.VCorpus <- function(X, FUN, ...) {  function(x)
235      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  {
236      if (!is.null(lazyTmMap))      print(x)
237          .Call("copyCorpus", X, materialize(X))      cat("\n")
238      base::lapply(X, FUN, ...)      print(noquote(content(x)))
239        invisible(x)
240  }  }
241    
242  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
243    function(x, path = ".", filenames = NULL)
244    {
245      filenames <- file.path(path,      filenames <- file.path(path,
246                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
247              sprintf("%s.txt", as.character(meta(x, "id", "local")))
248                             else filenames)                             else filenames)
249      i <- 1  
250      for (o in x) {      stopifnot(length(x) == length(filenames))
251          writeLines(as.PlainTextDocument(o), filenames[i])  
252          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
253      }  
254        invisible(x)
255  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1342

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge