SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1336, Sat Apr 19 08:59:39 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (is.function(readerControl$init))      if (is.function(readerControl$init))
13          readerControl$init()          readerControl$init()
# Line 26  Line 20 
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(names(x)) || is.na(names(x)))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              names(x)[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            tdl[[counter]] <- meta(doc, "id")
36          counter <- counter + 1          counter <- counter + 1
37      }      }
38      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
39            names(tdl) <- names(x)
40    
41      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
42      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
43      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
44                       dbcontrol = dbControl),
45      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
46  }  }
47    
48  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <-
49      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
50      attr(x, "DMetaData") <- dmeta  {
51      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
     x  
 }  
52    
53  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
54    
55      if (is.function(readerControl$init))      if (is.function(readerControl$init))
56          readerControl$init()          readerControl$init()
# Line 76  Line 59 
59          on.exit(readerControl$exit())          on.exit(readerControl$exit())
60    
61      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
62      tdl <- if (x$Length > 0)      tdl <- if (length(x) > 0) vector("list", as.integer(length(x))) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
63    
64      if (x$Vectorized)      # Check for parallel element access
65          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      if (is.function(getS3method("pGetElem", class(x), TRUE)))
66            tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(names(x)) || is.na(names(x)))
70                              as.character(seq_len(length(x)))
71                          else names(x),
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(names(x)) || is.na(names(x)))
79              if (x$Length > 0)                  as.character(counter)
                 tdl[[counter]] <- doc  
80              else              else
81                  tdl <- c(tdl, list(doc))                  names(x)[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                tdl[[counter]] <- doc
84              counter <- counter + 1              counter <- counter + 1
85          }          }
86      }      }
87      names(tdl) <- x$Names      if (!is.null(names(x)) && !is.na(names(x)))
88      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- names(x)
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
89    
90  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
91      if (missing(i)) return(x)                     meta = CorpusMeta(),
92      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
93      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
94  }  }
95    
96  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
97      if (missing(i)) return(x)  function(x, i)
98      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
99        if (!missing(i)) {
100            x$content <- x$content[i]
101            x$dmeta <- x$dmeta[i, , drop = FALSE]
102        }
103        x
104  }  }
105    
106  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
107      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
108      counter <- 1  {
109      for (id in unclass(x)[i]) {      if (!missing(i)) {
110          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
111          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
112          counter <- counter + 1          if (!is.null(x$lazy))
113                x$lazy$index <- x$lazy$index[i]
114      }      }
115      x      x
116  }  }
117    
118  .map_name_index <- function(x, i) {  .map_name_index <-
119      if (is.character(i)) {  function(x, i)
120          if (is.null(names(x)))  {
121              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
122            match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
123          else          else
             match(i, names(x))  
     }  
124      i      i
125  }  }
126    
127  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
128    function(x, i)
129    {
130      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
131      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
132      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
133  }  }
134  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
135    function(x, i)
136    {
137      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
138      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
139      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
140          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
141  }  }
142    
143  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
144    function(x, i, value)
145    {
146      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
147      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
148      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
149      x      x
150  }  }
151  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
152    function(x, i, value)
153    {
154      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
155      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
156      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
157      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
158          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
159          x          x
160      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
161    
162      # Merge the DMetaData data frames  outer_union <-
163      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
164      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
165      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
166      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
167      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
168      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
169      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
170        if (ncol(res) == 0L)
171      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
172        res
173  }  }
174    
175  c.Corpus <-  c.VCorpus <-
176  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
177  {  {
178      args <- list(...)      args <- list(...)
179        x <- args[[1L]]
180    
181      if (identical(length(args), 0L))      if (length(args) == 1L)
182          return(x)          return(x)
183    
184      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
185          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
186    
187      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
188          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
189                         lapply(args, function(a) meta(a, type = "corpus"))),
190      l <- base::c(list(x), args)                                      class = "CorpusMeta"),
191      if (recursive)                     dmeta = Reduce(outer_union, lapply(args, meta))),
192          Reduce(c2, l)                class = c("VCorpus", "Corpus"))
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
193  }  }
194    
195  c.TextDocument <- function(x, ..., recursive = FALSE) {  as.list.PCorpus <- as.list.VCorpus <-
196      args <- list(...)  function(x, ...)
197        content(x)
198    
199      if (identical(length(args), 0L))  content.VCorpus <-
200          return(x)  function(x)
201    {
202      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!is.null(x$lazy))
203          stop("not all arguments are text documents")          .Call(copyCorpus, x, materialize(x))
204        x$content
     dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  
     .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  
205  }  }
206    
207  print.Corpus <- function(x, ...) {  content.PCorpus <-
208      cat(sprintf(ngettext(length(x),  function(x)
209                           "A corpus with %d text document\n",  {
210                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
211                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
212  }  }
213    
214  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
215      print(object)  function(x)
216      if (length(DMetaData(object)) > 0) {      length(x$content)
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
217    
218  inspect <- function(x) UseMethod("inspect", x)  print.PCorpus <- print.VCorpus <-
219  inspect.PCorpus <- function(x) {  function(x, ...)
220      summary(x)  {
221      cat("\n")      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
222      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                         class(x)[1],
223      show(filehash::dbMultiFetch(db, unlist(x)))                         length(x),
224  }                         length(meta(x, type = "corpus")),
225  inspect.VCorpus <- function(x) {                         ncol(meta(x, type = "indexed"))))
226      summary(x)      invisible(x)
     cat("\n")  
     print(noquote(lapply(x, identity)))  
227  }  }
228    
229  lapply.PCorpus <- function(X, FUN, ...) {  inspect <-
230      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
231      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      UseMethod("inspect", x)
232  }  inspect.PCorpus <- inspect.VCorpus <-
233  lapply.VCorpus <- function(X, FUN, ...) {  function(x)
234      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  {
235      if (!is.null(lazyTmMap))      print(x)
236          .Call("copyCorpus", X, materialize(X))      cat("\n")
237      base::lapply(X, FUN, ...)      print(noquote(content(x)))
238        invisible(x)
239  }  }
240    
241  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
242    function(x, path = ".", filenames = NULL)
243    {
244      filenames <- file.path(path,      filenames <- file.path(path,
245                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
246              sprintf("%s.txt", as.character(meta(x, "id", "local")))
247                             else filenames)                             else filenames)
248      i <- 1  
249      for (o in x) {      stopifnot(length(x) == length(filenames))
250          writeLines(as.PlainTextDocument(o), filenames[i])  
251          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
252      }  
253        invisible(x)
254  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1336

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge