SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1383, Thu May 29 07:32:14 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
   
 PCorpus <- function(x,  
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     if (is.function(readerControl$init))  
         readerControl$init()  
9    
10      if (is.function(readerControl$exit))      readerControl <- prepareReader(readerControl, reader(x))
         on.exit(readerControl$exit())  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
17      counter <- 1      counter <- 1
18      while (!eoi(x)) {      while (!eoi(x)) {
19          x <- stepNext(x)          x <- stepNext(x)
20          elem <- getElem(x)          elem <- getElem(x)
21          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
22          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
23          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
24          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
25            tdl[[counter]] <- meta(doc, "id")
26          counter <- counter + 1          counter <- counter + 1
27      }      }
     names(tdl) <- x$Names  
28    
29      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
30      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
31      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
32                       dbcontrol = dbControl),
33      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
34  }  }
35    
36  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
37      attr(x, "CMetaData") <- cmeta  VCorpus <-
38      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
39      class(x) <- c("VCorpus", "Corpus", "list")  {
40      x      stopifnot(inherits(x, "Source"))
 }  
41    
42  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     if (is.function(readerControl$init))  
         readerControl$init()  
   
     if (is.function(readerControl$exit))  
         on.exit(readerControl$exit())  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
43    
44      if (x$Vectorized)      tdl <- vector("list", length(x))
45          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      # Check for parallel element access
46        if (is.function(getS3method("pGetElem", class(x), TRUE)))
47            tdl <- mapply(function(elem, id)
48                              readerControl$reader(elem, readerControl$language, id),
49                        pGetElem(x),                        pGetElem(x),
50                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
51                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
52      else {      else {
53          counter <- 1          counter <- 1
54          while (!eoi(x)) {          while (!eoi(x)) {
55              x <- stepNext(x)              x <- stepNext(x)
56              elem <- getElem(x)              elem <- getElem(x)
57              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
58              if (x$Length > 0)                                          readerControl$language,
59                                            as.character(counter))
60                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
61              counter <- counter + 1              counter <- counter + 1
62          }          }
63      }      }
     names(tdl) <- x$Names  
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
64    
65  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
66      if (missing(i)) return(x)                     meta = CorpusMeta(),
67      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
68      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
69  }  }
70    
71  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
72      if (missing(i)) return(x)  function(x, i)
73      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
74        if (!missing(i)) {
75            x$content <- x$content[i]
76            x$dmeta <- x$dmeta[i, , drop = FALSE]
77        }
78        x
79  }  }
80    
81  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
82      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
83      counter <- 1  {
84      for (id in unclass(x)[i]) {      if (!missing(i)) {
85          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
86          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
87          counter <- counter + 1          if (!is.null(x$lazy))
88                x$lazy$index <- x$lazy$index[i]
89      }      }
90      x      x
91  }  }
92    
93  .map_name_index <- function(x, i) {  .map_name_index <-
94      if (is.character(i)) {  function(x, i)
95          if (is.null(names(x)))  {
96              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
97            match(i, meta(x, "id", "local"))
98          else          else
             match(i, names(x))  
     }  
99      i      i
100  }  }
101    
102  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
103    function(x, i)
104    {
105      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
106      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
107      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
108  }  }
109  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
110    function(x, i)
111    {
112      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
113      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
114      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
115          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
116  }  }
117    
118  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
119    function(x, i, value)
120    {
121      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
122      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
123      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
124      x      x
125  }  }
126  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
127    function(x, i, value)
128    {
129      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
130      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
131      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
132      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
133          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
134          x          x
135      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
136    
137  # Find indices to be updated for a CMetaData tree  as.list.PCorpus <- as.list.VCorpus <-
138  .find_indices <- function(x) {  function(x, ...)
139      indices.mapping <- NULL      content(x)
140      for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
141          indices <- (DMetaData(x)$MetaID == m)  as.VCorpus <-
142          indices.mapping <- c(indices.mapping, list(m = indices))  function(x)
143          names(indices.mapping)[length(indices.mapping)] <- m      UseMethod("as.VCorpus")
144      }  as.VCorpus.VCorpus <- identity
     indices.mapping  
 }  
145    
146  c2 <- function(x, y, ...) {  outer_union <-
147      # Update the CMetaData tree  function(x, y, ...)
148      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  {
149      update.struct <- .update_id(cmeta)      if (nrow(x) > 0L)
150            x[, setdiff(names(y), names(x))] <- NA
151      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      if (nrow(y) > 0L)
152            y[, setdiff(names(x), names(y))] <- NA
153      # Find indices to be updated for the left tree      res <- rbind(x, y)
154      indices.mapping <- .find_indices(x)      if (ncol(res) == 0L)
155            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
156      # Update the DMetaData data frames for the left tree      res
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
157  }  }
158    
159  c.Corpus <-  c.VCorpus <-
160  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
161  {  {
162      args <- list(...)      args <- list(...)
163        x <- args[[1L]]
164    
165      if (identical(length(args), 0L))      if (length(args) == 1L)
166          return(x)          return(x)
167    
168      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
169          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
170    
171      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
172          stop("concatenation of corpora with underlying databases is not supported")                     meta = CorpusMeta(meta = do.call("c",
173                         lapply(args, function(a) meta(a, type = "corpus")))),
174      l <- base::c(list(x), args)                     dmeta = Reduce(outer_union, lapply(args, meta))),
175      if (recursive)                class = c("VCorpus", "Corpus"))
         Reduce(c2, l)  
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
176  }  }
177    
178  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
179      args <- list(...)  function(x)
180    {
181      if (identical(length(args), 0L))      if (!is.null(x$lazy))
182          return(x)          .Call(copyCorpus, x, materialize(x))
183        x$content
184      if (!all(unlist(lapply(args, inherits, class(x)))))  }
         stop("not all arguments are text documents")  
185    
186      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
187      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
188    {
189        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
190        filehash::dbMultiFetch(db, unlist(x$content))
191  }  }
192    
193  print.Corpus <- function(x, ...) {  inspect <-
194      cat(sprintf(ngettext(length(x),  function(x)
195                           "A corpus with %d text document\n",      UseMethod("inspect", x)
196                           "A corpus with %d text documents\n"),  inspect.PCorpus <- inspect.VCorpus <-
197                  length(x)))  function(x)
198    {
199        print(x)
200        cat("\n")
201        print(noquote(content(x)))
202      invisible(x)      invisible(x)
203  }  }
204    
205  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
206      print(object)  function(x)
207      if (length(DMetaData(object)) > 0) {      length(x$content)
208          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
209                               "\nThe metadata consists of %d tag-value pair and a data frame\n",  names.PCorpus <- names.VCorpus <-
210                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  function(x)
211                      length(CMetaData(object)$MetaData)))      as.character(meta(x, "id", "local"))
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
212    
213  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
214  inspect.PCorpus <- function(x) {  function(x, value)
215      summary(x)  {
216      cat("\n")      meta(x, "id", "local") <- as.character(value)
217      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
218  }  }
219    
220  lapply.PCorpus <- function(X, FUN, ...) {  print.PCorpus <- print.VCorpus <-
221      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x, ...)
222      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  {
223  }      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
224  lapply.VCorpus <- function(X, FUN, ...) {                         class(x)[1],
225      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                         length(x),
226      if (!is.null(lazyTmMap))                         length(meta(x, type = "corpus")),
227          .Call("copyCorpus", X, materialize(X))                         ncol(meta(x, type = "indexed"))))
228      base::lapply(X, FUN, ...)      invisible(x)
229  }  }
230    
231  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
232    function(x, path = ".", filenames = NULL)
233    {
234      filenames <- file.path(path,      filenames <- file.path(path,
235                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
236              sprintf("%s.txt", as.character(meta(x, "id", "local")))
237                             else filenames)                             else filenames)
238      i <- 1  
239      for (o in x) {      stopifnot(length(x) == length(filenames))
240          writeLines(as.PlainTextDocument(o), filenames[i])  
241          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
242      }  
243        invisible(x)
244  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1383

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge