SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1108, Fri Oct 22 18:32:47 2010 UTC revision 1328, Tue Apr 15 09:46:28 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = x$defaultreader, language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, x$defaultreader)
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "en"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$names) || is.na(x$names))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              x$names[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
36            else tdl <- c(tdl, meta(doc, "id"))
37          counter <- counter + 1          counter <- counter + 1
38      }      }
39      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
40            names(tdl) <- x$names
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
41    
42      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      structure(list(content = tdl,
43                       meta = CorpusMeta(),
44                       dmeta = data.frame(row.names = seq_along(tdl)),
45                       dbcontrol = dbControl),
46                  class = c("PCorpus", "Corpus"))
47  }  }
48    
49  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <- Corpus <-
50      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
51      attr(x, "DMetaData") <- dmeta  {
52      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
53      x  
54  }      readerControl <- prepareReader(readerControl, x$defaultreader)
55    
56        if (is.function(readerControl$init))
57            readerControl$init()
58    
59  # Register S3 corpus classes to be recognized by S4 methods. This is      if (is.function(readerControl$exit))
60  # mainly a fix to be compatible with packages which were originally          on.exit(readerControl$exit())
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
61    
62      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
63      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
64    
65      if (x$Vectorized)      if (x$vectorized)
66          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names))
70                              as.character(seq_len(x$length))
71                          else x$names,
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$names) || is.na(x$names))
79              if (x$Length > 0)                  as.character(counter)
80                else
81                    x$names[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                if (x$length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
86                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- x$names
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
92    
93  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
94      if (missing(i)) return(x)                     meta = CorpusMeta(),
95      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
96      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
97  }  }
98    
99  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
100      if (missing(i)) return(x)  function(x, i)
101      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
102        if (!missing(i)) {
103            x$content <- x$content[i]
104            x$dmeta <- x$dmeta[i, , drop = FALSE]
105        }
106        x
107  }  }
108    
109  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
110      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
111      counter <- 1  {
112      for (id in unclass(x)[i]) {      if (!missing(i)) {
113          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
114          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
115          counter <- counter + 1          if (!is.null(x$lazy))
116                x$lazy$index <- x$lazy$index[i]
117      }      }
118      x      x
119  }  }
120    
121  .map_name_index <- function(x, i) {  .map_name_index <-
122      if (is.character(i)) {  function(x, i)
123          if (is.null(names(x)))  {
124              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
125            match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
126          else          else
             match(i, names(x))  
     }  
127      i      i
128  }  }
129    
130  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
131    function(x, i)
132    {
133      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
134      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
136  }  }
137  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
138    function(x, i)
139    {
140      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
141      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
142      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
143          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
144  }  }
145    
146  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
147    function(x, i, value)
148    {
149      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
150      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
151      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
152      x      x
153  }  }
154  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
155    function(x, i, value)
156    {
157      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
158      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
159      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
160      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
161          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
162          x          x
163      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
164    
165      # Merge the DMetaData data frames  outer_union <-
166      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
167      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
168      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
169      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
170      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
171      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
172      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
173        if (ncol(res) == 0L)
174      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
175        res
176  }  }
177    
178  c.Corpus <-  c.VCorpus <-
179  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
180  {  {
181      args <- list(...)      args <- list(...)
182        x <- args[[1L]]
183    
184      if (identical(length(args), 0L))      if (length(args) == 1L)
185          return(x)          return(x)
186    
187      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
188          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
189    
190      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
191          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
192                         lapply(args, function(a) meta(a, type = "corpus"))),
193      l <- base::c(list(x), args)                                      class = "CorpusMeta"),
194      if (recursive)                     dmeta = Reduce(outer_union, lapply(args, meta))),
195          Reduce(c2, l)                class = c("VCorpus", "Corpus"))
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
196  }  }
197    
198  c.TextDocument <- function(x, ..., recursive = FALSE) {  as.list.PCorpus <- as.list.VCorpus <-
199      args <- list(...)  function(x, ...)
200        content(x)
     if (identical(length(args), 0L))  
         return(x)  
201    
202      if (!all(unlist(lapply(args, inherits, class(x)))))  content.VCorpus <-
203          stop("not all arguments are text documents")  function(x)
204    {
205        if (!is.null(x$lazy))
206            .Call(copyCorpus, x, materialize(x))
207        x$content
208    }
209    
210      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
211      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
212    {
213        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
214        filehash::dbMultiFetch(db, unlist(x$content))
215  }  }
216    
217  print.Corpus <- function(x, ...) {  length.PCorpus <- length.VCorpus <-
218    function(x)
219        length(x$content)
220    
221    print.PCorpus <- print.VCorpus <-
222    function(x, ...)
223    {
224      cat(sprintf(ngettext(length(x),      cat(sprintf(ngettext(length(x),
225                           "A corpus with %d text document\n",                           "A corpus with %d text document\n\n",
226                           "A corpus with %d text documents\n"),                           "A corpus with %d text documents\n\n"),
227                  length(x)))                  length(x)))
     invisible(x)  
 }  
228    
229  summary.Corpus <- function(object, ...) {      meta <- meta(x, type = "corpus")
230      print(object)      dmeta <- meta(x, type = "indexed")
     if (length(DMetaData(object)) > 0) {  
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
231    
232  inspect <- function(x) UseMethod("inspect", x)      cat("Metadata:\n")
233  inspect.PCorpus <- function(x) {      cat(sprintf("  Tag-value pairs. Tags: %s\n",
234      summary(x)                  paste(names(meta), collapse = " ")))
235      cat("\n")      cat("  Data frame. Variables:", colnames(dmeta), "\n")
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
 }  
236    
237  lapply.PCorpus <- function(X, FUN, ...) {      invisible(x)
     db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  
     lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  
238  }  }
239  lapply.VCorpus <- function(X, FUN, ...) {  
240      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  inspect <-
241      if (!is.null(lazyTmMap))  function(x)
242          .Call("copyCorpus", X, materialize(X))      UseMethod("inspect", x)
243      base::lapply(X, FUN, ...)  inspect.PCorpus <- inspect.VCorpus <-
244    function(x)
245    {
246        print(x)
247        cat("\n")
248        print(noquote(content(x)))
249        invisible(x)
250  }  }
251    
252  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
253    function(x, path = ".", filenames = NULL)
254    {
255      filenames <- file.path(path,      filenames <- file.path(path,
256                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
257              sprintf("%s.txt", as.character(meta(x, "id", "local")))
258                             else filenames)                             else filenames)
259      i <- 1  
260      for (o in x) {      stopifnot(length(x) == length(filenames))
261          writeLines(as.PlainTextDocument(o), filenames[i])  
262          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
263      }  
264        invisible(x)
265  }  }

Legend:
Removed from v.1108  
changed lines
  Added in v.1328

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge