SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1063, Fri Apr 9 10:36:39 2010 UTC revision 1313, Sun Mar 30 09:28:00 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = x$defaultreader, language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, x$defaultreader)
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "eng"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
24          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
25      else      else
26          list()          list()
27    
# Line 29  Line 29 
29      while (!eoi(x)) {      while (!eoi(x)) {
30          x <- stepNext(x)          x <- stepNext(x)
31          elem <- getElem(x)          elem <- getElem(x)
32          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          id <- if (is.null(x$names) || is.na(x$names))
33          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
34          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
35          else tdl <- c(tdl, ID(doc))              x$names[counter]
36            doc <- readerControl$reader(elem, readerControl$language, id)
37            filehash::dbInsert(db, meta(doc, "id"), doc)
38            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
39            else tdl <- c(tdl, meta(doc, "id"))
40          counter <- counter + 1          counter <- counter + 1
41      }      }
42        if (!is.null(x$names) && !is.na(x$names))
43            names(tdl) <- x$names
44    
45      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
46      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
47      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
48                       dbcontrol = dbControl),
49      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
50  }  }
51    
52  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <- Corpus <-
53      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
54      attr(x, "DMetaData") <- dmeta  {
55      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
     x  
 }  
56    
57  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, x$defaultreader)
58  # mainly a fix to be compatible with packages which were originally  
59  # developed to cooperate with corresponding S4 tm classes. Necessary      if (is.function(readerControl$init))
60  # since tm's class architecture was changed to S3 since tm version 0.5.          readerControl$init()
61  setOldClass(c("VCorpus", "Corpus", "list"))  
62        if (is.function(readerControl$exit))
63  # The "..." are additional arguments for the FunctionGenerator reader          on.exit(readerControl$exit())
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "eng"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
64    
65      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
66      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
67          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
68      else      else
69          list()          list()
70    
71      if (x$Vectorized)      if (x$vectorized)
72          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id)
73                              readerControl$reader(elem, readerControl$language, id),
74                        pGetElem(x),                        pGetElem(x),
75                        id = as.character(seq_len(x$Length)),                        id = if (is.null(x$names) || is.na(x$names))
76                              as.character(seq_len(x$length))
77                          else x$names,
78                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
79      else {      else {
80          counter <- 1          counter <- 1
81          while (!eoi(x)) {          while (!eoi(x)) {
82              x <- stepNext(x)              x <- stepNext(x)
83              elem <- getElem(x)              elem <- getElem(x)
84              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              id <- if (is.null(x$names) || is.na(x$names))
85              if (x$Length > 0)                  as.character(counter)
86                else
87                    x$names[counter]
88                doc <- readerControl$reader(elem, readerControl$language, id)
89                if (x$length > 0)
90                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
91              else              else
92                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
93              counter <- counter + 1              counter <- counter + 1
94          }          }
95      }      }
96      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
97      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- x$names
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
98    
99  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
100      if (missing(i)) return(x)                     meta = CorpusMeta(),
101      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
102      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
103  }  }
104    
105  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <- `[.VCorpus` <-
106      if (missing(i)) return(x)  function(x, i)
107      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
108  }      if (!missing(i)) {
109            x$content <- x$content[i]
110  `[<-.PCorpus` <- function(x, i, value) {          x$dmeta <- x$dmeta[i, , drop = FALSE]
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     counter <- 1  
     for (id in unclass(x)[i]) {  
         if (identical(length(value), 1L)) db[[id]] <- value  
         else db[[id]] <- value[[counter]]  
         counter <- counter + 1  
111      }      }
112      x      x
113  }  }
114    
115  `[[.PCorpus` <-  function(x, i) {  .map_name_index <-
116      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
117      filehash::dbFetch(db, NextMethod("[["))  {
118        if (is.character(i))
119            match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
120        else
121            i
122    }
123    
124    `[[.PCorpus` <-
125    function(x, i)
126    {
127        i <- .map_name_index(x, i)
128        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
129        filehash::dbFetch(db, x$content[[i]])
130  }  }
131  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
132    function(x, i)
133    {
134        i <- .map_name_index(x, i)
135      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
136      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
137          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
138      NextMethod("[[")      x$content[[i]]
139  }  }
140    
141  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
142      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i, value)
143      index <- unclass(x)[[i]]  {
144      db[[index]] <- value      i <- .map_name_index(x, i)
145        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
146        db[[x$content[[i]]]] <- value
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
150    function(x, i, value)
151    {
152        i <- .map_name_index(x, i)
153      # Mark new objects as not active for lazy mapping      # Mark new objects as not active for lazy mapping
154      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
155      if (!is.null(lazyTmMap)) {      if (!is.null(lazyTmMap)) {
156          lazyTmMap$index[i] <- FALSE          lazyTmMap$index[i] <- FALSE
157          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
158      }      }
159      # Set the value      x$content[[i]] <- value
160      cl <- class(x)      x
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
161  }  }
162    
163  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
164  .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  .update_id <-
165    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
166    {
167      # Traversal of (binary) CMetaData tree with setup of NodeIDs      # Traversal of (binary) CMetaData tree with setup of NodeIDs
168      set_id <- function(x) {      set_id <- function(x) {
169          x$NodeID <- id          x$NodeID <- id
170          id <<- id + 1          id <<- id + 1
171          level <<- level + 1          level <<- level + 1
172          if (length(x$Children) > 0) {          if (length(x$Children)) {
173              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
174              left <- set_id(x$Children[[1]])              left <- set_id(x$Children[[1]])
175              if (level == 1) {              if (level == 1) {
# Line 172  Line 188 
188  }  }
189    
190  # Find indices to be updated for a CMetaData tree  # Find indices to be updated for a CMetaData tree
191  .find_indices <- function(x) {  .find_indices <-
192    function(x)
193    {
194      indices.mapping <- NULL      indices.mapping <- NULL
195      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(CorpusDMeta(x)$MetaID))) {
196          indices <- (DMetaData(x)$MetaID == m)          indices <- (CorpusDMeta(x)$MetaID == m)
197          indices.mapping <- c(indices.mapping, list(m = indices))          indices.mapping <- c(indices.mapping, list(m = indices))
198          names(indices.mapping)[length(indices.mapping)] <- m          names(indices.mapping)[length(indices.mapping)] <- m
199      }      }
200      indices.mapping      indices.mapping
201  }  }
202    
203  c2 <- function(x, y, ...) {  #c2 <-
204      # Update the CMetaData tree  #function(x, y, ...)
205      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  #{
206      update.struct <- .update_id(cmeta)  #    # Update the CMetaData tree
207    #    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
208      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  #    update.struct <- .update_id(cmeta)
209    #
210      # Find indices to be updated for the left tree  #    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
211      indices.mapping <- .find_indices(x)  #
212    #    # Find indices to be updated for the left tree
213      # Update the DMetaData data frames for the left tree  #    indices.mapping <- .find_indices(x)
214      for (i in 1:ncol(update.struct$left.mapping)) {  #
215          map <- update.struct$left.mapping[,i]  #    # Update the CorpusDMeta data frames for the left tree
216          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #    for (i in 1:ncol(update.struct$left.mapping)) {
217      }  #        map <- update.struct$left.mapping[,i]
218    #        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
219      # Find indices to be updated for the right tree  #    }
220      indices.mapping <- .find_indices(y)  #
221    #    # Find indices to be updated for the right tree
222      # Update the DMetaData data frames for the right tree  #    indices.mapping <- .find_indices(y)
223      for (i in 1:ncol(update.struct$right.mapping)) {  #
224          map <- update.struct$right.mapping[,i]  #    # Update the CorpusDMeta data frames for the right tree
225          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #    for (i in 1:ncol(update.struct$right.mapping)) {
226      }  #        map <- update.struct$right.mapping[,i]
227    #        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
228      # Merge the DMetaData data frames  #    }
229      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  #
230      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  #    # Merge the CorpusDMeta data frames
231      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  #    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
232      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  #    na.matrix <- matrix(NA,
233      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  #                        nrow = nrow(DMetaData(x)),
234      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  #                        ncol = length(labels),
235      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  #                        dimnames = list(row.names(DMetaData(x)), labels))
236    #    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
237    #    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
238    #    na.matrix <- matrix(NA,
239    #                        nrow = nrow(DMetaData(y)),
240    #                        ncol = length(labels),
241    #                        dimnames = list(row.names(DMetaData(y)), labels))
242    #    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
243    #    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
244    #
245    #    new
246    #}
247    
248      new  c.VCorpus <-
249  }  function(..., recursive = FALSE)
   
 c.Corpus <-  
 function(x, ..., recursive = FALSE)  
250  {  {
251      args <- list(...)      args <- list(...)
252        x <- args[[1L]]
253    
254      if (identical(length(args), 0L))      if (length(args) == 1L)
255          return(x)          return(x)
256    
257      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
258          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
259    
260      if (inherits(x, "PCorpus"))      if (recursive)
261          stop("concatenation of corpora with underlying databases is not supported")          Reduce(c2, args)
262        else {
263      Reduce(c2, base::c(list(x), args))          args <- do.call("c", lapply(args, content))
264            structure(list(content = args,
265                           meta = CorpusMeta(),
266                           dmeta = data.frame(row.names = seq_along(args))),
267                      class = c("VCorpus", "Corpus"))
268        }
269  }  }
270    
271  c.TextDocument <- function(x, ..., recursive = FALSE) {  c.TextDocument <-
272    function(..., recursive = FALSE)
273    {
274      args <- list(...)      args <- list(...)
275        x <- args[[1L]]
276    
277      if (identical(length(args), 0L))      if (length(args) == 1L)
278          return(x)          return(x)
279    
280      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
281          stop("not all arguments are text documents")          stop("not all arguments are text documents")
282    
283      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      structure(list(content = args,
284      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)                     meta = CorpusMeta(),
285                       dmeta = data.frame(row.names = seq_along(args))),
286                  class = c("VCorpus", "Corpus"))
287  }  }
288    
289  print.Corpus <- function(x, ...) {  as.list.PCorpus <- as.list.VCorpus <-
290      cat(sprintf(ngettext(length(x),  function(x, ...)
291                           "A corpus with %d text document\n",      content(x)
                          "A corpus with %d text documents\n"),  
                 length(x)))  
     invisible(x)  
 }  
292    
293  summary.Corpus <- function(object, ...) {  content.VCorpus <-
294      print(object)  function(x)
295      if (length(DMetaData(object)) > 0) {  {
296          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
297                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      if (!is.null(lazyTmMap))
298                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),          .Call("copyCorpus", x, materialize(x))
299                      length(CMetaData(object)$MetaData)))      x$content
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
300  }  }
301    
302  inspect <- function(x) UseMethod("inspect", x)  content.PCorpus <-
303  inspect.PCorpus <- function(x) {  function(x)
304      summary(x)  {
305      cat("\n")      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
306      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      filehash::dbMultiFetch(db, unlist(x$content))
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
307  }  }
308    
309  lapply.PCorpus <- function(X, FUN, ...) {  length.PCorpus <- length.VCorpus <-
310      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
311      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      length(x$content)
312    
313    print.PCorpus <- print.VCorpus <-
314    function(x, ...)
315    {
316        cat(sprintf(ngettext(length(x),
317                             "A corpus with %d text document\n\n",
318                             "A corpus with %d text documents\n\n"),
319                    length(x)))
320    
321        meta <- meta(x, type = "corpus")
322        dmeta <- meta(x, type = "indexed")
323    
324        cat("Metadata:\n")
325        cat(sprintf("  Tag-value pairs. Tags: %s\n",
326                    paste(names(meta), collapse = " ")))
327        cat("  Data frame. Variables:", colnames(dmeta), "\n")
328    
329        invisible(x)
330  }  }
331  lapply.VCorpus <- function(X, FUN, ...) {  
332      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  inspect <-
333      if (!is.null(lazyTmMap))  function(x)
334          .Call("copyCorpus", X, materialize(X))      UseMethod("inspect", x)
335      base::lapply(X, FUN, ...)  inspect.PCorpus <- inspect.VCorpus <-
336    function(x)
337    {
338        print(x)
339        cat("\n")
340        print(noquote(content(x)))
341        invisible(x)
342  }  }
343    
344  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
345    function(x, path = ".", filenames = NULL)
346    {
347      filenames <- file.path(path,      filenames <- file.path(path,
348                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
349              sprintf("%s.txt", as.character(meta(x, "id", "local")))
350                             else filenames)                             else filenames)
351      i <- 1  
352      for (o in x) {      stopifnot(length(x) == length(filenames))
353          writeLines(as.PlainTextDocument(o), filenames[i])  
354          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
355      }  
356        invisible(x)
357  }  }

Legend:
Removed from v.1063  
changed lines
  Added in v.1313

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge