SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1258, Fri Sep 20 12:15:42 2013 UTC revision 1313, Sun Mar 30 09:28:00 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
 .PCorpus <-  
 function(x, cmeta, dmeta, dbcontrol)  
 {  
     attr(x, "CMetaData") <- cmeta  
     attr(x, "DMetaData") <- dmeta  
     attr(x, "DBControl") <- dbcontrol  
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
   
 DBControl <-  
 function(x)  
     attr(x, "DBControl")  
   
3  PCorpus <-  PCorpus <-
4  function(x,  function(x,
5           readerControl = list(reader = x$DefaultReader, language = "en"),           readerControl = list(reader = x$defaultreader, language = "en"),
6           dbControl = list(dbName = "", dbType = "DB1"),           dbControl = list(dbName = "", dbType = "DB1"))
          ...)  
7  {  {
8      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      stopifnot(inherits(x, "Source"))
9    
10        readerControl <- prepareReader(readerControl, x$defaultreader)
11    
12      if (is.function(readerControl$init))      if (is.function(readerControl$init))
13          readerControl$init()          readerControl$init()
# Line 33  Line 20 
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
24          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
25      else      else
26          list()          list()
27    
# Line 42  Line 29 
29      while (!eoi(x)) {      while (!eoi(x)) {
30          x <- stepNext(x)          x <- stepNext(x)
31          elem <- getElem(x)          elem <- getElem(x)
32          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$names) || is.na(x$names))
33          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
34          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
35          else tdl <- c(tdl, ID(doc))              x$names[counter]
36            doc <- readerControl$reader(elem, readerControl$language, id)
37            filehash::dbInsert(db, meta(doc, "id"), doc)
38            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
39            else tdl <- c(tdl, meta(doc, "id"))
40          counter <- counter + 1          counter <- counter + 1
41      }      }
42      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
43            names(tdl) <- x$names
44    
45      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
46      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
47      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
48                       dbcontrol = dbControl),
49      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
50  }  }
51    
52  .VCorpus <-  VCorpus <- Corpus <-
53  function(x, cmeta, dmeta)  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
54  {  {
55      attr(x, "CMetaData") <- cmeta      stopifnot(inherits(x, "Source"))
     attr(x, "DMetaData") <- dmeta  
     class(x) <- c("VCorpus", "Corpus", "list")  
     x  
 }  
56    
57  # The "..." are additional arguments for the FunctionGenerator reader      readerControl <- prepareReader(readerControl, x$defaultreader)
 VCorpus <-  
 Corpus <-  
 function(x,  
          readerControl = list(reader = x$DefaultReader, language = "en"),  
          ...)  
 {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
58    
59      if (is.function(readerControl$init))      if (is.function(readerControl$init))
60          readerControl$init()          readerControl$init()
# Line 82  Line 63 
63          on.exit(readerControl$exit())          on.exit(readerControl$exit())
64    
65      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
66      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
67          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
68      else      else
69          list()          list()
70    
71      if (x$Vectorized)      if (x$vectorized)
72          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id)
73                              readerControl$reader(elem, readerControl$language, id),
74                        pGetElem(x),                        pGetElem(x),
75                        id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names))
76                              as.character(seq_len(x$length))
77                          else x$names,
78                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
79      else {      else {
80          counter <- 1          counter <- 1
81          while (!eoi(x)) {          while (!eoi(x)) {
82              x <- stepNext(x)              x <- stepNext(x)
83              elem <- getElem(x)              elem <- getElem(x)
84              id <- if (is.null(x$Names) || is.na(x$Names))              id <- if (is.null(x$names) || is.na(x$names))
85                  as.character(counter)                  as.character(counter)
86              else              else
87                  x$Names[counter]                  x$names[counter]
88              doc <- readerControl$reader(elem, readerControl$language, id)              doc <- readerControl$reader(elem, readerControl$language, id)
89              if (x$Length > 0)              if (x$length > 0)
90                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
91              else              else
92                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
93              counter <- counter + 1              counter <- counter + 1
94          }          }
95      }      }
96      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
97      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- x$names
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
98    
99  `[.PCorpus` <-      structure(list(content = tdl,
100  function(x, i)                     meta = CorpusMeta(),
101  {                     dmeta = data.frame(row.names = seq_along(tdl))),
102      if (missing(i)) return(x)                class = c("VCorpus", "Corpus"))
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
103  }  }
104    
105  `[.VCorpus` <-  `[.PCorpus` <- `[.VCorpus` <-
106  function(x, i)  function(x, i)
107  {  {
108      if (missing(i)) return(x)      if (!missing(i)) {
109      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])          x$content <- x$content[i]
110  }          x$dmeta <- x$dmeta[i, , drop = FALSE]
   
 `[<-.PCorpus` <-  
 function(x, i, value)  
 {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     counter <- 1  
     for (id in unclass(x)[i]) {  
         if (identical(length(value), 1L)) db[[id]] <- value  
         else db[[id]] <- value[[counter]]  
         counter <- counter + 1  
111      }      }
112      x      x
113  }  }
# Line 147  Line 115 
115  .map_name_index <-  .map_name_index <-
116  function(x, i)  function(x, i)
117  {  {
118      if (is.character(i)) {      if (is.character(i))
119          if (is.null(names(x)))          match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
             match(i, meta(x, "ID", type = "local"))  
120          else          else
             match(i, names(x))  
     }  
121      i      i
122  }  }
123    
# Line 160  Line 125 
125  function(x, i)  function(x, i)
126  {  {
127      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
128      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
129      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
130  }  }
131  `[[.VCorpus` <-  `[[.VCorpus` <-
132  function(x, i)  function(x, i)
# Line 170  Line 135 
135      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
136      if (!is.null(lazyTmMap))      if (!is.null(lazyTmMap))
137          .Call("copyCorpus", x, materialize(x, i))          .Call("copyCorpus", x, materialize(x, i))
138      NextMethod("[[")      x$content[[i]]
139  }  }
140    
141  `[[<-.PCorpus` <-  `[[<-.PCorpus` <-
142  function(x, i, value)  function(x, i, value)
143  {  {
144      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
145      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
146      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
147      x      x
148  }  }
149  `[[<-.VCorpus` <-  `[[<-.VCorpus` <-
# Line 192  Line 156 
156          lazyTmMap$index[i] <- FALSE          lazyTmMap$index[i] <- FALSE
157          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
158      }      }
159      # Set the value      x$content[[i]] <- value
160      cl <- class(x)      x
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
161  }  }
162    
163  # Update NodeIDs of a CMetaData tree  # Update NodeIDs of a CMetaData tree
# Line 208  Line 169 
169          x$NodeID <- id          x$NodeID <- id
170          id <<- id + 1          id <<- id + 1
171          level <<- level + 1          level <<- level + 1
172          if (length(x$Children) > 0) {          if (length(x$Children)) {
173              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))              mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
174              left <- set_id(x$Children[[1]])              left <- set_id(x$Children[[1]])
175              if (level == 1) {              if (level == 1) {
# Line 231  Line 192 
192  function(x)  function(x)
193  {  {
194      indices.mapping <- NULL      indices.mapping <- NULL
195      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      for (m in levels(as.factor(CorpusDMeta(x)$MetaID))) {
196          indices <- (DMetaData(x)$MetaID == m)          indices <- (CorpusDMeta(x)$MetaID == m)
197          indices.mapping <- c(indices.mapping, list(m = indices))          indices.mapping <- c(indices.mapping, list(m = indices))
198          names(indices.mapping)[length(indices.mapping)] <- m          names(indices.mapping)[length(indices.mapping)] <- m
199      }      }
200      indices.mapping      indices.mapping
201  }  }
202    
203  c2 <-  #c2 <-
204  function(x, y, ...)  #function(x, y, ...)
205  {  #{
206      # Update the CMetaData tree  #    # Update the CMetaData tree
207      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  #    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
208      update.struct <- .update_id(cmeta)  #    update.struct <- .update_id(cmeta)
209    #
210      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  #    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
211    #
212      # Find indices to be updated for the left tree  #    # Find indices to be updated for the left tree
213      indices.mapping <- .find_indices(x)  #    indices.mapping <- .find_indices(x)
214    #
215      # Update the DMetaData data frames for the left tree  #    # Update the CorpusDMeta data frames for the left tree
216      for (i in 1:ncol(update.struct$left.mapping)) {  #    for (i in 1:ncol(update.struct$left.mapping)) {
217          map <- update.struct$left.mapping[,i]  #        map <- update.struct$left.mapping[,i]
218          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
219      }  #    }
220    #
221      # Find indices to be updated for the right tree  #    # Find indices to be updated for the right tree
222      indices.mapping <- .find_indices(y)  #    indices.mapping <- .find_indices(y)
223    #
224      # Update the DMetaData data frames for the right tree  #    # Update the CorpusDMeta data frames for the right tree
225      for (i in 1:ncol(update.struct$right.mapping)) {  #    for (i in 1:ncol(update.struct$right.mapping)) {
226          map <- update.struct$right.mapping[,i]  #        map <- update.struct$right.mapping[,i]
227          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  #        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
228      }  #    }
229    #
230      # Merge the DMetaData data frames  #    # Merge the CorpusDMeta data frames
231      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  #    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
232      na.matrix <- matrix(NA,  #    na.matrix <- matrix(NA,
233                          nrow = nrow(DMetaData(x)),  #                        nrow = nrow(DMetaData(x)),
234                          ncol = length(labels),  #                        ncol = length(labels),
235                          dimnames = list(row.names(DMetaData(x)), labels))  #                        dimnames = list(row.names(DMetaData(x)), labels))
236      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  #    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
237      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  #    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
238      na.matrix <- matrix(NA,  #    na.matrix <- matrix(NA,
239                          nrow = nrow(DMetaData(y)),  #                        nrow = nrow(DMetaData(y)),
240                          ncol = length(labels),  #                        ncol = length(labels),
241                          dimnames = list(row.names(DMetaData(y)), labels))  #                        dimnames = list(row.names(DMetaData(y)), labels))
242      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  #    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
243      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  #    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
244    #
245      new  #    new
246  }  #}
247    
248  c.Corpus <-  c.VCorpus <-
249  function(..., recursive = FALSE)  function(..., recursive = FALSE)
250  {  {
251      args <- list(...)      args <- list(...)
# Line 296  Line 257 
257      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
258          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
259    
     if (inherits(x, "PCorpus"))  
         stop("concatenation of corpora with underlying databases is not supported")  
   
260      if (recursive)      if (recursive)
261          Reduce(c2, args)          Reduce(c2, args)
262      else {      else {
263          args <- do.call("c", lapply(args, unclass))          args <- do.call("c", lapply(args, content))
264          .VCorpus(args,          structure(list(content = args,
265                   cmeta = .MetaDataNode(),                         meta = CorpusMeta(),
266                   dmeta = data.frame(MetaID = rep(0, length(args)),                         dmeta = data.frame(row.names = seq_along(args))),
267                                      stringsAsFactors = FALSE))                    class = c("VCorpus", "Corpus"))
268      }      }
269  }  }
270    
# Line 322  Line 280 
280      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
281          stop("not all arguments are text documents")          stop("not all arguments are text documents")
282    
283      dmeta <- data.frame(MetaID = rep(0, length(args)),      structure(list(content = args,
284                          stringsAsFactors = FALSE)                     meta = CorpusMeta(),
285      .VCorpus(args, .MetaDataNode(), dmeta)                     dmeta = data.frame(row.names = seq_along(args))),
286                  class = c("VCorpus", "Corpus"))
287  }  }
288    
289  print.Corpus <-  as.list.PCorpus <- as.list.VCorpus <-
290  function(x, ...)  function(x, ...)
291        content(x)
292    
293    content.VCorpus <-
294    function(x)
295  {  {
296      cat(sprintf(ngettext(length(x),      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
297                           "A corpus with %d text document\n",      if (!is.null(lazyTmMap))
298                           "A corpus with %d text documents\n"),          .Call("copyCorpus", x, materialize(x))
299                  length(x)))      x$content
     invisible(x)  
300  }  }
301    
302  summary.Corpus <-  content.PCorpus <-
303  function(object, ...)  function(x)
304  {  {
305      print(object)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
306      if (length(DMetaData(object)) > 0) {      filehash::dbMultiFetch(db, unlist(x$content))
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
307      }      }
308    
309    length.PCorpus <- length.VCorpus <-
310    function(x)
311        length(x$content)
312    
313    print.PCorpus <- print.VCorpus <-
314    function(x, ...)
315    {
316        cat(sprintf(ngettext(length(x),
317                             "A corpus with %d text document\n\n",
318                             "A corpus with %d text documents\n\n"),
319                    length(x)))
320    
321        meta <- meta(x, type = "corpus")
322        dmeta <- meta(x, type = "indexed")
323    
324        cat("Metadata:\n")
325        cat(sprintf("  Tag-value pairs. Tags: %s\n",
326                    paste(names(meta), collapse = " ")))
327        cat("  Data frame. Variables:", colnames(dmeta), "\n")
328    
329        invisible(x)
330  }  }
331    
332  inspect <-  inspect <-
333  function(x)  function(x)
334      UseMethod("inspect", x)      UseMethod("inspect", x)
335  inspect.PCorpus <-  inspect.PCorpus <- inspect.VCorpus <-
336  function(x)  function(x)
337  {  {
338      summary(x)      print(x)
339      cat("\n")      cat("\n")
340      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      print(noquote(content(x)))
341      show(filehash::dbMultiFetch(db, unlist(x)))      invisible(x)
 }  
 inspect.VCorpus <-  
 function(x)  
 {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
 }  
   
 lapply.PCorpus <-  
 function(X, FUN, ...)  
 {  
     db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  
     lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  
 }  
 lapply.VCorpus <-  
 function(X, FUN, ...)  
 {  
     lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  
     if (!is.null(lazyTmMap))  
         .Call("copyCorpus", X, materialize(X))  
     base::lapply(X, FUN, ...)  
342  }  }
343    
344  writeCorpus <-  writeCorpus <-
345  function(x, path = ".", filenames = NULL)  function(x, path = ".", filenames = NULL)
346  {  {
347      filenames <- file.path(path,      filenames <- file.path(path,
348                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
349              sprintf("%s.txt", as.character(meta(x, "id", "local")))
350                             else filenames)                             else filenames)
351      i <- 1  
352      for (o in x) {      stopifnot(length(x) == length(filenames))
353          writeLines(as.PlainTextDocument(o), filenames[i])  
354          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
355      }  
356        invisible(x)
357  }  }

Legend:
Removed from v.1258  
changed lines
  Added in v.1313

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge