SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1242, Mon Aug 19 05:33:57 2013 UTC revision 1419, Sat May 2 17:23:47 2015 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
 .PCorpus <-  
 function(x, cmeta, dmeta, dbcontrol)  
 {  
     attr(x, "CMetaData") <- cmeta  
     attr(x, "DMetaData") <- dmeta  
     attr(x, "DBControl") <- dbcontrol  
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
   
 DBControl <-  
 function(x)  
     attr(x, "DBControl")  
   
3  PCorpus <-  PCorpus <-
4  function(x,  function(x,
5           readerControl = list(reader = x$DefaultReader, language = "en"),           readerControl = list(reader = reader(x), language = "en"),
6           dbControl = list(dbName = "", dbType = "DB1"),           dbControl = list(dbName = "", dbType = "DB1"))
          ...)  
7  {  {
8      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      stopifnot(inherits(x, "Source"))
9    
10      if (is.function(readerControl$init))      readerControl <- prepareReader(readerControl, reader(x))
         readerControl$init()  
   
     if (is.function(readerControl$exit))  
         on.exit(readerControl$exit())  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      x <- open(x)
17      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
18      counter <- 1      counter <- 1
19      while (!eoi(x)) {      while (!eoi(x)) {
20          x <- stepNext(x)          x <- stepNext(x)
21          elem <- getElem(x)          elem <- getElem(x)
22          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
23          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
24          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
25          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
26            tdl[[counter]] <- meta(doc, "id")
27          counter <- counter + 1          counter <- counter + 1
28      }      }
29      names(tdl) <- x$Names      x <- close(x)
30    
31      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      p <- list(content = tdl,
32      filehash::dbInsert(db, "DMetaData", df)                meta = CorpusMeta(),
33      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                dmeta = data.frame(row.names = seq_along(tdl)),
34                  dbcontrol = dbControl)
35      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      class(p) <- c("PCorpus", "Corpus")
36        p
37  }  }
38    
 .VCorpus <-  
 function(x, cmeta, dmeta)  
 {  
     attr(x, "CMetaData") <- cmeta  
     attr(x, "DMetaData") <- dmeta  
     class(x) <- c("VCorpus", "Corpus", "list")  
     x  
 }  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <-  
39  Corpus <-  Corpus <-
40  function(x,  VCorpus <-
41           readerControl = list(reader = x$DefaultReader, language = "en"),  function(x, readerControl = list(reader = reader(x), language = "en"))
          ...)  
42  {  {
43      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      stopifnot(inherits(x, "Source"))
   
     if (is.function(readerControl$init))  
         readerControl$init()  
44    
45      if (is.function(readerControl$exit))      readerControl <- prepareReader(readerControl, reader(x))
         on.exit(readerControl$exit())  
46    
47      # Allocate memory in advance if length is known      x <- open(x)
48      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
49          vector("list", as.integer(x$Length))      # Check for parallel element access
50      else      if (is.function(getS3method("pGetElem", class(x), TRUE)))
51          list()          tdl <- mapply(function(elem, id)
52                              readerControl$reader(elem, readerControl$language, id),
     if (x$Vectorized)  
         tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),  
53                        pGetElem(x),                        pGetElem(x),
54                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
55                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
56      else {      else {
57          counter <- 1          counter <- 1
58          while (!eoi(x)) {          while (!eoi(x)) {
59              x <- stepNext(x)              x <- stepNext(x)
60              elem <- getElem(x)              elem <- getElem(x)
61              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
62              if (x$Length > 0)                                          readerControl$language,
63                                            as.character(counter))
64                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
65              counter <- counter + 1              counter <- counter + 1
66          }          }
67      }      }
68      names(tdl) <- x$Names      x <- close(x)
69      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
70      .VCorpus(tdl, .MetaDataNode(), df)      as.VCorpus(tdl)
71  }  }
72    
73  `[.PCorpus` <-  `[.PCorpus` <-
74  function(x, i)  function(x, i)
75  {  {
76      if (missing(i)) return(x)      if (!missing(i)) {
77      index <- attr(x, "DMetaData")[[1 , "subset"]]          x$content <- x$content[i]
78      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i          x$dmeta <- x$dmeta[i, , drop = FALSE]
79      dmeta <- attr(x, "DMetaData")      }
80      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))      x
81  }  }
82    
83  `[.VCorpus` <-  `[.VCorpus` <-
84  function(x, i)  function(x, i)
85  {  {
86      if (missing(i)) return(x)      if (!missing(i)) {
87      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])          x$content <- x$content[i]
88  }          x$dmeta <- x$dmeta[i, , drop = FALSE]
89            if (!is.null(x$lazy))
90  `[<-.PCorpus` <-              x$lazy$index <- x$lazy$index[i]
 function(x, i, value)  
 {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     counter <- 1  
     for (id in unclass(x)[i]) {  
         if (identical(length(value), 1L)) db[[id]] <- value  
         else db[[id]] <- value[[counter]]  
         counter <- counter + 1  
91      }      }
92      x      x
93  }  }
# Line 143  Line 95 
95  .map_name_index <-  .map_name_index <-
96  function(x, i)  function(x, i)
97  {  {
98      if (is.character(i)) {      if (is.character(i))
99          if (is.null(names(x)))          match(i, meta(x, "id", "local"))
             match(i, meta(x, "ID", type = "local"))  
100          else          else
             match(i, names(x))  
     }  
101      i      i
102  }  }
103    
# Line 156  Line 105 
105  function(x, i)  function(x, i)
106  {  {
107      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
108      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
109      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
110  }  }
111  `[[.VCorpus` <-  `[[.VCorpus` <-
112  function(x, i)  function(x, i)
113  {  {
114      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
115      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
116      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
117          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
118  }  }
119    
120  `[[<-.PCorpus` <-  `[[<-.PCorpus` <-
121  function(x, i, value)  function(x, i, value)
122  {  {
123      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
124      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
125      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
126      x      x
127  }  }
128  `[[<-.VCorpus` <-  `[[<-.VCorpus` <-
129  function(x, i, value)  function(x, i, value)
130  {  {
131      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
132      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
133      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
134      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
135          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <-  
 function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)  
 {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
136          x          x
137      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
138    
139  # Find indices to be updated for a CMetaData tree  as.list.PCorpus <- as.list.VCorpus <-
140  .find_indices <-  function(x, ...)
141        setNames(content(x), as.character(lapply(content(x), meta, "id")))
142    
143    as.VCorpus <-
144    function(x)
145        UseMethod("as.VCorpus")
146    as.VCorpus.VCorpus <- identity
147    as.VCorpus.list <-
148  function(x)  function(x)
149  {  {
150      indices.mapping <- NULL      v <- list(content = x,
151      for (m in levels(as.factor(DMetaData(x)$MetaID))) {                meta = CorpusMeta(),
152          indices <- (DMetaData(x)$MetaID == m)                dmeta = data.frame(row.names = seq_along(x)))
153          indices.mapping <- c(indices.mapping, list(m = indices))      class(v) <- c("VCorpus", "Corpus")
154          names(indices.mapping)[length(indices.mapping)] <- m      v
     }  
     indices.mapping  
155  }  }
156    
157  c2 <-  outer_union <-
158  function(x, y, ...)  function(x, y, ...)
159  {  {
160      # Update the CMetaData tree      if (nrow(x) > 0L)
161      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
162      update.struct <- .update_id(cmeta)      if (nrow(y) > 0L)
163            y[, setdiff(names(x), names(y))] <- NA
164      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      res <- rbind(x, y)
165        if (ncol(res) == 0L)
166      # Find indices to be updated for the left tree          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
167      indices.mapping <- .find_indices(x)      res
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
168      }      }
169    
170      # Find indices to be updated for the right tree  c.VCorpus <-
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Merge the DMetaData data frames  
     labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  
     na.matrix <- matrix(NA,  
                         nrow = nrow(DMetaData(x)),  
                         ncol = length(labels),  
                         dimnames = list(row.names(DMetaData(x)), labels))  
     x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
     labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  
     na.matrix <- matrix(NA,  
                         nrow = nrow(DMetaData(y)),  
                         ncol = length(labels),  
                         dimnames = list(row.names(DMetaData(y)), labels))  
     y.dmeta.aug <- cbind(DMetaData(y), na.matrix)  
     DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  
   
     new  
 }  
   
 c.Corpus <-  
171  function(..., recursive = FALSE)  function(..., recursive = FALSE)
172  {  {
173      args <- list(...)      args <- list(...)
# Line 292  Line 179 
179      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
180          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
181    
182      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
183          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
184                    lapply(args, function(a) meta(a, type = "corpus")))),
185      if (recursive)                dmeta = Reduce(outer_union, lapply(args, meta)))
186          Reduce(c2, args)      class(v) <- c("VCorpus", "Corpus")
187      else {      v
         args <- do.call("c", lapply(args, unclass))  
         .VCorpus(args,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(args)),  
                                     stringsAsFactors = FALSE))  
     }  
 }  
   
 c.TextDocument <-  
 function(..., recursive = FALSE)  
 {  
     args <- list(...)  
     x <- args[[1L]]  
   
     if(length(args) == 1L)  
         return(x)  
   
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
   
     dmeta <- data.frame(MetaID = rep(0, length(args)),  
                         stringsAsFactors = FALSE)  
     .VCorpus(args, .MetaDataNode(), dmeta)  
188  }  }
189    
190  print.Corpus <-  content.VCorpus <-
191  function(x, ...)  function(x)
192  {  {
193      cat(sprintf(ngettext(length(x),      if (!is.null(x$lazy))
194                           "A corpus with %d text document\n",          .Call(copyCorpus, x, materialize(x))
195                           "A corpus with %d text documents\n"),      x$content
                 length(x)))  
     invisible(x)  
196  }  }
197    
198  summary.Corpus <-  content.PCorpus <-
199  function(object, ...)  function(x)
200  {  {
201      print(object)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
202      if (length(DMetaData(object)) > 0) {      filehash::dbMultiFetch(db, unlist(x$content))
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
203  }  }
204    
205  inspect <-  inspect <-
206  function(x)  function(x)
207      UseMethod("inspect", x)      UseMethod("inspect", x)
208  inspect.PCorpus <-  inspect.PCorpus <- inspect.VCorpus <-
209  function(x)  function(x)
210  {  {
211      summary(x)      print(x)
212      cat("\n")      cat("\n")
213      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      print(noquote(content(x)))
214      show(filehash::dbMultiFetch(db, unlist(x)))      invisible(x)
215  }  }
216  inspect.VCorpus <-  
217    length.PCorpus <- length.VCorpus <-
218  function(x)  function(x)
219  {      length(x$content)
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
 }  
220    
221  lapply.PCorpus <-  names.PCorpus <- names.VCorpus <-
222  function(X, FUN, ...)  function(x)
223        as.character(meta(x, "id", "local"))
224    
225    `names<-.PCorpus` <- `names<-.VCorpus` <-
226    function(x, value)
227  {  {
228      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])      meta(x, "id", "local") <- as.character(value)
229      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      x
230  }  }
231  lapply.VCorpus <-  
232  function(X, FUN, ...)  format.PCorpus <- format.VCorpus <-
233    function(x, ...)
234  {  {
235      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      c(sprintf("<<%s>>", class(x)[1L]),
236      if (!is.null(lazyTmMap))        sprintf("Metadata:  corpus specific: %d, document level (indexed): %d",
237          .Call("copyCorpus", X, materialize(X))                length(meta(x, type = "corpus")),
238      base::lapply(X, FUN, ...)                ncol(meta(x, type = "indexed"))),
239          sprintf("Content:  documents: %d", length(x)))
240  }  }
241    
242  writeCorpus <-  writeCorpus <-
243  function(x, path = ".", filenames = NULL)  function(x, path = ".", filenames = NULL)
244  {  {
245      filenames <- file.path(path,      filenames <- file.path(path,
246                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
247              sprintf("%s.txt", as.character(meta(x, "id", "local")))
248                             else filenames)                             else filenames)
249      i <- 1  
250      for (o in x) {      stopifnot(length(x) == length(filenames))
251          writeLines(as.PlainTextDocument(o), filenames[i])  
252          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
253      }  
254        invisible(x)
255  }  }

Legend:
Removed from v.1242  
changed lines
  Added in v.1419

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge