SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 986, Tue Sep 1 15:33:30 2009 UTC revision 1409, Fri Feb 27 16:10:18 2015 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  prepareReader <- function(readerControl, defaultReader = NULL, ...) {  PCorpus <-
4      if (is.null(readerControl$reader))  function(x,
5          readerControl$reader <- defaultReader           readerControl = list(reader = reader(x), language = "en"),
6      if (inherits(readerControl$reader, "FunctionGenerator"))           dbControl = list(dbName = "", dbType = "DB1"))
7          readerControl$reader <- readerControl$reader(...)  {
8      if (is.null(readerControl$language))      stopifnot(inherits(x, "Source"))
         readerControl$language <- "eng"  
     readerControl  
 }  
   
 # Node ID, actual meta data, and possibly other nodes as children  
 .MetaDataNode <- function(nodeid = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {  
     structure(list(NodeID = nodeid, MetaData = meta, Children = children),  
               class = "MetaDataNode")  
 }  
   
 print.MetaDataNode <- function(x, ...)  
     print(x$MetaData)  
   
 .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  
     attr(x, "CMetaData") <- cmeta  
     attr(x, "DMetaData") <- dmeta  
     attr(x, "DBControl") <- dbcontrol  
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "eng"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      x <- open(x)
17      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
18      counter <- 1      counter <- 1
19      while (!eoi(x)) {      while (!eoi(x)) {
20          x <- stepNext(x)          x <- stepNext(x)
21          elem <- getElem(x)          elem <- getElem(x)
22          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))          doc <- readerControl$reader(elem,
23          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
24          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
25          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
26            tdl[[counter]] <- meta(doc, "id")
27          counter <- counter + 1          counter <- counter + 1
28      }      }
29        x <- close(x)
30    
31      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      p <- list(content = tdl,
32      filehash::dbInsert(db, "DMetaData", df)                meta = CorpusMeta(),
33      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                dmeta = data.frame(row.names = seq_along(tdl)),
34                  dbcontrol = dbControl)
35      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      class(p) <- c("PCorpus", "Corpus")
36        p
37  }  }
38    
39  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
40      attr(x, "CMetaData") <- cmeta  VCorpus <-
41      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
42      class(x) <- c("VCorpus", "Corpus", "list")  {
43      x      stopifnot(inherits(x, "Source"))
 }  
44    
45  # The "..." are additional arguments for the FunctionGenerator reader      readerControl <- prepareReader(readerControl, reader(x))
 VCorpus <- Corpus <- function(x,  
                     readerControl = list(reader = x$DefaultReader, language = "eng"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
46    
47      if (x$Vectorized)      x <- open(x)
48          mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      tdl <- vector("list", length(x))
49        # Check for parallel element access
50        if (is.function(getS3method("pGetElem", class(x), TRUE)))
51            tdl <- mapply(function(elem, id)
52                              readerControl$reader(elem, readerControl$language, id),
53                 pGetElem(x),                 pGetElem(x),
54                 id = as.character(seq_len(x$Length)),                        id = as.character(seq_along(x)),
55                 SIMPLIFY = FALSE)                 SIMPLIFY = FALSE)
56      else {      else {
57          counter <- 1          counter <- 1
58          while (!eoi(x)) {          while (!eoi(x)) {
59              x <- stepNext(x)              x <- stepNext(x)
60              elem <- getElem(x)              elem <- getElem(x)
61              doc <- readerControl$reader(elem, readerControl$language, as.character(counter))              doc <- readerControl$reader(elem,
62              if (x$Length > 0)                                          readerControl$language,
63                                            as.character(counter))
64                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
65              counter <- counter + 1              counter <- counter + 1
66          }          }
67      }      }
68        x <- close(x)
69    
70      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      as.VCorpus(tdl)
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
   
 `[.PCorpus` <- function(x, i) {  
     if (missing(i)) return(x)  
     cmeta <- CMetaData(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     dbcontrol <- DBControl(x)  
     class(x) <- "list"  
     .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)  
 }  
   
 `[.VCorpus` <- function(x, i) {  
     if (missing(i)) return(x)  
     cmeta <- CMetaData(x)  
     dmeta <- DMetaData(x)[i, , drop = FALSE]  
     class(x) <- "list"  
     .VCorpus(x[i, drop = FALSE], cmeta, dmeta)  
71  }  }
72    
73  `[<-.PCorpus` <- function(x, i, value) {  `[.PCorpus` <-
74      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
75      counter <- 1  {
76      for (id in unclass(x)[i]) {      if (!missing(i)) {
77          if (identical(length(value), 1)) db[[id]] <- value          x$content <- x$content[i]
78          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
         counter <- counter + 1  
79      }      }
80      x      x
81  }  }
82    
83  `[[.PCorpus` <-  function(x, i) {  `[.VCorpus` <-
84      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
85      class(x) <- "list"  {
86      filehash::dbFetch(db, x[[i]])      if (!missing(i)) {
87  }          x$content <- x$content[i]
88  `[[.VCorpus` <-  function(x, i) {          x$dmeta <- x$dmeta[i, , drop = FALSE]
89      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")          if (!is.null(x$lazy))
90      if (!is.null(lazyTmMap))              x$lazy$index <- x$lazy$index[i]
         .Call("copyCorpus", x, materialize(x, i))  
     class(x) <- "list"  
     x[[i]]  
 }  
   
 `[[<-.PCorpus` <-  function(x, i, value) {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     index <- unclass(x)[[i]]  
     db[[index]] <- value  
     x  
91  }  }
 `[[<-.VCorpus` <-  function(x, i, value) {  
     # Mark new objects as not active for lazy mapping  
     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")  
     if (!is.null(lazyTmMap)) {  
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     class(x) <- "list"  
     x[[i]] <- value  
     class(x) <- cl  
92      x      x
93  }  }
94    
95  # Update \code{NodeID}s of a CMetaData tree  .map_name_index <-
96  update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  function(x, i)
97      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s  {
98      set_id <- function(x) {      if (is.character(i))
99          attrs <- attributes(x)          match(i, meta(x, "id", "local"))
100          x <- id      else
101          attributes(x) <- attrs          i
         id <<- id + 1  
         level <<- level + 1  
         if (length(attr(x, "Children")) > 0) {  
             mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))  
             left <- set_id(attr(x, "Children")[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))  
             right <- set_id(attr(x, "Children")[[2]])  
   
             attr(x, "Children") <- list(left, right)  
         }  
         level <<- level - 1  
         x  
102      }      }
103    
104      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  `[[.PCorpus` <-
105    function(x, i)
106    {
107        i <- .map_name_index(x, i)
108        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
109        filehash::dbFetch(db, x$content[[i]])
110  }  }
111    `[[.VCorpus` <-
112  c2 <- function(x, y, ...) {  function(x, i)
113      # Update the CMetaData tree  {
114      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))      i <- .map_name_index(x, i)
115      update.struct <- update_id(cmeta)      if (!is.null(x$lazy))
116            .Call(copyCorpus, x, materialize(x, i))
117      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      x$content[[i]]
   
     # Find indices to be updated for the left tree  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
118      }      }
119    
120      # Update the DMetaData data frames for the left tree  `[[<-.PCorpus` <-
121      for (i in 1:ncol(update.struct$left.mapping)) {  function(x, i, value)
122          map <- update.struct$left.mapping[,i]  {
123          DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      i <- .map_name_index(x, i)
124        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
125        db[[x$content[[i]]]] <- value
126        x
127      }      }
128    `[[<-.VCorpus` <-
129      # Find indices to be updated for the right tree  function(x, i, value)
130      indices.mapping <- NULL  {
131      for (m in levels(as.factor(DMetaData(y)$MetaID))) {      i <- .map_name_index(x, i)
132          indices <- (DMetaData(y)$MetaID == m)      # Mark new objects as inactive for lazy mapping
133          indices.mapping <- c(indices.mapping, list(m = indices))      if (!is.null(x$lazy))
134          names(indices.mapping)[length(indices.mapping)] <- m          x$lazy$index[i] <- FALSE
135        x$content[[i]] <- value
136        x
137      }      }
138    
139      # Update the DMetaData data frames for the right tree  as.list.PCorpus <- as.list.VCorpus <-
140      for (i in 1:ncol(update.struct$right.mapping)) {  function(x, ...)
141          map <- update.struct$right.mapping[,i]      content(x)
142          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
143    as.VCorpus <-
144    function(x)
145        UseMethod("as.VCorpus")
146    as.VCorpus.VCorpus <- identity
147    as.VCorpus.list <-
148    function(x)
149    {
150        v <- list(content = x,
151                  meta = CorpusMeta(),
152                  dmeta = data.frame(row.names = seq_along(x)))
153        class(v) <- c("VCorpus", "Corpus")
154        v
155      }      }
156    
157      # Merge the DMetaData data frames  outer_union <-
158      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
159      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
160      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
161      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
162      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
163      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
164      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
165        if (ncol(res) == 0L)
166      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
167        res
168  }  }
169    
170  c.Corpus <-  c.VCorpus <-
171  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
172  {  {
173      args <- list(...)      args <- list(...)
174        x <- args[[1L]]
175    
176      if (identical(length(args), 0))      if (length(args) == 1L)
177          return(x)          return(x)
178    
179      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
180          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
181    
182      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
183          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
184                    lapply(args, function(a) meta(a, type = "corpus")))),
185      Reduce(c2, base::c(list(x), args))                dmeta = Reduce(outer_union, lapply(args, meta)))
186        class(v) <- c("VCorpus", "Corpus")
187        v
188  }  }
189    
190  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
191      args <- list(...)  function(x)
192    {
193      if (identical(length(args), 0))      if (!is.null(x$lazy))
194          return(x)          .Call(copyCorpus, x, materialize(x))
195        x$content
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
   
     dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  
     .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  
196  }  }
197    
198  print.Corpus <- function(x, ...) {  content.PCorpus <-
199      cat(sprintf(ngettext(length(x),  function(x)
200                           "A corpus with %d text document\n",  {
201                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
202                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
203  }  }
204    
205  summary.Corpus <- function(x, ...) {  inspect <-
206    function(x)
207        UseMethod("inspect", x)
208    inspect.PCorpus <- inspect.VCorpus <-
209    function(x)
210    {
211      print(x)      print(x)
     if (length(DMetaData(x)) > 0) {  
         cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(attr(CMetaData(x), "MetaData"))))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
   
 inspect <- function(x) UseMethod("inspect", x)  
 inspect.PCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
212      cat("\n")      cat("\n")
213      print(noquote(lapply(x, identity)))      print(noquote(content(x)))
214        invisible(x)
215  }  }
216    
217  # No metadata is checked  length.PCorpus <- length.VCorpus <-
218  `%IN%` <- function(x, y) UseMethod("%IN%", y)  function(x)
219  `%IN%.PCorpus` <- function(x, y) {      length(x$content)
220      db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])  
221      any(unlist(lapply(y, function(x, z) {x %in% Content(z)}, x)))  names.PCorpus <- names.VCorpus <-
222  }  function(x)
223  `%IN%.VCorpus` <- function(x, y) x %in% y      as.character(meta(x, "id", "local"))
224    
225  lapply.PCorpus <- function(X, FUN, ...) {  `names<-.PCorpus` <- `names<-.VCorpus` <-
226      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x, value)
227      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  {
228        meta(x, "id", "local") <- as.character(value)
229        x
230  }  }
231  lapply.VCorpus <- function(X, FUN, ...) {  
232      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  print.PCorpus <- print.VCorpus <-
233      if (!is.null(lazyTmMap))  function(x, ...)
234          .Call("copyCorpus", X, materialize(X))  {
235      base::lapply(X, FUN, ...)      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
236                           class(x)[1],
237                           length(x),
238                           length(meta(x, type = "corpus")),
239                           ncol(meta(x, type = "indexed"))))
240        invisible(x)
241  }  }
242    
243  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
244    function(x, path = ".", filenames = NULL)
245    {
246      filenames <- file.path(path,      filenames <- file.path(path,
247                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
248              sprintf("%s.txt", as.character(meta(x, "id", "local")))
249                             else filenames)                             else filenames)
250      i <- 1  
251      for (o in x) {      stopifnot(length(x) == length(filenames))
252          writeLines(as.PlainTextDocument(o), filenames[i])  
253          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
254      }  
255        invisible(x)
256  }  }

Legend:
Removed from v.986  
changed lines
  Added in v.1409

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge