SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1070, Tue May 18 08:58:22 2010 UTC revision 1333, Fri Apr 18 10:38:46 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = x$defaultreader, language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
9  }  
10  DBControl <- function(x) attr(x, "DBControl")      readerControl <- prepareReader(readerControl, x$defaultreader)
11    
12        if (is.function(readerControl$init))
13            readerControl$init()
14    
15  PCorpus <- function(x,      if (is.function(readerControl$exit))
16                      readerControl = list(reader = x$DefaultReader, language = "eng"),          on.exit(readerControl$exit())
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
17    
18      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
23      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
24    
25      counter <- 1      counter <- 1
26      while (!eoi(x)) {      while (!eoi(x)) {
27          x <- stepNext(x)          x <- stepNext(x)
28          elem <- getElem(x)          elem <- getElem(x)
29          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          id <- if (is.null(x$names) || is.na(x$names))
30          filehash::dbInsert(db, ID(doc), doc)              as.character(counter)
31          if (x$Length > 0) tdl[[counter]] <- ID(doc)          else
32          else tdl <- c(tdl, ID(doc))              x$names[counter]
33            doc <- readerControl$reader(elem, readerControl$language, id)
34            filehash::dbInsert(db, meta(doc, "id"), doc)
35            if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
36            else tdl <- c(tdl, meta(doc, "id"))
37          counter <- counter + 1          counter <- counter + 1
38      }      }
39      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
40            names(tdl) <- x$names
41    
42      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
43      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
44      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
45                       dbcontrol = dbControl),
46      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
47  }  }
48    
49  .VCorpus <- function(x, cmeta, dmeta) {  VCorpus <-
50      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
51      attr(x, "DMetaData") <- dmeta  {
52      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
53      x  
54  }      readerControl <- prepareReader(readerControl, x$defaultreader)
55    
56        if (is.function(readerControl$init))
57            readerControl$init()
58    
59  # Register S3 corpus classes to be recognized by S4 methods. This is      if (is.function(readerControl$exit))
60  # mainly a fix to be compatible with packages which were originally          on.exit(readerControl$exit())
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "eng"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
61    
62      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
63      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0) vector("list", as.integer(x$length)) else list()
         vector("list", as.integer(x$Length))  
     else  
         list()  
64    
65      if (x$Vectorized)      if (x$vectorized)
66          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id)
67                              readerControl$reader(elem, readerControl$language, id),
68                        pGetElem(x),                        pGetElem(x),
69                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names))
70                              as.character(seq_len(x$length))
71                          else x$names,
72                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
73      else {      else {
74          counter <- 1          counter <- 1
75          while (!eoi(x)) {          while (!eoi(x)) {
76              x <- stepNext(x)              x <- stepNext(x)
77              elem <- getElem(x)              elem <- getElem(x)
78              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              id <- if (is.null(x$names) || is.na(x$names))
79              if (x$Length > 0)                  as.character(counter)
80                else
81                    x$names[counter]
82                doc <- readerControl$reader(elem, readerControl$language, id)
83                if (x$length > 0)
84                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
85              else              else
86                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
87              counter <- counter + 1              counter <- counter + 1
88          }          }
89      }      }
90      names(tdl) <- x$Names      if (!is.null(x$names) && !is.na(x$names))
91      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)          names(tdl) <- x$names
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
   
 `[.PCorpus` <- function(x, i) {  
     if (missing(i)) return(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
 }  
92    
93  `[.VCorpus` <- function(x, i) {      structure(list(content = tdl,
94      if (missing(i)) return(x)                     meta = CorpusMeta(),
95      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])                     dmeta = data.frame(row.names = seq_along(tdl))),
96                  class = c("VCorpus", "Corpus"))
97  }  }
98    
99  `[<-.PCorpus` <- function(x, i, value) {  `[.PCorpus` <-
100      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
101      counter <- 1  {
102      for (id in unclass(x)[i]) {      if (!missing(i)) {
103          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
104          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
         counter <- counter + 1  
105      }      }
106      x      x
107  }  }
108    
109  `[[.PCorpus` <-  function(x, i) {  `[.VCorpus` <-
110      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
111      filehash::dbFetch(db, NextMethod("[["))  {
112  }      if (!missing(i)) {
113  `[[.VCorpus` <-  function(x, i) {          x$content <- x$content[i]
114      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")          x$dmeta <- x$dmeta[i, , drop = FALSE]
115      if (!is.null(lazyTmMap))          if (!is.null(x$lazy))
116          .Call("copyCorpus", x, materialize(x, i))              x$lazy$index <- x$lazy$index[i]
     NextMethod("[[")  
117  }  }
   
 `[[<-.PCorpus` <-  function(x, i, value) {  
     db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  
     index <- unclass(x)[[i]]  
     db[[index]] <- value  
118      x      x
119  }  }
 `[[<-.VCorpus` <-  function(x, i, value) {  
     # Mark new objects as not active for lazy mapping  
     lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")  
     if (!is.null(lazyTmMap)) {  
         lazyTmMap$index[i] <- FALSE  
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
120    
121              x$Children <- list(left, right)  .map_name_index <-
122          }  function(x, i)
123          level <<- level - 1  {
124          x      if (is.character(i))
125      }          match(i, if (is.null(names(x))) meta(x, "id", "local") else names(x))
126      list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)      else
127            i
128  }  }
129    
130  # Find indices to be updated for a CMetaData tree  `[[.PCorpus` <-
131  .find_indices <- function(x) {  function(x, i)
132      indices.mapping <- NULL  {
133      for (m in levels(as.factor(DMetaData(x)$MetaID))) {      i <- .map_name_index(x, i)
134          indices <- (DMetaData(x)$MetaID == m)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135          indices.mapping <- c(indices.mapping, list(m = indices))      filehash::dbFetch(db, x$content[[i]])
         names(indices.mapping)[length(indices.mapping)] <- m  
136      }      }
137      indices.mapping  `[[.VCorpus` <-
138    function(x, i)
139    {
140        i <- .map_name_index(x, i)
141        if (!is.null(x$lazy))
142            .Call(copyCorpus, x, materialize(x, i))
143        x$content[[i]]
144  }  }
145    
146  c2 <- function(x, y, ...) {  `[[<-.PCorpus` <-
147      # Update the CMetaData tree  function(x, i, value)
148      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  {
149      update.struct <- .update_id(cmeta)      i <- .map_name_index(x, i)
150        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
151      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)      db[[x$content[[i]]]] <- value
152        x
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
153      }      }
154    `[[<-.VCorpus` <-
155      # Find indices to be updated for the right tree  function(x, i, value)
156      indices.mapping <- .find_indices(y)  {
157        i <- .map_name_index(x, i)
158      # Update the DMetaData data frames for the right tree      # Mark new objects as inactive for lazy mapping
159      for (i in 1:ncol(update.struct$right.mapping)) {      if (!is.null(x$lazy))
160          map <- update.struct$right.mapping[,i]          x$lazy$index[i] <- FALSE
161          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])      x$content[[i]] <- value
162        x
163      }      }
164    
165      # Merge the DMetaData data frames  outer_union <-
166      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
167      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
168      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
169      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
170      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
171      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
172      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
173        if (ncol(res) == 0L)
174      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
175        res
176  }  }
177    
178  c.Corpus <-  c.VCorpus <-
179  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
180  {  {
181      args <- list(...)      args <- list(...)
182        x <- args[[1L]]
183    
184      if (identical(length(args), 0L))      if (length(args) == 1L)
185          return(x)          return(x)
186    
187      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
188          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
189    
190      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
191          stop("concatenation of corpora with underlying databases is not supported")                     meta = structure(do.call("c",
192                         lapply(args, function(a) meta(a, type = "corpus"))),
193      Reduce(c2, base::c(list(x), args))                                      class = "CorpusMeta"),
194                       dmeta = Reduce(outer_union, lapply(args, meta))),
195                  class = c("VCorpus", "Corpus"))
196  }  }
197    
198  c.TextDocument <- function(x, ..., recursive = FALSE) {  as.list.PCorpus <- as.list.VCorpus <-
199      args <- list(...)  function(x, ...)
200        content(x)
     if (identical(length(args), 0L))  
         return(x)  
   
     if (!all(unlist(lapply(args, inherits, class(x)))))  
         stop("not all arguments are text documents")  
201    
202      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.VCorpus <-
203      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
204    {
205        if (!is.null(x$lazy))
206            .Call(copyCorpus, x, materialize(x))
207        x$content
208  }  }
209    
210  print.Corpus <- function(x, ...) {  content.PCorpus <-
211      cat(sprintf(ngettext(length(x),  function(x)
212                           "A corpus with %d text document\n",  {
213                           "A corpus with %d text documents\n"),      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
214                  length(x)))      filehash::dbMultiFetch(db, unlist(x$content))
     invisible(x)  
215  }  }
216    
217  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
218      print(object)  function(x)
219      if (length(DMetaData(object)) > 0) {      length(x$content)
         cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
                              "\nThe metadata consists of %d tag-value pair and a data frame\n",  
                              "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
                     length(CMetaData(object)$MetaData)))  
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
220    
221  inspect <- function(x) UseMethod("inspect", x)  print.PCorpus <- print.VCorpus <-
222  inspect.PCorpus <- function(x) {  function(x, ...)
223      summary(x)  {
224      cat("\n")      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
225      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                         class(x)[1],
226      show(filehash::dbMultiFetch(db, unlist(x)))                         length(x),
227  }                         length(meta(x, type = "corpus")),
228  inspect.VCorpus <- function(x) {                         ncol(meta(x, type = "indexed"))))
229      summary(x)      invisible(x)
     cat("\n")  
     print(noquote(lapply(x, identity)))  
230  }  }
231    
232  lapply.PCorpus <- function(X, FUN, ...) {  inspect <-
233      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x)
234      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)      UseMethod("inspect", x)
235  }  inspect.PCorpus <- inspect.VCorpus <-
236  lapply.VCorpus <- function(X, FUN, ...) {  function(x)
237      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")  {
238      if (!is.null(lazyTmMap))      print(x)
239          .Call("copyCorpus", X, materialize(X))      cat("\n")
240      base::lapply(X, FUN, ...)      print(noquote(content(x)))
241        invisible(x)
242  }  }
243    
244  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
245    function(x, path = ".", filenames = NULL)
246    {
247      filenames <- file.path(path,      filenames <- file.path(path,
248                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
249              sprintf("%s.txt", as.character(meta(x, "id", "local")))
250                             else filenames)                             else filenames)
251      i <- 1  
252      for (o in x) {      stopifnot(length(x) == length(filenames))
253          writeLines(as.PlainTextDocument(o), filenames[i])  
254          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
255      }  
256        invisible(x)
257  }  }

Legend:
Removed from v.1070  
changed lines
  Added in v.1333

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge