SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1379, Tue May 27 17:55:29 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
9    
10  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
11    
12      if (is.function(readerControl$init))      if (is.function(readerControl$init))
13          readerControl$init()          readerControl$init()
# Line 25  Line 19 
19          stop("error in creating database")          stop("error in creating database")
20      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
23      counter <- 1      counter <- 1
24      while (!eoi(x)) {      while (!eoi(x)) {
25          x <- stepNext(x)          x <- stepNext(x)
26          elem <- getElem(x)          elem <- getElem(x)
27          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
28          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
29          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
30          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
31            tdl[[counter]] <- meta(doc, "id")
32          counter <- counter + 1          counter <- counter + 1
33      }      }
     names(tdl) <- x$Names  
34    
35      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      structure(list(content = tdl,
36      filehash::dbInsert(db, "DMetaData", df)                     meta = CorpusMeta(),
37      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))                     dmeta = data.frame(row.names = seq_along(tdl)),
38                       dbcontrol = dbControl),
39      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)                class = c("PCorpus", "Corpus"))
40  }  }
41    
42  .VCorpus <- function(x, cmeta, dmeta) {  Corpus <-
43      attr(x, "CMetaData") <- cmeta  VCorpus <-
44      attr(x, "DMetaData") <- dmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
45      class(x) <- c("VCorpus", "Corpus", "list")  {
46      x      stopifnot(inherits(x, "Source"))
 }  
47    
48  # Register S3 corpus classes to be recognized by S4 methods. This is      readerControl <- prepareReader(readerControl, reader(x))
 # mainly a fix to be compatible with packages which were originally  
 # developed to cooperate with corresponding S4 tm classes. Necessary  
 # since tm's class architecture was changed to S3 since tm version 0.5.  
 setOldClass(c("VCorpus", "Corpus", "list"))  
   
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
49    
50      if (is.function(readerControl$init))      if (is.function(readerControl$init))
51          readerControl$init()          readerControl$init()
# Line 75  Line 53 
53      if (is.function(readerControl$exit))      if (is.function(readerControl$exit))
54          on.exit(readerControl$exit())          on.exit(readerControl$exit())
55    
56      # Allocate memory in advance if length is known      tdl <- vector("list", length(x))
57      tdl <- if (x$Length > 0)      # Check for parallel element access
58          vector("list", as.integer(x$Length))      if (is.function(getS3method("pGetElem", class(x), TRUE)))
59      else          tdl <- mapply(function(elem, id)
60          list()                            readerControl$reader(elem, readerControl$language, id),
   
     if (x$Vectorized)  
         tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),  
61                        pGetElem(x),                        pGetElem(x),
62                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
63                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
64      else {      else {
65          counter <- 1          counter <- 1
66          while (!eoi(x)) {          while (!eoi(x)) {
67              x <- stepNext(x)              x <- stepNext(x)
68              elem <- getElem(x)              elem <- getElem(x)
69              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
70              if (x$Length > 0)                                          readerControl$language,
71                                            as.character(counter))
72                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
73              counter <- counter + 1              counter <- counter + 1
74          }          }
75      }      }
     names(tdl) <- x$Names  
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
76    
77  `[.PCorpus` <- function(x, i) {      structure(list(content = tdl,
78      if (missing(i)) return(x)                     meta = CorpusMeta(),
79      index <- attr(x, "DMetaData")[[1 , "subset"]]                     dmeta = data.frame(row.names = seq_along(tdl))),
80      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i                class = c("VCorpus", "Corpus"))
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
81  }  }
82    
83  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
84      if (missing(i)) return(x)  function(x, i)
85      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  {
86        if (!missing(i)) {
87            x$content <- x$content[i]
88            x$dmeta <- x$dmeta[i, , drop = FALSE]
89        }
90        x
91  }  }
92    
93  `[<-.PCorpus` <- function(x, i, value) {  `[.VCorpus` <-
94      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  function(x, i)
95      counter <- 1  {
96      for (id in unclass(x)[i]) {      if (!missing(i)) {
97          if (identical(length(value), 1L)) db[[id]] <- value          x$content <- x$content[i]
98          else db[[id]] <- value[[counter]]          x$dmeta <- x$dmeta[i, , drop = FALSE]
99          counter <- counter + 1          if (!is.null(x$lazy))
100                x$lazy$index <- x$lazy$index[i]
101      }      }
102      x      x
103  }  }
104    
105  .map_name_index <- function(x, i) {  .map_name_index <-
106      if (is.character(i)) {  function(x, i)
107          if (is.null(names(x)))  {
108              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
109            match(i, meta(x, "id", "local"))
110          else          else
             match(i, names(x))  
     }  
111      i      i
112  }  }
113    
114  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
115    function(x, i)
116    {
117      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
118      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
119      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
120  }  }
121  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
122    function(x, i)
123    {
124      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
125      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
126      if (!is.null(lazyTmMap))          .Call(copyCorpus, x, materialize(x, i))
127          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
128  }  }
129    
130  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
131    function(x, i, value)
132    {
133      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
134      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
136      x      x
137  }  }
138  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
139    function(x, i, value)
140    {
141      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
142      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
143      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
144      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
145          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
146          x          x
147      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
148    
149      # Merge the DMetaData data frames  as.list.PCorpus <- as.list.VCorpus <-
150      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, ...)
151      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))      content(x)
152      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)  
153      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))  as.VCorpus <-
154      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))  function(x)
155      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)      UseMethod("as.VCorpus")
156      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)  as.VCorpus.VCorpus <- identity
157    
158      new  outer_union <-
159    function(x, y, ...)
160    {
161        if (nrow(x) > 0L)
162            x[, setdiff(names(y), names(x))] <- NA
163        if (nrow(y) > 0L)
164            y[, setdiff(names(x), names(y))] <- NA
165        res <- rbind(x, y)
166        if (ncol(res) == 0L)
167            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
168        res
169  }  }
170    
171  c.Corpus <-  c.VCorpus <-
172  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
173  {  {
174      args <- list(...)      args <- list(...)
175        x <- args[[1L]]
176    
177      if (identical(length(args), 0L))      if (length(args) == 1L)
178          return(x)          return(x)
179    
180      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
181          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
182    
183      if (inherits(x, "PCorpus"))      structure(list(content = do.call("c", lapply(args, content)),
184          stop("concatenation of corpora with underlying databases is not supported")                     meta = CorpusMeta(meta = do.call("c",
185                         lapply(args, function(a) meta(a, type = "corpus")))),
186      l <- base::c(list(x), args)                     dmeta = Reduce(outer_union, lapply(args, meta))),
187      if (recursive)                class = c("VCorpus", "Corpus"))
         Reduce(c2, l)  
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
188  }  }
189    
190  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
191      args <- list(...)  function(x)
192    {
193      if (identical(length(args), 0L))      if (!is.null(x$lazy))
194          return(x)          .Call(copyCorpus, x, materialize(x))
195        x$content
196      if (!all(unlist(lapply(args, inherits, class(x)))))  }
         stop("not all arguments are text documents")  
197    
198      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
199      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
200    {
201        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
202        filehash::dbMultiFetch(db, unlist(x$content))
203  }  }
204    
205  print.Corpus <- function(x, ...) {  inspect <-
206      cat(sprintf(ngettext(length(x),  function(x)
207                           "A corpus with %d text document\n",      UseMethod("inspect", x)
208                           "A corpus with %d text documents\n"),  inspect.PCorpus <- inspect.VCorpus <-
209                  length(x)))  function(x)
210    {
211        print(x)
212        cat("\n")
213        print(noquote(content(x)))
214      invisible(x)      invisible(x)
215  }  }
216    
217  summary.Corpus <- function(object, ...) {  length.PCorpus <- length.VCorpus <-
218      print(object)  function(x)
219      if (length(DMetaData(object)) > 0) {      length(x$content)
220          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  
221                               "\nThe metadata consists of %d tag-value pair and a data frame\n",  names.PCorpus <- names.VCorpus <-
222                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  function(x)
223                      length(CMetaData(object)$MetaData)))      as.character(meta(x, "id", "local"))
         cat("Available tags are:\n")  
         cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  
         cat("Available variables in the data frame are:\n")  
         cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")  
     }  
 }  
224    
225  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
226  inspect.PCorpus <- function(x) {  function(x, value)
227      summary(x)  {
228      cat("\n")      meta(x, "id", "local") <- as.character(value)
229      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
230  }  }
231    
232  lapply.PCorpus <- function(X, FUN, ...) {  print.PCorpus <- print.VCorpus <-
233      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  function(x, ...)
234      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  {
235  }      writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
236  lapply.VCorpus <- function(X, FUN, ...) {                         class(x)[1],
237      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                         length(x),
238      if (!is.null(lazyTmMap))                         length(meta(x, type = "corpus")),
239          .Call("copyCorpus", X, materialize(X))                         ncol(meta(x, type = "indexed"))))
240      base::lapply(X, FUN, ...)      invisible(x)
241  }  }
242    
243  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
244    function(x, path = ".", filenames = NULL)
245    {
246      filenames <- file.path(path,      filenames <- file.path(path,
247                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
248              sprintf("%s.txt", as.character(meta(x, "id", "local")))
249                             else filenames)                             else filenames)
250      i <- 1  
251      for (o in x) {      stopifnot(length(x) == length(filenames))
252          writeLines(as.PlainTextDocument(o), filenames[i])  
253          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
254      }  
255        invisible(x)
256  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1379

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge