SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1437, Wed Jul 13 19:23:49 2016 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  PCorpus <-
4      attr(x, "CMetaData") <- cmeta  function(x,
5      attr(x, "DMetaData") <- dmeta           readerControl = list(reader = reader(x), language = "en"),
6      attr(x, "DBControl") <- dbcontrol           dbControl = list(dbName = "", dbType = "DB1"))
7      class(x) <- c("PCorpus", "Corpus", "list")  {
8      x      stopifnot(inherits(x, "Source"))
 }  
 DBControl <- function(x) attr(x, "DBControl")  
   
 PCorpus <- function(x,  
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     if (is.function(readerControl$init))  
         readerControl$init()  
9    
10      if (is.function(readerControl$exit))      readerControl <- prepareReader(readerControl, reader(x))
         on.exit(readerControl$exit())  
11    
12      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13          stop("error in creating database")          stop("error in creating database")
14      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16      # Allocate memory in advance if length is known      x <- open(x)
17      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
18      counter <- 1      counter <- 1
19      while (!eoi(x)) {      while (!eoi(x)) {
20          x <- stepNext(x)          x <- stepNext(x)
21          elem <- getElem(x)          elem <- getElem(x)
22          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
23          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
24          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
25          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
26            tdl[[counter]] <- meta(doc, "id")
27          counter <- counter + 1          counter <- counter + 1
28      }      }
29      names(tdl) <- x$Names      x <- close(x)
   
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
30    
31      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      p <- list(content = tdl,
32                  meta = CorpusMeta(),
33                  dmeta = data.frame(row.names = seq_along(tdl)),
34                  dbcontrol = dbControl)
35        class(p) <- c("PCorpus", "Corpus")
36        p
37  }  }
38    
39  .VCorpus <- function(x, cmeta, dmeta) {  SimpleCorpus <-
40      attr(x, "CMetaData") <- cmeta  function(x, control = list(language = "en"))
41      attr(x, "DMetaData") <- dmeta  {
42      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
     x  
 }  
43    
44  # Register S3 corpus classes to be recognized by S4 methods. This is      if (!is.null(control$reader))
45  # mainly a fix to be compatible with packages which were originally          warning("custom reader is ignored")
46  # developed to cooperate with corresponding S4 tm classes. Necessary  
47  # since tm's class architecture was changed to S3 since tm version 0.5.      content <- if (inherits(x, "VectorSource"))
48  setOldClass(c("VCorpus", "Corpus", "list"))          x$content
49        else if (inherits(x, "DirSource")) {
50  # The "..." are additional arguments for the FunctionGenerator reader          setNames(as.character(
51  VCorpus <- Corpus <- function(x,                     lapply(x$filelist,
52                                readerControl = list(reader = x$DefaultReader, language = "en"),                            function(f) paste(readContent(f, x$encoding, "text"),
53                                ...) {                                              collapse = "\n"))
54      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)                     ),
55                     basename(x$filelist))
56      if (is.function(readerControl$init))      } else
57          readerControl$init()          stop("unsupported source type")
58        s <- list(content = content,
59      if (is.function(readerControl$exit))                meta = CorpusMeta(language = control$language),
60          on.exit(readerControl$exit())                dmeta = data.frame(row.names = seq_along(x)))
61        class(s) <- c("SimpleCorpus", "Corpus")
62      # Allocate memory in advance if length is known      s
63      tdl <- if (x$Length > 0)  }
64          vector("list", as.integer(x$Length))  
65      else  Corpus <-
66          list()  VCorpus <-
67    function(x, readerControl = list(reader = reader(x), language = "en"))
68    {
69        stopifnot(inherits(x, "Source"))
70    
71        readerControl <- prepareReader(readerControl, reader(x))
72    
73      if (x$Vectorized)      x <- open(x)
74          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      tdl <- vector("list", length(x))
75        # Check for parallel element access
76        if (is.function(getS3method("pGetElem", class(x), TRUE)))
77            tdl <- mapply(function(elem, id)
78                              readerControl$reader(elem, readerControl$language, id),
79                        pGetElem(x),                        pGetElem(x),
80                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
81                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
82      else {      else {
83          counter <- 1          counter <- 1
84          while (!eoi(x)) {          while (!eoi(x)) {
85              x <- stepNext(x)              x <- stepNext(x)
86              elem <- getElem(x)              elem <- getElem(x)
87              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
88              if (x$Length > 0)                                          readerControl$language,
89                                            as.character(counter))
90                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
91              counter <- counter + 1              counter <- counter + 1
92          }          }
93      }      }
94      names(tdl) <- x$Names      x <- close(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
95    
96  `[.PCorpus` <- function(x, i) {      as.VCorpus(tdl)
     if (missing(i)) return(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
97  }  }
98    
99  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
100      if (missing(i)) return(x)  `[.SimpleCorpus` <-
101      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  function(x, i)
102    {
103        if (!missing(i)) {
104            x$content <- x$content[i]
105            x$dmeta <- x$dmeta[i, , drop = FALSE]
106  }  }
107        x
108  `[<-.PCorpus` <- function(x, i, value) {  }
109      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  `[.VCorpus` <-
110      counter <- 1  function(x, i)
111      for (id in unclass(x)[i]) {  {
112          if (identical(length(value), 1L)) db[[id]] <- value      if (!missing(i)) {
113          else db[[id]] <- value[[counter]]          x$content <- x$content[i]
114          counter <- counter + 1          x$dmeta <- x$dmeta[i, , drop = FALSE]
115            if (!is.null(x$lazy))
116                x$lazy$index <- x$lazy$index[i]
117      }      }
118      x      x
119  }  }
120    
121  .map_name_index <- function(x, i) {  .map_name_index <-
122      if (is.character(i)) {  function(x, i)
123          if (is.null(names(x)))  {
124              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
125            match(i, meta(x, "id", "local"))
126          else          else
             match(i, names(x))  
     }  
127      i      i
128  }  }
129    
130  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
131    function(x, i)
132    {
133      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
134      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
135      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
136  }  }
137  `[[.VCorpus` <-  function(x, i) {  `[[.SimpleCorpus` <-
138    function(x, i)
139    {
140      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
141      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      n <- names(x$content)
142      if (!is.null(lazyTmMap))      PlainTextDocument(x$content[[i]],
143          .Call("copyCorpus", x, materialize(x, i))                        id = if (is.null(n)) i else n[i],
144      NextMethod("[[")                        language = meta(x, "language"))
145    }
146    `[[.VCorpus` <-
147    function(x, i)
148    {
149        i <- .map_name_index(x, i)
150        if (!is.null(x$lazy))
151            .Call(copyCorpus, x, materialize(x, i))
152        x$content[[i]]
153  }  }
154    
155  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
156    function(x, i, value)
157    {
158      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
159      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
160      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
161      x      x
162  }  }
163  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
164    function(x, i, value)
165    {
166      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
167      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
168      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
169      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
170          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
171          x          x
172      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
173    
174      # Update the DMetaData data frames for the right tree  as.list.PCorpus <- as.list.VCorpus <-
175      for (i in 1:ncol(update.struct$right.mapping)) {  function(x, ...)
176          map <- update.struct$right.mapping[,i]      setNames(content(x), as.character(lapply(content(x), meta, "id")))
177          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
178    as.list.SimpleCorpus <-
179    function(x, ...)
180        as.list(content(x))
181    
182    as.VCorpus <-
183    function(x)
184        UseMethod("as.VCorpus")
185    as.VCorpus.VCorpus <- identity
186    as.VCorpus.list <-
187    function(x)
188    {
189        v <- list(content = x,
190                  meta = CorpusMeta(),
191                  dmeta = data.frame(row.names = seq_along(x)))
192        class(v) <- c("VCorpus", "Corpus")
193        v
194      }      }
195    
196      # Merge the DMetaData data frames  outer_union <-
197      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
198      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
199      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
200      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
201      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
202      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
203      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
204        if (ncol(res) == 0L)
205      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
206        res
207  }  }
208    
209  c.Corpus <-  c.VCorpus <-
210  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
211  {  {
212      args <- list(...)      args <- list(...)
213        x <- args[[1L]]
214    
215      if (identical(length(args), 0L))      if (length(args) == 1L)
216          return(x)          return(x)
217    
218      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
219          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
220    
221      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
222          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
223                    lapply(args, function(a) meta(a, type = "corpus")))),
224      l <- base::c(list(x), args)                dmeta = Reduce(outer_union, lapply(args, meta)))
225      if (recursive)      class(v) <- c("VCorpus", "Corpus")
226          Reduce(c2, l)      v
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
227  }  }
228    
229  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
230      args <- list(...)  function(x)
231    {
232      if (identical(length(args), 0L))      if (!is.null(x$lazy))
233          return(x)          .Call(copyCorpus, x, materialize(x))
234        x$content
235    }
236    
237      if (!all(unlist(lapply(args, inherits, class(x)))))  content.SimpleCorpus <-
238          stop("not all arguments are text documents")  function(x)
239        x$content
240    
241      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
242      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
243    {
244        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
245        filehash::dbMultiFetch(db, unlist(x$content))
246  }  }
247    
248  print.Corpus <- function(x, ...) {  inspect <-
249      cat(sprintf(ngettext(length(x),  function(x)
250                           "A corpus with %d text document\n",      UseMethod("inspect", x)
251                           "A corpus with %d text documents\n"),  inspect.PCorpus <-
252                  length(x)))  inspect.SimpleCorpus <-
253    inspect.VCorpus <-
254    function(x)
255    {
256        print(x)
257        cat("\n")
258        print(noquote(content(x)))
259      invisible(x)      invisible(x)
260  }  }
261    
262  summary.Corpus <- function(object, ...) {  length.PCorpus <-
263      print(object)  length.SimpleCorpus <-
264      if (length(DMetaData(object)) > 0) {  length.VCorpus <-
265          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  function(x)
266                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      length(x$content)
267                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
268                      length(CMetaData(object)$MetaData)))  names.PCorpus <-
269          cat("Available tags are:\n")  names.SimpleCorpus <-
270          cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  names.VCorpus <-
271          cat("Available variables in the data frame are:\n")  function(x)
272          cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")      as.character(meta(x, "id", "local"))
     }  
 }  
273    
274  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
275  inspect.PCorpus <- function(x) {  function(x, value)
276      summary(x)  {
277      cat("\n")      meta(x, "id", "local") <- as.character(value)
278      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
279  }  }
280    
281  lapply.PCorpus <- function(X, FUN, ...) {  format.PCorpus <-
282      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  format.SimpleCorpus <-
283      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  format.VCorpus <-
284  }  function(x, ...)
285  lapply.VCorpus <- function(X, FUN, ...) {  {
286      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      c(sprintf("<<%s>>", class(x)[1L]),
287      if (!is.null(lazyTmMap))        sprintf("Metadata:  corpus specific: %d, document level (indexed): %d",
288          .Call("copyCorpus", X, materialize(X))                length(meta(x, type = "corpus")),
289      base::lapply(X, FUN, ...)                ncol(meta(x, type = "indexed"))),
290          sprintf("Content:  documents: %d", length(x)))
291  }  }
292    
293  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
294    function(x, path = ".", filenames = NULL)
295    {
296      filenames <- file.path(path,      filenames <- file.path(path,
297                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
298              sprintf("%s.txt", as.character(meta(x, "id", "local")))
299                             else filenames)                             else filenames)
300      i <- 1  
301      for (o in x) {      stopifnot(length(x) == length(filenames))
302          writeLines(as.PlainTextDocument(o), filenames[i])  
303          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
304      }  
305        invisible(x)
306  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1437

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge