SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1481, Sat May 20 10:28:00 2017 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  Corpus <-
4      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
5      attr(x, "DMetaData") <- dmeta  {
6      attr(x, "DBControl") <- dbcontrol      stopifnot(inherits(x, "Source"))
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
 DBControl <- function(x) attr(x, "DBControl")  
7    
8  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
                     readerControl = list(reader = x$DefaultReader, language = "en"),  
                     dbControl = list(dbName = "", dbType = "DB1"),  
                     ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
9    
10      if (is.function(readerControl$init))      if ( (inherits(x, "DataframeSource") || inherits(x, "DirSource") ||
11          readerControl$init()            inherits(x, "VectorSource") ) &&
12            identical(readerControl$reader, reader(x)))
13            SimpleCorpus(x, readerControl)
14        else
15            VCorpus(x, readerControl)
16    }
17    
18      if (is.function(readerControl$exit))  PCorpus <-
19          on.exit(readerControl$exit())  function(x,
20             readerControl = list(reader = reader(x), language = "en"),
21             dbControl = list(dbName = "", dbType = "DB1"))
22    {
23        stopifnot(inherits(x, "Source"))
24    
25        readerControl <- prepareReader(readerControl, reader(x))
26    
27      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
28          stop("error in creating database")          stop("error in creating database")
29      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
30    
31      # Allocate memory in advance if length is known      x <- open(x)
32      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
33      counter <- 1      counter <- 1
34      while (!eoi(x)) {      while (!eoi(x)) {
35          x <- stepNext(x)          x <- stepNext(x)
36          elem <- getElem(x)          elem <- getElem(x)
37          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
38          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
39          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
40          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
41            tdl[[counter]] <- meta(doc, "id")
42          counter <- counter + 1          counter <- counter + 1
43      }      }
44      names(tdl) <- x$Names      x <- close(x)
45    
46      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      cmeta <- CorpusMeta()
47      filehash::dbInsert(db, "DMetaData", df)      dmeta <- data.frame(row.names = seq_along(tdl))
48      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))      # Check if metadata retrieval is supported
49        if (is.function(getS3method("getMeta", class(x), TRUE))) {
50      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)          m <- getMeta(x)
51            if (!is.null(m$cmeta)) cmeta <- m$cmeta
52            if (!is.null(m$dmeta)) dmeta <- m$dmeta
53  }  }
54    
55  .VCorpus <- function(x, cmeta, dmeta) {      p <- list(content = tdl, meta = cmeta, dmeta = dmeta, dbcontrol = dbControl)
56      attr(x, "CMetaData") <- cmeta      class(p) <- c("PCorpus", "Corpus")
57      attr(x, "DMetaData") <- dmeta      p
     class(x) <- c("VCorpus", "Corpus", "list")  
     x  
58  }  }
59    
60  # Register S3 corpus classes to be recognized by S4 methods. This is  SimpleCorpus <-
61  # mainly a fix to be compatible with packages which were originally  function(x, control = list(language = "en"))
62  # developed to cooperate with corresponding S4 tm classes. Necessary  {
63  # since tm's class architecture was changed to S3 since tm version 0.5.      stopifnot(inherits(x, "Source"))
64  setOldClass(c("VCorpus", "Corpus", "list"))  
65        if (!is.null(control$reader) && !identical(control$reader, reader(x)))
66  # The "..." are additional arguments for the FunctionGenerator reader          warning("custom reader is ignored")
67  VCorpus <- Corpus <- function(x,  
68                                readerControl = list(reader = x$DefaultReader, language = "en"),      content <- if (inherits(x, "VectorSource")) {
69                                ...) {          if (is.character(x$content)) x$content else as.character(x$content)
70      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)      } else if (inherits(x, "DirSource")) {
71            setNames(as.character(
72      if (is.function(readerControl$init))                     lapply(x$filelist,
73          readerControl$init()                            function(f) paste(readContent(f, x$encoding, "text"),
74                                                collapse = "\n"))
75      if (is.function(readerControl$exit))                     ),
76          on.exit(readerControl$exit())                   basename(x$filelist))
77        } else if (inherits(x, "DataframeSource")) {
78      # Allocate memory in advance if length is known          setNames(as.character(x$content[, "text"]), x$content[, "doc_id"])
79      tdl <- if (x$Length > 0)      } else
80          vector("list", as.integer(x$Length))          stop("unsupported source type")
81    
82        dmeta <- if (inherits(x, "DataframeSource"))
83            x$content[, !names(x$content) %in% c("doc_id", "text")]
84      else      else
85          list()          data.frame(row.names = seq_along(x))
86    
87        s <- list(content = content,
88                  meta = CorpusMeta(language = control$language),
89                  dmeta = dmeta)
90        class(s) <- c("SimpleCorpus", "Corpus")
91        s
92    }
93    
94    VCorpus <-
95    function(x, readerControl = list(reader = reader(x), language = "en"))
96    {
97        stopifnot(inherits(x, "Source"))
98    
99      if (x$Vectorized)      readerControl <- prepareReader(readerControl, reader(x))
100          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),  
101        x <- open(x)
102        tdl <- vector("list", length(x))
103        # Check for parallel element access
104        if (is.function(getS3method("pGetElem", class(x), TRUE)))
105            tdl <- mapply(function(elem, id)
106                            readerControl$reader(elem, readerControl$language, id),
107                        pGetElem(x),                        pGetElem(x),
108                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
109                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
110      else {      else {
111          counter <- 1          counter <- 1
112          while (!eoi(x)) {          while (!eoi(x)) {
113              x <- stepNext(x)              x <- stepNext(x)
114              elem <- getElem(x)              elem <- getElem(x)
115              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
116              if (x$Length > 0)                                          readerControl$language,
117                                            as.character(counter))
118                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
119              counter <- counter + 1              counter <- counter + 1
120          }          }
121      }      }
122      names(tdl) <- x$Names      x <- close(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
123    
124  `[.PCorpus` <- function(x, i) {      cmeta <- CorpusMeta()
125      if (missing(i)) return(x)      dmeta <- data.frame(row.names = seq_along(tdl))
126      index <- attr(x, "DMetaData")[[1 , "subset"]]      # Check if metadata retrieval is supported
127      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i      if (is.function(getS3method("getMeta", class(x), TRUE))) {
128      dmeta <- attr(x, "DMetaData")          m <- getMeta(x)
129      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))          if (!is.null(m$cmeta)) cmeta <- m$cmeta
130            if (!is.null(m$dmeta)) dmeta <- m$dmeta
131  }  }
132    
133  `[.VCorpus` <- function(x, i) {      v <- as.VCorpus(tdl)
134      if (missing(i)) return(x)      v$meta <- cmeta
135      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])      v$dmeta <- dmeta
136    
137        v
138  }  }
139    
140  `[<-.PCorpus` <- function(x, i, value) {  `[.PCorpus` <-
141      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  `[.SimpleCorpus` <-
142      counter <- 1  function(x, i)
143      for (id in unclass(x)[i]) {  {
144          if (identical(length(value), 1L)) db[[id]] <- value      if (!missing(i)) {
145          else db[[id]] <- value[[counter]]          x$content <- x$content[i]
146          counter <- counter + 1          x$dmeta <- x$dmeta[i, , drop = FALSE]
147        }
148        x
149    }
150    `[.VCorpus` <-
151    function(x, i)
152    {
153        if (!missing(i)) {
154            x$content <- x$content[i]
155            x$dmeta <- x$dmeta[i, , drop = FALSE]
156            if (!is.null(x$lazy))
157                x$lazy$index <- x$lazy$index[i]
158      }      }
159      x      x
160  }  }
161    
162  .map_name_index <- function(x, i) {  .map_name_index <-
163      if (is.character(i)) {  function(x, i)
164          if (is.null(names(x)))  {
165              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
166            match(i, meta(x, "id", "local"))
167          else          else
             match(i, names(x))  
     }  
168      i      i
169  }  }
170    
171  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
172    function(x, i)
173    {
174        i <- .map_name_index(x, i)
175        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
176        filehash::dbFetch(db, x$content[[i]])
177    }
178    `[[.SimpleCorpus` <-
179    function(x, i)
180    {
181      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
182      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      n <- names(x$content)
183      filehash::dbFetch(db, NextMethod("[["))      PlainTextDocument(x$content[[i]],
184                          id = if (is.null(n)) i else n[i],
185                          language = meta(x, "language"))
186  }  }
187  `[[.VCorpus` <-  function(x, i) {  `[[.VCorpus` <-
188    function(x, i)
189    {
190      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
191      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
192      if (!is.null(lazyTmMap))          .Call(tm_copyCorpus, x, materialize(x, i))
193          .Call("copyCorpus", x, materialize(x, i))      x$content[[i]]
     NextMethod("[[")  
194  }  }
195    
196  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
197    function(x, i, value)
198    {
199      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
200      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
201      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
202      x      x
203  }  }
204  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
205    function(x, i, value)
206    {
207      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
208      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
209      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
210      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
211          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
212          x          x
213      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
   
 # Find indices to be updated for a CMetaData tree  
 .find_indices <- function(x) {  
     indices.mapping <- NULL  
     for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
         indices <- (DMetaData(x)$MetaID == m)  
         indices.mapping <- c(indices.mapping, list(m = indices))  
         names(indices.mapping)[length(indices.mapping)] <- m  
     }  
     indices.mapping  
 }  
   
 c2 <- function(x, y, ...) {  
     # Update the CMetaData tree  
     cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  
     update.struct <- .update_id(cmeta)  
   
     new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)  
   
     # Find indices to be updated for the left tree  
     indices.mapping <- .find_indices(x)  
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
214    
215      # Update the DMetaData data frames for the right tree  as.list.PCorpus <- as.list.VCorpus <-
216      for (i in 1:ncol(update.struct$right.mapping)) {  function(x, ...)
217          map <- update.struct$right.mapping[,i]      setNames(content(x), as.character(lapply(content(x), meta, "id")))
218          DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
219    as.list.SimpleCorpus <-
220    function(x, ...)
221        as.list(content(x))
222    
223    as.VCorpus <-
224    function(x)
225        UseMethod("as.VCorpus")
226    as.VCorpus.VCorpus <- identity
227    as.VCorpus.list <-
228    function(x)
229    {
230        v <- list(content = x,
231                  meta = CorpusMeta(),
232                  dmeta = data.frame(row.names = seq_along(x)))
233        class(v) <- c("VCorpus", "Corpus")
234        v
235      }      }
236    
237      # Merge the DMetaData data frames  outer_union <-
238      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
239      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
240      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
241      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
242      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
243      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
244      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
245        if (ncol(res) == 0L)
246      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
247        res
248  }  }
249    
250  c.Corpus <-  c.VCorpus <-
251  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
252  {  {
253      args <- list(...)      args <- list(...)
254        x <- args[[1L]]
255    
256      if (identical(length(args), 0L))      if (length(args) == 1L)
257          return(x)          return(x)
258    
259      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
260          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
261    
262      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
263          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
264                    lapply(args, function(a) meta(a, type = "corpus")))),
265      l <- base::c(list(x), args)                dmeta = Reduce(outer_union, lapply(args, meta)))
266      if (recursive)      class(v) <- c("VCorpus", "Corpus")
267          Reduce(c2, l)      v
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
268      }      }
 }  
   
 c.TextDocument <- function(x, ..., recursive = FALSE) {  
     args <- list(...)  
269    
270      if (identical(length(args), 0L))  content.VCorpus <-
271          return(x)  function(x)
272    {
273        if (!is.null(x$lazy))
274            .Call(tm_copyCorpus, x, materialize(x))
275        x$content
276    }
277    
278      if (!all(unlist(lapply(args, inherits, class(x)))))  content.SimpleCorpus <-
279          stop("not all arguments are text documents")  function(x)
280        x$content
281    
282      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)  content.PCorpus <-
283      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)  function(x)
284    {
285        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
286        filehash::dbMultiFetch(db, unlist(x$content))
287  }  }
288    
289  print.Corpus <- function(x, ...) {  inspect <-
290      cat(sprintf(ngettext(length(x),  function(x)
291                           "A corpus with %d text document\n",      UseMethod("inspect", x)
292                           "A corpus with %d text documents\n"),  inspect.PCorpus <-
293                  length(x)))  inspect.SimpleCorpus <-
294    inspect.VCorpus <-
295    function(x)
296    {
297        print(x)
298        cat("\n")
299        print(noquote(content(x)))
300      invisible(x)      invisible(x)
301  }  }
302    
303  summary.Corpus <- function(object, ...) {  length.PCorpus <-
304      print(object)  length.SimpleCorpus <-
305      if (length(DMetaData(object)) > 0) {  length.VCorpus <-
306          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  function(x)
307                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      length(x$content)
308                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
309                      length(CMetaData(object)$MetaData)))  names.PCorpus <-
310          cat("Available tags are:\n")  names.SimpleCorpus <-
311          cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  names.VCorpus <-
312          cat("Available variables in the data frame are:\n")  function(x)
313          cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")      as.character(meta(x, "id", "local"))
     }  
 }  
314    
315  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
316  inspect.PCorpus <- function(x) {  function(x, value)
317      summary(x)  {
318      cat("\n")      meta(x, "id", "local") <- as.character(value)
319      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
320  }  }
321    
322  lapply.PCorpus <- function(X, FUN, ...) {  format.PCorpus <-
323      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  format.SimpleCorpus <-
324      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  format.VCorpus <-
325  }  function(x, ...)
326  lapply.VCorpus <- function(X, FUN, ...) {  {
327      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      c(sprintf("<<%s>>", class(x)[1L]),
328      if (!is.null(lazyTmMap))        sprintf("Metadata:  corpus specific: %d, document level (indexed): %d",
329          .Call("copyCorpus", X, materialize(X))                length(meta(x, type = "corpus")),
330      base::lapply(X, FUN, ...)                ncol(meta(x, type = "indexed"))),
331          sprintf("Content:  documents: %d", length(x)))
332  }  }
333    
334  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
335    function(x, path = ".", filenames = NULL)
336    {
337      filenames <- file.path(path,      filenames <- file.path(path,
338                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
339              sprintf("%s.txt", as.character(meta(x, "id", "local")))
340                             else filenames)                             else filenames)
341      i <- 1  
342      for (o in x) {      stopifnot(length(x) == length(filenames))
343          writeLines(as.PlainTextDocument(o), filenames[i])  
344          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
345      }  
346        invisible(x)
347  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1481

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge