SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1114, Fri Nov 26 14:05:54 2010 UTC revision 1460, Mon Jan 9 17:01:04 2017 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {  Corpus <-
4      attr(x, "CMetaData") <- cmeta  function(x, readerControl = list(reader = reader(x), language = "en"))
5      attr(x, "DMetaData") <- dmeta  {
6      attr(x, "DBControl") <- dbcontrol      stopifnot(inherits(x, "Source"))
     class(x) <- c("PCorpus", "Corpus", "list")  
     x  
 }  
 DBControl <- function(x) attr(x, "DBControl")  
7    
8  PCorpus <- function(x,      readerControl <- prepareReader(readerControl, reader(x))
9                      readerControl = list(reader = x$DefaultReader, language = "en"),  
10                      dbControl = list(dbName = "", dbType = "DB1"),      if ((inherits(x, "DirSource") || inherits(x, "VectorSource")) &&
11                      ...) {          identical(readerControl$reader, readPlain))
12      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)          SimpleCorpus(x, readerControl)
13        else
14            VCorpus(x, readerControl)
15    }
16    
17      if (is.function(readerControl$init))  PCorpus <-
18          readerControl$init()  function(x,
19             readerControl = list(reader = reader(x), language = "en"),
20             dbControl = list(dbName = "", dbType = "DB1"))
21    {
22        stopifnot(inherits(x, "Source"))
23    
24      if (is.function(readerControl$exit))      readerControl <- prepareReader(readerControl, reader(x))
         on.exit(readerControl$exit())  
25    
26      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
27          stop("error in creating database")          stop("error in creating database")
28      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
29    
30      # Allocate memory in advance if length is known      x <- open(x)
31      tdl <- if (x$Length > 0)      tdl <- vector("list", length(x))
         vector("list", as.integer(x$Length))  
     else  
         list()  
   
32      counter <- 1      counter <- 1
33      while (!eoi(x)) {      while (!eoi(x)) {
34          x <- stepNext(x)          x <- stepNext(x)
35          elem <- getElem(x)          elem <- getElem(x)
36          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])          doc <- readerControl$reader(elem,
37          filehash::dbInsert(db, ID(doc), doc)                                      readerControl$language,
38          if (x$Length > 0) tdl[[counter]] <- ID(doc)                                      as.character(counter))
39          else tdl <- c(tdl, ID(doc))          filehash::dbInsert(db, meta(doc, "id"), doc)
40            tdl[[counter]] <- meta(doc, "id")
41          counter <- counter + 1          counter <- counter + 1
42      }      }
43      names(tdl) <- x$Names      x <- close(x)
   
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     filehash::dbInsert(db, "DMetaData", df)  
     dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))  
44    
45      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)      p <- list(content = tdl,
46                  meta = CorpusMeta(),
47                  dmeta = data.frame(row.names = seq_along(tdl)),
48                  dbcontrol = dbControl)
49        class(p) <- c("PCorpus", "Corpus")
50        p
51  }  }
52    
53  .VCorpus <- function(x, cmeta, dmeta) {  SimpleCorpus <-
54      attr(x, "CMetaData") <- cmeta  function(x, control = list(language = "en"))
55      attr(x, "DMetaData") <- dmeta  {
56      class(x) <- c("VCorpus", "Corpus", "list")      stopifnot(inherits(x, "Source"))
57      x  
58        if (!is.null(control$reader) && !identical(control$reader, readPlain))
59            warning("custom reader is ignored")
60    
61        content <- if (inherits(x, "VectorSource")) {
62            if (is.character(x$content)) x$content else as.character(x$content)
63        } else if (inherits(x, "DirSource")) {
64            setNames(as.character(
65                       lapply(x$filelist,
66                              function(f) paste(readContent(f, x$encoding, "text"),
67                                                collapse = "\n"))
68                       ),
69                     basename(x$filelist))
70        } else
71            stop("unsupported source type")
72        s <- list(content = content,
73                  meta = CorpusMeta(language = control$language),
74                  dmeta = data.frame(row.names = seq_along(x)))
75        class(s) <- c("SimpleCorpus", "Corpus")
76        s
77  }  }
78    
79  # Register S3 corpus classes to be recognized by S4 methods. This is  VCorpus <-
80  # mainly a fix to be compatible with packages which were originally  function(x, readerControl = list(reader = reader(x), language = "en"))
81  # developed to cooperate with corresponding S4 tm classes. Necessary  {
82  # since tm's class architecture was changed to S3 since tm version 0.5.      stopifnot(inherits(x, "Source"))
83  setOldClass(c("VCorpus", "Corpus", "list"))  
84        readerControl <- prepareReader(readerControl, reader(x))
 # The "..." are additional arguments for the FunctionGenerator reader  
 VCorpus <- Corpus <- function(x,  
                               readerControl = list(reader = x$DefaultReader, language = "en"),  
                               ...) {  
     readerControl <- prepareReader(readerControl, x$DefaultReader, ...)  
   
     if (is.function(readerControl$init))  
         readerControl$init()  
   
     if (is.function(readerControl$exit))  
         on.exit(readerControl$exit())  
   
     # Allocate memory in advance if length is known  
     tdl <- if (x$Length > 0)  
         vector("list", as.integer(x$Length))  
     else  
         list()  
85    
86      if (x$Vectorized)      x <- open(x)
87          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),      tdl <- vector("list", length(x))
88        # Check for parallel element access
89        if (is.function(getS3method("pGetElem", class(x), TRUE)))
90            tdl <- mapply(function(elem, id)
91                            readerControl$reader(elem, readerControl$language, id),
92                        pGetElem(x),                        pGetElem(x),
93                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = as.character(seq_along(x)),
94                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
95      else {      else {
96          counter <- 1          counter <- 1
97          while (!eoi(x)) {          while (!eoi(x)) {
98              x <- stepNext(x)              x <- stepNext(x)
99              elem <- getElem(x)              elem <- getElem(x)
100              doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])              doc <- readerControl$reader(elem,
101              if (x$Length > 0)                                          readerControl$language,
102                                            as.character(counter))
103                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
             else  
                 tdl <- c(tdl, list(doc))  
104              counter <- counter + 1              counter <- counter + 1
105          }          }
106      }      }
107      names(tdl) <- x$Names      x <- close(x)
     df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)  
     .VCorpus(tdl, .MetaDataNode(), df)  
 }  
108    
109  `[.PCorpus` <- function(x, i) {      as.VCorpus(tdl)
     if (missing(i)) return(x)  
     index <- attr(x, "DMetaData")[[1 , "subset"]]  
     attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i  
     dmeta <- attr(x, "DMetaData")  
     .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))  
110  }  }
111    
112  `[.VCorpus` <- function(x, i) {  `[.PCorpus` <-
113      if (missing(i)) return(x)  `[.SimpleCorpus` <-
114      .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])  function(x, i)
115    {
116        if (!missing(i)) {
117            x$content <- x$content[i]
118            x$dmeta <- x$dmeta[i, , drop = FALSE]
119  }  }
120        x
121  `[<-.PCorpus` <- function(x, i, value) {  }
122      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])  `[.VCorpus` <-
123      counter <- 1  function(x, i)
124      for (id in unclass(x)[i]) {  {
125          if (identical(length(value), 1L)) db[[id]] <- value      if (!missing(i)) {
126          else db[[id]] <- value[[counter]]          x$content <- x$content[i]
127          counter <- counter + 1          x$dmeta <- x$dmeta[i, , drop = FALSE]
128            if (!is.null(x$lazy))
129                x$lazy$index <- x$lazy$index[i]
130      }      }
131      x      x
132  }  }
133    
134  .map_name_index <- function(x, i) {  .map_name_index <-
135      if (is.character(i)) {  function(x, i)
136          if (is.null(names(x)))  {
137              match(i, meta(x, "ID", type = "local"))      if (is.character(i))
138            match(i, meta(x, "id", "local"))
139          else          else
             match(i, names(x))  
     }  
140      i      i
141  }  }
142    
143  `[[.PCorpus` <-  function(x, i) {  `[[.PCorpus` <-
144    function(x, i)
145    {
146      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
147      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
148      filehash::dbFetch(db, NextMethod("[["))      filehash::dbFetch(db, x$content[[i]])
149  }  }
150  `[[.VCorpus` <-  function(x, i) {  `[[.SimpleCorpus` <-
151    function(x, i)
152    {
153      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
154      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      n <- names(x$content)
155      if (!is.null(lazyTmMap))      PlainTextDocument(x$content[[i]],
156          .Call("copyCorpus", x, materialize(x, i))                        id = if (is.null(n)) i else n[i],
157      NextMethod("[[")                        language = meta(x, "language"))
158    }
159    `[[.VCorpus` <-
160    function(x, i)
161    {
162        i <- .map_name_index(x, i)
163        if (!is.null(x$lazy))
164            .Call(copyCorpus, x, materialize(x, i))
165        x$content[[i]]
166  }  }
167    
168  `[[<-.PCorpus` <-  function(x, i, value) {  `[[<-.PCorpus` <-
169    function(x, i, value)
170    {
171      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
172      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
173      index <- unclass(x)[[i]]      db[[x$content[[i]]]] <- value
     db[[index]] <- value  
174      x      x
175  }  }
176  `[[<-.VCorpus` <-  function(x, i, value) {  `[[<-.VCorpus` <-
177    function(x, i, value)
178    {
179      i <- .map_name_index(x, i)      i <- .map_name_index(x, i)
180      # Mark new objects as not active for lazy mapping      # Mark new objects as inactive for lazy mapping
181      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")      if (!is.null(x$lazy))
182      if (!is.null(lazyTmMap)) {          x$lazy$index[i] <- FALSE
183          lazyTmMap$index[i] <- FALSE      x$content[[i]] <- value
         meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap  
     }  
     # Set the value  
     cl <- class(x)  
     y <- NextMethod("[[<-")  
     class(y) <- cl  
     y  
 }  
   
 # Update NodeIDs of a CMetaData tree  
 .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  
     # Traversal of (binary) CMetaData tree with setup of NodeIDs  
     set_id <- function(x) {  
         x$NodeID <- id  
         id <<- id + 1  
         level <<- level + 1  
         if (length(x$Children) > 0) {  
             mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))  
             left <- set_id(x$Children[[1]])  
             if (level == 1) {  
                 left.mapping <<- mapping  
                 mapping <<- NULL  
             }  
             mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))  
             right <- set_id(x$Children[[2]])  
   
             x$Children <- list(left, right)  
         }  
         level <<- level - 1  
184          x          x
185      }      }
     list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)  
 }  
186    
187  # Find indices to be updated for a CMetaData tree  as.list.PCorpus <- as.list.VCorpus <-
188  .find_indices <- function(x) {  function(x, ...)
189      indices.mapping <- NULL      setNames(content(x), as.character(lapply(content(x), meta, "id")))
190      for (m in levels(as.factor(DMetaData(x)$MetaID))) {  
191          indices <- (DMetaData(x)$MetaID == m)  as.list.SimpleCorpus <-
192          indices.mapping <- c(indices.mapping, list(m = indices))  function(x, ...)
193          names(indices.mapping)[length(indices.mapping)] <- m      as.list(content(x))
194      }  
195      indices.mapping  as.VCorpus <-
196  }  function(x)
197        UseMethod("as.VCorpus")
198  c2 <- function(x, y, ...) {  as.VCorpus.VCorpus <- identity
199      # Update the CMetaData tree  as.VCorpus.list <-
200      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))  function(x)
201      update.struct <- .update_id(cmeta)  {
202        v <- list(content = x,
203      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)                meta = CorpusMeta(),
204                  dmeta = data.frame(row.names = seq_along(x)))
205      # Find indices to be updated for the left tree      class(v) <- c("VCorpus", "Corpus")
206      indices.mapping <- .find_indices(x)      v
   
     # Update the DMetaData data frames for the left tree  
     for (i in 1:ncol(update.struct$left.mapping)) {  
         map <- update.struct$left.mapping[,i]  
         DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
     }  
   
     # Find indices to be updated for the right tree  
     indices.mapping <- .find_indices(y)  
   
     # Update the DMetaData data frames for the right tree  
     for (i in 1:ncol(update.struct$right.mapping)) {  
         map <- update.struct$right.mapping[,i]  
         DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])  
207      }      }
208    
209      # Merge the DMetaData data frames  outer_union <-
210      labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))  function(x, y, ...)
211      na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))  {
212      x.dmeta.aug <- cbind(DMetaData(x), na.matrix)      if (nrow(x) > 0L)
213      labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))          x[, setdiff(names(y), names(x))] <- NA
214      na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))      if (nrow(y) > 0L)
215      y.dmeta.aug <- cbind(DMetaData(y), na.matrix)          y[, setdiff(names(x), names(y))] <- NA
216      DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)      res <- rbind(x, y)
217        if (ncol(res) == 0L)
218      new          res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
219        res
220  }  }
221    
222  c.Corpus <-  c.VCorpus <-
223  function(x, ..., recursive = FALSE)  function(..., recursive = FALSE)
224  {  {
225      args <- list(...)      args <- list(...)
226        x <- args[[1L]]
227    
228      if (identical(length(args), 0L))      if (length(args) == 1L)
229          return(x)          return(x)
230    
231      if (!all(unlist(lapply(args, inherits, class(x)))))      if (!all(unlist(lapply(args, inherits, class(x)))))
232          stop("not all arguments are of the same corpus type")          stop("not all arguments are of the same corpus type")
233    
234      if (inherits(x, "PCorpus"))      v <- list(content = do.call("c", lapply(args, content)),
235          stop("concatenation of corpora with underlying databases is not supported")                meta = CorpusMeta(meta = do.call("c",
236                    lapply(args, function(a) meta(a, type = "corpus")))),
237      l <- base::c(list(x), args)                dmeta = Reduce(outer_union, lapply(args, meta)))
238      if (recursive)      class(v) <- c("VCorpus", "Corpus")
239          Reduce(c2, l)      v
     else {  
         l <- do.call("c", lapply(l, unclass))  
         .VCorpus(l,  
                  cmeta = .MetaDataNode(),  
                  dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))  
     }  
240  }  }
241    
242  c.TextDocument <- function(x, ..., recursive = FALSE) {  content.VCorpus <-
243      args <- list(...)  function(x)
244    {
245        if (!is.null(x$lazy))
246            .Call(copyCorpus, x, materialize(x))
247        x$content
248    }
249    
250      if (identical(length(args), 0L))  content.SimpleCorpus <-
251          return(x)  function(x)
252        x$content
253    
254      if (!all(unlist(lapply(args, inherits, class(x)))))  content.PCorpus <-
255          stop("not all arguments are text documents")  function(x)
256    {
257      dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
258      .VCorpus(list(x, ...), .MetaDataNode(), dmeta)      filehash::dbMultiFetch(db, unlist(x$content))
259  }  }
260    
261  print.Corpus <- function(x, ...) {  inspect <-
262      cat(sprintf(ngettext(length(x),  function(x)
263                           "A corpus with %d text document\n",      UseMethod("inspect", x)
264                           "A corpus with %d text documents\n"),  inspect.PCorpus <-
265                  length(x)))  inspect.SimpleCorpus <-
266    inspect.VCorpus <-
267    function(x)
268    {
269        print(x)
270        cat("\n")
271        print(noquote(content(x)))
272      invisible(x)      invisible(x)
273  }  }
274    
275  summary.Corpus <- function(object, ...) {  length.PCorpus <-
276      print(object)  length.SimpleCorpus <-
277      if (length(DMetaData(object)) > 0) {  length.VCorpus <-
278          cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),  function(x)
279                               "\nThe metadata consists of %d tag-value pair and a data frame\n",      length(x$content)
280                               "\nThe metadata consists of %d tag-value pairs and a data frame\n"),  
281                      length(CMetaData(object)$MetaData)))  names.PCorpus <-
282          cat("Available tags are:\n")  names.SimpleCorpus <-
283          cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")  names.VCorpus <-
284          cat("Available variables in the data frame are:\n")  function(x)
285          cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")      as.character(meta(x, "id", "local"))
     }  
 }  
286    
287  inspect <- function(x) UseMethod("inspect", x)  `names<-.PCorpus` <- `names<-.VCorpus` <-
288  inspect.PCorpus <- function(x) {  function(x, value)
289      summary(x)  {
290      cat("\n")      meta(x, "id", "local") <- as.character(value)
291      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      x
     show(filehash::dbMultiFetch(db, unlist(x)))  
 }  
 inspect.VCorpus <- function(x) {  
     summary(x)  
     cat("\n")  
     print(noquote(lapply(x, identity)))  
292  }  }
293    
294  lapply.PCorpus <- function(X, FUN, ...) {  format.PCorpus <-
295      db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])  format.SimpleCorpus <-
296      lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)  format.VCorpus <-
297  }  function(x, ...)
298  lapply.VCorpus <- function(X, FUN, ...) {  {
299      lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")      c(sprintf("<<%s>>", class(x)[1L]),
300      if (!is.null(lazyTmMap))        sprintf("Metadata:  corpus specific: %d, document level (indexed): %d",
301          .Call("copyCorpus", X, materialize(X))                length(meta(x, type = "corpus")),
302      base::lapply(X, FUN, ...)                ncol(meta(x, type = "indexed"))),
303          sprintf("Content:  documents: %d", length(x)))
304  }  }
305    
306  writeCorpus <-  function(x, path = ".", filenames = NULL) {  writeCorpus <-
307    function(x, path = ".", filenames = NULL)
308    {
309      filenames <- file.path(path,      filenames <- file.path(path,
310                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))        if (is.null(filenames))
311              sprintf("%s.txt", as.character(meta(x, "id", "local")))
312                             else filenames)                             else filenames)
313      i <- 1  
314      for (o in x) {      stopifnot(length(x) == length(filenames))
315          writeLines(as.PlainTextDocument(o), filenames[i])  
316          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
317      }  
318        invisible(x)
319  }  }

Legend:
Removed from v.1114  
changed lines
  Added in v.1460

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge