SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 950, Thu May 14 15:17:18 2009 UTC revision 973, Sat Jul 4 08:10:25 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  FCorpus <- function(object, readerControl = list(language = "eng")) {  prepareReader <- function(readerControl, defaultReader = NULL, ...) {
4        if (is.null(readerControl$reader))
5            readerControl$reader <- defaultReader
6        if (is(readerControl$reader, "FunctionGenerator"))
7            readerControl$reader <- readerControl$reader(...)
8      if (is.null(readerControl$language))      if (is.null(readerControl$language))
9          readerControl$language <- "eng"          readerControl$language <- "eng"
10        readerControl
     if (!object@Vectorized)  
         stop("Source is not vectorized")  
   
     tdl <- lapply(mapply(c, pGetElem(object), id = seq_len(object@Length), SIMPLIFY = FALSE),  
                   function(x) readSlim(x[c("content", "uri")],  
                                        readerControl$language,  
                                        as.character(x$id)))  
   
     new("FCorpus", .Data = tdl)  
11  }  }
12    
13    ## Fast Corpus
14    ##   - provides a prototype implementation of a more time and memory efficient representation of a corpus
15    ##   - allows performance tests and comparisons to other corpus types
16    #FCorpus <- function(object, readerControl = list(language = "eng")) {
17    #    readerControl <- prepareReader(readerControl)
18    #
19    #    if (!object@Vectorized)
20    #        stop("Source is not vectorized")
21    #
22    #    tdl <- lapply(mapply(c, pGetElem(object), id = seq_len(object@Length), SIMPLIFY = FALSE),
23    #                  function(x) readSlim(x[c("content", "uri")],
24    #                                       readerControl$language,
25    #                                       as.character(x$id)))
26    #
27    #    new("FCorpus", .Data = tdl)
28    #}
29    
30  PCorpus <- function(object,  PCorpus <- function(object,
31                      readerControl = list(reader = object@DefaultReader, language = "eng"),                      readerControl = list(reader = object@DefaultReader, language = "eng"),
32                      dbControl = list(dbName = "", dbType = "DB1"),                      dbControl = list(dbName = "", dbType = "DB1"),
33                      ...) {                      ...) {
34      if (is.null(readerControl$reader))      readerControl <- prepareReader(readerControl, object@DefaultReader, ...)
         readerControl$reader <- object@DefaultReader  
     if (is(readerControl$reader, "FunctionGenerator"))  
         readerControl$reader <- readerControl$reader(...)  
     if (is.null(readerControl$language))  
         readerControl$language <- "eng"  
35    
36      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
37          stop("error in creating database")          stop("error in creating database")
# Line 60  Line 67 
67  }  }
68    
69  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
70  SCorpus <- Corpus <- function(object,  VCorpus <- Corpus <- function(object,
71                      readerControl = list(reader = object@DefaultReader, language = "eng"),                      readerControl = list(reader = object@DefaultReader, language = "eng"),
72                      ...) {                      ...) {
73      if (is.null(readerControl$reader))      readerControl <- prepareReader(readerControl, object@DefaultReader, ...)
         readerControl$reader <- object@DefaultReader  
     if (is(readerControl$reader, "FunctionGenerator"))  
         readerControl$reader <- readerControl$reader(...)  
     if (is.null(readerControl$language))  
         readerControl$language <- "eng"  
74    
75      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
76      tdl <- if (object@Length > 0)      tdl <- if (object@Length > 0)
# Line 101  Line 103 
103                        MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),                        MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),
104                        children = list())                        children = list())
105    
106      new("SCorpus", .Data = tdl, DMetaData = df, CMetaData = cmeta.node)      new("VCorpus", .Data = tdl, DMetaData = df, CMetaData = cmeta.node)
107  }  }
108    
109  setGeneric("tmMap", function(object, FUN, ..., lazy = FALSE) standardGeneric("tmMap"))  setGeneric("tmMap", function(object, FUN, ..., lazy = FALSE) standardGeneric("tmMap"))
110    #setMethod("tmMap",
111    #          signature(object = "FCorpus", FUN = "function"),
112    #          function(object, FUN, ..., lazy = FALSE) {
113    #              if (lazy)
114    #                  warning("lazy mapping is deactivated")
115    #
116    #              new("FCorpus", .Data = lapply(object, FUN, ..., DMetaData = data.frame()))
117    #          })
118  setMethod("tmMap",  setMethod("tmMap",
119            signature(object = "FCorpus", FUN = "function"),            signature(object = "VCorpus", FUN = "function"),
           function(object, FUN, ..., lazy = FALSE) {  
               if (lazy)  
                   warning("lazy mapping is deactivated")  
   
               lapply(object, FUN, ..., DMetaData = data.frame())  
           })  
 setMethod("tmMap",  
           signature(object = "SCorpus", FUN = "function"),  
120            function(object, FUN, ..., lazy = FALSE) {            function(object, FUN, ..., lazy = FALSE) {
121                result <- object                result <- object
122                # Lazy mapping                # Lazy mapping
# Line 141  Line 143 
143  setMethod("tmMap",  setMethod("tmMap",
144            signature(object = "PCorpus", FUN = "function"),            signature(object = "PCorpus", FUN = "function"),
145            function(object, FUN, ..., lazy = FALSE) {            function(object, FUN, ..., lazy = FALSE) {
               # TODO: When should lazy mapping be conceptually available?  
146                if (lazy)                if (lazy)
147                    warning("lazy mapping is deactived when using database backend")                    warning("lazy mapping is deactived when using database backend")
148                db <- filehash::dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]])                db <- filehash::dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]])
# Line 210  Line 211 
211            })            })
212  setMethod("asPlain", signature(object = "RCV1Document"),  setMethod("asPlain", signature(object = "RCV1Document"),
213            function(object, FUN, ...) convertRCV1Plain(object, ...))            function(object, FUN, ...) convertRCV1Plain(object, ...))
214  setMethod("asPlain",  setMethod("asPlain", signature(object = "MailDocument"),
215            signature(object = "NewsgroupDocument"),            function(object, FUN, ...) as(object, "PlainTextDocument"))
           function(object, FUN, ...) {  
               new("PlainTextDocument", .Data = Content(object), Author = Author(object),  
                   DateTimeStamp = DateTimeStamp(object), Description = Description(object), ID = ID(object),  
                   Origin = Origin(object), Heading = Heading(object), Language = Language(object),  
                   LocalMetaData = LocalMetaData(object))  
           })  
216  setMethod("asPlain",  setMethod("asPlain",
217            signature(object = "StructuredTextDocument"),            signature(object = "StructuredTextDocument"),
218            function(object, FUN, ...) {            function(object, FUN, ...) {
# Line 249  Line 244 
244                    return(FUN(object, ...))                    return(FUN(object, ...))
245            })            })
246    
 # TODO: Replace with c(Corpus, TextDocument)?  
247  setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem"))  setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem"))
248  setMethod("appendElem",  setMethod("appendElem",
249            signature(object = "Corpus", data = "TextDocument"),            signature(object = "Corpus", data = "TextDocument"),
# Line 276  Line 270 
270      df      df
271  }  }
272    
273    #setMethod("[",
274    #          signature(x = "FCorpus", i = "ANY", j = "ANY", drop = "ANY"),
275    #          function(x, i, j, ... , drop) {
276    #              if (missing(i)) return(x)
277    #
278    #              x@.Data <- x@.Data[i, ..., drop = FALSE]
279    #              x
280    #          })
281  setMethod("[",  setMethod("[",
282            signature(x = "PCorpus", i = "ANY", j = "ANY", drop = "ANY"),            signature(x = "PCorpus", i = "ANY", j = "ANY", drop = "ANY"),
283            function(x, i, j, ... , drop) {            function(x, i, j, ... , drop) {
# Line 287  Line 289 
289                else x@DMetaData[[1 , "subset"]] <- index[i]                else x@DMetaData[[1 , "subset"]] <- index[i]
290                x                x
291            })            })
   
292  setMethod("[",  setMethod("[",
293            signature(x = "SCorpus", i = "ANY", j = "ANY", drop = "ANY"),            signature(x = "VCorpus", i = "ANY", j = "ANY", drop = "ANY"),
294            function(x, i, j, ... , drop) {            function(x, i, j, ... , drop) {
295                if (missing(i)) return(x)                if (missing(i)) return(x)
296    
# Line 311  Line 312 
312                x                x
313            })            })
314  setMethod("[<-",  setMethod("[<-",
315            signature(x = "SCorpus", i = "ANY", j = "ANY", value = "ANY"),            signature(x = "VCorpus", i = "ANY", j = "ANY", value = "ANY"),
316            function(x, i, j, ... , value) {            function(x, i, j, ... , value) {
317                x@.Data[i, ...] <- value                x@.Data[i, ...] <- value
318                x                x
# Line 324  Line 325 
325                filehash::dbFetch(db, x@.Data[[i]])                filehash::dbFetch(db, x@.Data[[i]])
326            })            })
327  setMethod("[[",  setMethod("[[",
328            signature(x = "SCorpus", i = "ANY", j = "ANY"),            signature(x = "VCorpus", i = "ANY", j = "ANY"),
329            function(x, i, j, ...) {            function(x, i, j, ...) {
               # TODO: For which corpora should lazy mapping be available?  
330                lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")                lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
331                if (!is.null(lazyTmMap))                if (!is.null(lazyTmMap))
332                    .Call("copyCorpus", x, materialize(x, i))                    .Call("copyCorpus", x, materialize(x, i))
# Line 342  Line 342 
342                x                x
343            })            })
344  setMethod("[[<-",  setMethod("[[<-",
345            signature(x = "SCorpus", i = "ANY", j = "ANY", value = "ANY"),            signature(x = "VCorpus", i = "ANY", j = "ANY", value = "ANY"),
346            function(x, i, j, ..., value) {            function(x, i, j, ..., value) {
347                # Mark new objects as not active for lazy mapping                # Mark new objects as not active for lazy mapping
348                lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")                lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
# Line 381  Line 381 
381          return(object)          return(object)
382      }      }
383    
384      return(list(root = set_id(object), left.mapping = left.mapping, right.mapping = mapping))      list(root = set_id(object), left.mapping = left.mapping, right.mapping = mapping)
385  }  }
386    
 # TODO: Implement concatenation for other corpus types  
387  setMethod("c",  setMethod("c",
388            signature(x = "Corpus"),            signature(x = "Corpus"),
389            function(x, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {            function(x, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), recursive = FALSE) {
390                args <- list(...)                args <- list(...)
391                if (length(args) == 0)                if (identical(length(args), 0)) return(x)
392                    return(x)  
393                  if (!all(sapply(args, inherits, class(x))))
394                      stop("not all arguments are of the same corpus type")
395    
396                if (!all(sapply(args, inherits, "SCorpus")))                if (inherits(x, "PCorpus"))
397                    stop("not all arguments are standard corpora")                    stop("concatenation of corpora with underlying databases is not supported")
398    
399                Reduce(c2, base::c(list(x), args))                Reduce(c2, base::c(list(x), args))
400            })            })
401    
402  setGeneric("c2", function(x, y, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2"))  setGeneric("c2", function(x, y, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME"))) standardGeneric("c2"))
403  setMethod("c2",  #setMethod("c2", signature(x = "FCorpus", y = "FCorpus"),
404            signature(x = "SCorpus", y = "SCorpus"),  #          function(x, y, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME"))) {
405            function(x, y, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {  #              new("FCorpus", .Data = c(as(x, "list"), as(y, "list")))
406    #          })
407    setMethod("c2", signature(x = "VCorpus", y = "VCorpus"),
408              function(x, y, ..., meta = list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME"))) {
409                object <- x                object <- x
410                # Concatenate data slots                # Concatenate data slots
411                object@.Data <- c(as(x, "list"), as(y, "list"))                object@.Data <- c(as(x, "list"), as(y, "list"))
# Line 448  Line 452 
452                y.dmeta.aug <- cbind(DMetaData(y), na.matrix)                y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
453                object@DMetaData <- rbind(x.dmeta.aug, y.dmeta.aug)                object@DMetaData <- rbind(x.dmeta.aug, y.dmeta.aug)
454    
455                return(object)                object
456            })            })
457    
458  setMethod("c",  setMethod("c",
459            signature(x = "TextDocument"),            signature(x = "TextDocument"),
460            function(x, ..., recursive = TRUE){            function(x, ..., recursive = FALSE){
461                args <- list(...)                args <- list(...)
462                if (identical(length(args), 0)) return(x)                if (identical(length(args), 0)) return(x)
463    
# Line 463  Line 467 
467                              MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),
468                              children = list())                              children = list())
469    
470                new("SCorpus", .Data = list(x, ...), DMetaData = dmeta.df, CMetaData = cmeta.node)                new("VCorpus", .Data = list(x, ...), DMetaData = dmeta.df, CMetaData = cmeta.node)
471            })            })
472    
473  setMethod("show",  setMethod("show",
# Line 498  Line 502 
502      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
503      show(filehash::dbMultiFetch(db, unlist(x)))      show(filehash::dbMultiFetch(db, unlist(x)))
504  }  }
505  inspect.FCorpus <- inspect.SCorpus <- function(x) {  #inspect.FCorpus <-
506    inspect.VCorpus <- function(x) {
507      summary(x)      summary(x)
508      cat("\n")      cat("\n")
509      print(noquote(lapply(x, identity)))      print(noquote(lapply(x, identity)))
# Line 511  Line 516 
516                db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])                db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])
517                any(sapply(y, function(x, z) {x %in% Content(z)}, x))                any(sapply(y, function(x, z) {x %in% Content(z)}, x))
518            })            })
519  setMethod("%IN%", signature(x = "TextDocument", y = "SCorpus"),  setMethod("%IN%", signature(x = "TextDocument", y = "VCorpus"),
520            function(x, y) x %in% y)            function(x, y) x %in% y)
521    
522  setMethod("lapply",  setMethod("lapply",
523            signature(X = "SCorpus"),            signature(X = "VCorpus"),
524            function(X, FUN, ...) {            function(X, FUN, ...) {
525                lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
526                if (!is.null(lazyTmMap))                if (!is.null(lazyTmMap))
# Line 530  Line 535 
535            })            })
536    
537  setMethod("sapply",  setMethod("sapply",
538            signature(X = "SCorpus"),            signature(X = "VCorpus"),
539            function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {            function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
540                lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")                lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
541                if (!is.null(lazyTmMap))                if (!is.null(lazyTmMap))
# Line 544  Line 549 
549                sapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)                sapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
550            })            })
551    
552  setAs("list", "SCorpus", function(from) {  setAs("list", "VCorpus", function(from) {
553      cmeta.node <- new("MetaDataNode",      cmeta.node <- new("MetaDataNode",
554                        NodeID = 0,                        NodeID = 0,
555                        MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),                        MetaData = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")),
556                        children = list())                        children = list())
557      data <- list()      data <- vector("list", length(from))
558      counter <- 1      counter <- 1
559      for (f in from) {      for (f in from) {
560          doc <- new("PlainTextDocument",          data[[counter]] <- new("PlainTextDocument",
561                     .Data = f,                     .Data = f,
562                     Author = "", DateTimeStamp = as.POSIXlt(Sys.time(), tz = "GMT"),                                 DateTimeStamp = as.POSIXlt(Sys.time(), tz = "GMT"),
563                     Description = "", ID = as.character(counter),                                 ID = as.character(counter),
564                     Origin = "", Heading = "", Language = "eng")                                 Language = "eng")
         data <- c(data, list(doc))  
565          counter <- counter + 1          counter <- counter + 1
566      }      }
567      new("SCorpus", .Data = data,      new("VCorpus", .Data = data,
568          DMetaData = data.frame(MetaID = rep(0, length(from)), stringsAsFactors = FALSE),          DMetaData = data.frame(MetaID = rep(0, length(from)), stringsAsFactors = FALSE),
569          CMetaData = cmeta.node)          CMetaData = cmeta.node)
570  })  })

Legend:
Removed from v.950  
changed lines
  Added in v.973

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge