SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 812, Tue Jan 22 13:36:33 2008 UTC revision 816, Thu Jan 24 14:36:41 2008 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # The "..." are additional arguments for the FunctionGenerator reader  # The "..." are additional arguments for the FunctionGenerator reader
4  setGeneric("TextDocCol", function(object,  setGeneric("Corpus", function(object,
5                                    readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE),                                    readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE),
6                                    dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),                                    dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),
7                                    ...) standardGeneric("TextDocCol"))                                    ...) standardGeneric("Corpus"))
8  setMethod("TextDocCol",  setMethod("Corpus",
9            signature(object = "Source"),            signature(object = "Source"),
10            function(object,            function(object,
11                     readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE),                     readerControl = list(reader = object@DefaultReader, language = "en_US", load = FALSE),
# Line 58  Line 58 
58                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
59                              children = list())                              children = list())
60    
61                return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, CMetaData = cmeta.node, DBControl = dbControl))                return(new("Corpus", .Data = tdl, DMetaData = dmeta.df, CMetaData = cmeta.node, DBControl = dbControl))
62            })            })
63    
64  setGeneric("loadDoc", function(object, ...) standardGeneric("loadDoc"))  setGeneric("loadDoc", function(object, ...) standardGeneric("loadDoc"))
# Line 69  Line 69 
69                    con <- eval(URI(object))                    con <- eval(URI(object))
70                    corpus <- readLines(con)                    corpus <- readLines(con)
71                    close(con)                    close(con)
72                    Corpus(object) <- corpus                    Content(object) <- corpus
73                    Cached(object) <- TRUE                    Cached(object) <- TRUE
74                    return(object)                    return(object)
75                } else {                } else {
# Line 85  Line 85 
85                    close(con)                    close(con)
86                    doc <- xmlTreeParse(corpus, asText = TRUE)                    doc <- xmlTreeParse(corpus, asText = TRUE)
87                    class(doc) <- "list"                    class(doc) <- "list"
88                    Corpus(object) <- doc                    Content(object) <- doc
89                    Cached(object) <- TRUE                    Cached(object) <- TRUE
90                    return(object)                    return(object)
91                } else {                } else {
# Line 104  Line 104 
104                        if (mail[index] == "")                        if (mail[index] == "")
105                            break                            break
106                    }                    }
107                    Corpus(object) <- mail[(index + 1):length(mail)]                    Content(object) <- mail[(index + 1):length(mail)]
108                    return(object)                    return(object)
109                } else {                } else {
110                    return(object)                    return(object)
# Line 127  Line 127 
127  # Update is only supported for directories  # Update is only supported for directories
128  # At the moment no other LoD devices are available anyway  # At the moment no other LoD devices are available anyway
129  setMethod("tmUpdate",  setMethod("tmUpdate",
130            signature(object = "TextDocCol", origin = "DirSource"),            signature(object = "Corpus", origin = "DirSource"),
131            function(object, origin,            function(object, origin,
132                     readerControl = list(reader = origin@DefaultReader, language = "en_US", load = FALSE),                     readerControl = list(reader = origin@DefaultReader, language = "en_US", load = FALSE),
133                     ...) {                     ...) {
# Line 155  Line 155 
155    
156  setGeneric("tmMap", function(object, FUN, ...) standardGeneric("tmMap"))  setGeneric("tmMap", function(object, FUN, ...) standardGeneric("tmMap"))
157  setMethod("tmMap",  setMethod("tmMap",
158            signature(object = "TextDocCol", FUN = "function"),            signature(object = "Corpus", FUN = "function"),
159            function(object, FUN, ...) {            function(object, FUN, ...) {
160                result <- object                result <- object
161                # Note that text corpora are automatically loaded into memory via \code{[[}                # Note that text corpora are automatically loaded into memory via \code{[[}
# Line 181  Line 181 
181  setMethod("asPlain",  setMethod("asPlain",
182            signature(object = "XMLTextDocument", FUN = "function"),            signature(object = "XMLTextDocument", FUN = "function"),
183            function(object, FUN, ...) {            function(object, FUN, ...) {
184                corpus <- Corpus(object)                corpus <- Content(object)
185    
186                # As XMLDocument is no native S4 class, restore valid information                # As XMLDocument is no native S4 class, restore valid information
187                class(corpus) <- "XMLDocument"                class(corpus) <- "XMLDocument"
# Line 193  Line 193 
193            signature(object = "Reuters21578Document"),            signature(object = "Reuters21578Document"),
194            function(object, FUN, ...) {            function(object, FUN, ...) {
195                FUN <- convertReut21578XMLPlain                FUN <- convertReut21578XMLPlain
196                corpus <- Corpus(object)                corpus <- Content(object)
197    
198                # As XMLDocument is no native S4 class, restore valid information                # As XMLDocument is no native S4 class, restore valid information
199                class(corpus) <- "XMLDocument"                class(corpus) <- "XMLDocument"
# Line 205  Line 205 
205            signature(object = "RCV1Document"),            signature(object = "RCV1Document"),
206            function(object, FUN, ...) {            function(object, FUN, ...) {
207                FUN <- convertRCV1Plain                FUN <- convertRCV1Plain
208                corpus <- Corpus(object)                corpus <- Content(object)
209    
210                # As XMLDocument is no native S4 class, restore valid information                # As XMLDocument is no native S4 class, restore valid information
211                class(corpus) <- "XMLDocument"                class(corpus) <- "XMLDocument"
# Line 216  Line 216 
216  setMethod("asPlain",  setMethod("asPlain",
217            signature(object = "NewsgroupDocument"),            signature(object = "NewsgroupDocument"),
218            function(object, FUN, ...) {            function(object, FUN, ...) {
219                new("PlainTextDocument", .Data = Corpus(object), Cached = TRUE, URI = "", Author = Author(object),                new("PlainTextDocument", .Data = Content(object), Cached = TRUE, URI = "", Author = Author(object),
220                    DateTimeStamp = DateTimeStamp(object), Description = Description(object), ID = ID(object),                    DateTimeStamp = DateTimeStamp(object), Description = Description(object), ID = ID(object),
221                    Origin = Origin(object), Heading = Heading(object), Language = Language(object))                    Origin = Origin(object), Heading = Heading(object), Language = Language(object))
222            })            })
223  setMethod("asPlain",  setMethod("asPlain",
224            signature(object = "StructuredTextDocument"),            signature(object = "StructuredTextDocument"),
225            function(object, FUN, ...) {            function(object, FUN, ...) {
226                new("PlainTextDocument", .Data = unlist(Corpus(object)), Cached = TRUE,                new("PlainTextDocument", .Data = unlist(Content(object)), Cached = TRUE,
227                    URI = "", Author = Author(object), DateTimeStamp = DateTimeStamp(object),                    URI = "", Author = Author(object), DateTimeStamp = DateTimeStamp(object),
228                    Description = Description(object), ID = ID(object), Origin = Origin(object),                    Description = Description(object), ID = ID(object), Origin = Origin(object),
229                    Heading = Heading(object), Language = Language(object))                    Heading = Heading(object), Language = Language(object))
# Line 231  Line 231 
231    
232  setGeneric("tmFilter", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmFilter"))  setGeneric("tmFilter", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmFilter"))
233  setMethod("tmFilter",  setMethod("tmFilter",
234            signature(object = "TextDocCol"),            signature(object = "Corpus"),
235            function(object, ..., FUN = sFilter, doclevel = FALSE) {            function(object, ..., FUN = sFilter, doclevel = FALSE) {
236                if (doclevel)                if (doclevel)
237                    return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))])                    return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))])
# Line 241  Line 241 
241    
242  setGeneric("tmIndex", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmIndex"))  setGeneric("tmIndex", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmIndex"))
243  setMethod("tmIndex",  setMethod("tmIndex",
244            signature(object = "TextDocCol"),            signature(object = "Corpus"),
245            function(object, ..., FUN = sFilter, doclevel = FALSE) {            function(object, ..., FUN = sFilter, doclevel = FALSE) {
246                if (doclevel)                if (doclevel)
247                    return(sapply(object, FUN, ..., DMetaData = DMetaData(object)))                    return(sapply(object, FUN, ..., DMetaData = DMetaData(object)))
# Line 276  Line 276 
276    
277  setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem"))  setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem"))
278  setMethod("appendElem",  setMethod("appendElem",
279            signature(object = "TextDocCol", data = "TextDocument"),            signature(object = "Corpus", data = "TextDocument"),
280            function(object, data, meta = NULL) {            function(object, data, meta = NULL) {
281                if (DBControl(object)[["useDb"]]) {                if (DBControl(object)[["useDb"]]) {
282                    db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]])                    db <- dbInit(DBControl(object)[["dbName"]], DBControl(object)[["dbType"]])
# Line 293  Line 293 
293    
294  setGeneric("appendMeta", function(object, cmeta = NULL, dmeta = NULL) standardGeneric("appendMeta"))  setGeneric("appendMeta", function(object, cmeta = NULL, dmeta = NULL) standardGeneric("appendMeta"))
295  setMethod("appendMeta",  setMethod("appendMeta",
296            signature(object = "TextDocCol"),            signature(object = "Corpus"),
297            function(object, cmeta = NULL, dmeta = NULL) {            function(object, cmeta = NULL, dmeta = NULL) {
298                object@CMetaData@MetaData <- c(CMetaData(object)@MetaData, cmeta)                object@CMetaData@MetaData <- c(CMetaData(object)@MetaData, cmeta)
299                if (!is.null(dmeta)) {                if (!is.null(dmeta)) {
# Line 304  Line 304 
304    
305  setGeneric("removeMeta", function(object, cname = NULL, dname = NULL) standardGeneric("removeMeta"))  setGeneric("removeMeta", function(object, cname = NULL, dname = NULL) standardGeneric("removeMeta"))
306  setMethod("removeMeta",  setMethod("removeMeta",
307            signature(object = "TextDocCol"),            signature(object = "Corpus"),
308            function(object, cname = NULL, dname = NULL) {            function(object, cname = NULL, dname = NULL) {
309                if (!is.null(cname))                if (!is.null(cname))
310                    object@CMetaData@MetaData <- CMetaData(object)@MetaData[names(CMetaData(object)@MetaData) != cname]                    object@CMetaData@MetaData <- CMetaData(object)@MetaData[names(CMetaData(object)@MetaData) != cname]
# Line 315  Line 315 
315    
316  setGeneric("prescindMeta", function(object, meta) standardGeneric("prescindMeta"))  setGeneric("prescindMeta", function(object, meta) standardGeneric("prescindMeta"))
317  setMethod("prescindMeta",  setMethod("prescindMeta",
318            signature(object = "TextDocCol", meta = "character"),            signature(object = "Corpus", meta = "character"),
319            function(object, meta) {            function(object, meta) {
320                for (m in meta) {                for (m in meta) {
321                    if (m %in% c("Author", "DateTimeStamp", "Description", "ID", "Origin", "Heading", "Language")) {                    if (m %in% c("Author", "DateTimeStamp", "Description", "ID", "Origin", "Heading", "Language")) {
# Line 341  Line 341 
341            })            })
342    
343  setMethod("[",  setMethod("[",
344            signature(x = "TextDocCol", i = "ANY", j = "ANY", drop = "ANY"),            signature(x = "Corpus", i = "ANY", j = "ANY", drop = "ANY"),
345            function(x, i, j, ... , drop) {            function(x, i, j, ... , drop) {
346                if(missing(i))                if(missing(i))
347                    return(x)                    return(x)
# Line 361  Line 361 
361            })            })
362    
363  setMethod("[<-",  setMethod("[<-",
364            signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"),            signature(x = "Corpus", i = "ANY", j = "ANY", value = "ANY"),
365            function(x, i, j, ... , value) {            function(x, i, j, ... , value) {
366                object <- x                object <- x
367                if (DBControl(object)[["useDb"]]) {                if (DBControl(object)[["useDb"]]) {
# Line 382  Line 382 
382            })            })
383    
384  setMethod("[[",  setMethod("[[",
385            signature(x = "TextDocCol", i = "ANY", j = "ANY"),            signature(x = "Corpus", i = "ANY", j = "ANY"),
386            function(x, i, j, ...) {            function(x, i, j, ...) {
387                if (DBControl(x)[["useDb"]]) {                if (DBControl(x)[["useDb"]]) {
388                    db <- dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])                    db <- dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
# Line 394  Line 394 
394            })            })
395    
396  setMethod("[[<-",  setMethod("[[<-",
397            signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"),            signature(x = "Corpus", i = "ANY", j = "ANY", value = "ANY"),
398            function(x, i, j, ..., value) {            function(x, i, j, ..., value) {
399                object <- x                object <- x
400                if (DBControl(object)[["useDb"]]) {                if (DBControl(object)[["useDb"]]) {
# Line 436  Line 436 
436  }  }
437    
438  setMethod("c",  setMethod("c",
439            signature(x = "TextDocCol"),            signature(x = "Corpus"),
440            function(x, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {            function(x, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {
441                args <- list(...)                args <- list(...)
442                if (length(args) == 0)                if (length(args) == 0)
443                    return(x)                    return(x)
444    
445                if (!all(sapply(args, inherits, "TextDocCol")))                if (!all(sapply(args, inherits, "Corpus")))
446                    stop("not all arguments are text document collections")                    stop("not all arguments are text document collections")
447                if (DBControl(x)[["useDb"]] == TRUE || any(unlist(sapply(args, DBControl)["useDb", ])))                if (DBControl(x)[["useDb"]] == TRUE || any(unlist(sapply(args, DBControl)["useDb", ])))
448                    stop("concatenating text document collections with activated database is not supported")                    stop("concatenating text document collections with activated database is not supported")
# Line 456  Line 456 
456    
457  setGeneric("c2", function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2"))  setGeneric("c2", function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) standardGeneric("c2"))
458  setMethod("c2",  setMethod("c2",
459            signature(x = "TextDocCol", y = "TextDocCol"),            signature(x = "Corpus", y = "Corpus"),
460            function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {            function(x, y, ..., meta = list(merge_date = Sys.time(), merger = Sys.getenv("LOGNAME")), recursive = TRUE) {
461                object <- x                object <- x
462                # Concatenate data slots                # Concatenate data slots
# Line 523  Line 523 
523                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
524                              children = list())                              children = list())
525    
526                return(new("TextDocCol",                return(new("Corpus",
527                           .Data = list(x, ...),                           .Data = list(x, ...),
528                           DMetaData = dmeta.df,                           DMetaData = dmeta.df,
529                           CMetaData = cmeta.node,                           CMetaData = cmeta.node,
# Line 531  Line 531 
531            })            })
532    
533  setMethod("length",  setMethod("length",
534            signature(x = "TextDocCol"),            signature(x = "Corpus"),
535            function(x){            function(x){
536                return(length(as(x, "list")))                return(length(as(x, "list")))
537      })      })
538    
539  setMethod("show",  setMethod("show",
540            signature(object = "TextDocCol"),            signature(object = "Corpus"),
541            function(object){            function(object){
542                cat(sprintf(ngettext(length(object),                cat(sprintf(ngettext(length(object),
543                                     "A text document collection with %d text document\n",                                     "A text document collection with %d text document\n",
# Line 546  Line 546 
546      })      })
547    
548  setMethod("summary",  setMethod("summary",
549            signature(object = "TextDocCol"),            signature(object = "Corpus"),
550            function(object){            function(object){
551                show(object)                show(object)
552                if (length(DMetaData(object)) > 0) {                if (length(DMetaData(object)) > 0) {
# Line 563  Line 563 
563    
564  setGeneric("inspect", function(object) standardGeneric("inspect"))  setGeneric("inspect", function(object) standardGeneric("inspect"))
565  setMethod("inspect",  setMethod("inspect",
566            signature("TextDocCol"),            signature("Corpus"),
567            function(object) {            function(object) {
568                summary(object)                summary(object)
569                cat("\n")                cat("\n")
# Line 578  Line 578 
578  # No metadata is checked  # No metadata is checked
579  setGeneric("%IN%", function(x, y) standardGeneric("%IN%"))  setGeneric("%IN%", function(x, y) standardGeneric("%IN%"))
580  setMethod("%IN%",  setMethod("%IN%",
581            signature(x = "TextDocument", y = "TextDocCol"),            signature(x = "TextDocument", y = "Corpus"),
582            function(x, y) {            function(x, y) {
583                if (DBControl(y)[["useDb"]]) {                if (DBControl(y)[["useDb"]]) {
584                    db <- dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])                    db <- dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])
585                    result <- any(sapply(y, function(x, z) {x %in% Corpus(z)}, x))                    result <- any(sapply(y, function(x, z) {x %in% Content(z)}, x))
586                }                }
587                else                else
588                    result <- x %in% y                    result <- x %in% y
# Line 590  Line 590 
590            })            })
591    
592  setMethod("lapply",  setMethod("lapply",
593            signature(X = "TextDocCol"),            signature(X = "Corpus"),
594            function(X, FUN, ...) {            function(X, FUN, ...) {
595                if (DBControl(X)[["useDb"]]) {                if (DBControl(X)[["useDb"]]) {
596                    db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])                    db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
# Line 602  Line 602 
602            })            })
603    
604  setMethod("sapply",  setMethod("sapply",
605            signature(X = "TextDocCol"),            signature(X = "Corpus"),
606            function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {            function(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE) {
607                if (DBControl(X)[["useDb"]]) {                if (DBControl(X)[["useDb"]]) {
608                    db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])                    db <- dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])

Legend:
Removed from v.812  
changed lines
  Added in v.816

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge