SCM

SCM Repository

[tm] Diff of /pkg/R/textdoccol.R
ViewVC logotype

Diff of /pkg/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 697, Fri Jan 5 23:09:12 2007 UTC revision 698, Sat Jan 6 17:05:44 2007 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # The "..." are additional arguments for the function_generator parser  # The "..." are additional arguments for the FunctionGenerator parser
4  setGeneric("TextDocCol", function(object, parser = read_plain, load = FALSE, ...) standardGeneric("TextDocCol"))  setGeneric("TextDocCol", function(object, parser = readPlain, load = FALSE, ...) standardGeneric("TextDocCol"))
5  setMethod("TextDocCol",  setMethod("TextDocCol",
6            signature(object = "Source"),            signature(object = "Source"),
7            function(object, parser = read_plain, load = FALSE, ...) {            function(object, parser = readPlain, load = FALSE, ...) {
8                if (inherits(parser, "function_generator"))                if (inherits(parser, "FunctionGenerator"))
9                    parser <- parser(...)                    parser <- parser(...)
10    
11                tdl <- list()                tdl <- list()
12                counter <- 1                counter <- 1
13                while (!eoi(object)) {                while (!eoi(object)) {
14                    object <- step_next(object)                    object <- stepNext(object)
15                    elem <- get_elem(object)                    elem <- getElem(object)
16                    # If there is no Load on Demand support                    # If there is no Load on Demand support
17                    # we need to load the corpus into memory at startup                    # we need to load the corpus into memory at startup
18                    if (!object@LoDSupport)                    if (!object@LoDSupport)
# Line 22  Line 22 
22                }                }
23    
24                dmeta.df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)                dmeta.df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
25                dcmeta.node <- new("MetaDataNode",                cmeta.node <- new("MetaDataNode",
26                              NodeID = 0,                              NodeID = 0,
27                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
28                              children = list())                              children = list())
29    
30                return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, DCMetaData = dcmeta.node))                return(new("TextDocCol", .Data = tdl, DMetaData = dmeta.df, CMetaData = cmeta.node))
31            })            })
32    
33  setGeneric("load_doc", function(object, ...) standardGeneric("load_doc"))  setGeneric("loadDoc", function(object, ...) standardGeneric("loadDoc"))
34  setMethod("load_doc",  setMethod("loadDoc",
35            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
36            function(object, ...) {            function(object, ...) {
37                if (!Cached(object)) {                if (!Cached(object)) {
# Line 45  Line 45 
45                    return(object)                    return(object)
46                }                }
47            })            })
48  setMethod("load_doc",  setMethod("loadDoc",
49            signature(object =  "XMLTextDocument"),            signature(object =  "XMLTextDocument"),
50            function(object, ...) {            function(object, ...) {
51                if (!Cached(object)) {                if (!Cached(object)) {
# Line 61  Line 61 
61                    return(object)                    return(object)
62                }                }
63            })            })
64  setMethod("load_doc",  setMethod("loadDoc",
65            signature(object = "NewsgroupDocument"),            signature(object = "NewsgroupDocument"),
66            function(object, ...) {            function(object, ...) {
67                if (!Cached(object)) {                if (!Cached(object)) {
# Line 80  Line 80 
80                }                }
81            })            })
82    
83  setGeneric("tm_update", function(object, origin, parser = read_plain, ...) standardGeneric("tm_update"))  setGeneric("tmUpdate", function(object, origin, parser = readPlain, ...) standardGeneric("tmUpdate"))
84  # Update is only supported for directories  # Update is only supported for directories
85  # At the moment no other LoD devices are available anyway  # At the moment no other LoD devices are available anyway
86  setMethod("tm_update",  setMethod("tmUpdate",
87            signature(object = "TextDocCol", origin = "DirSource"),            signature(object = "TextDocCol", origin = "DirSource"),
88            function(object, origin, parser = read_plain, ...) {            function(object, origin, parser = readPlain, load = FALSE, ...) {
89                if (inherits(parser, "function_generator"))                if (inherits(parser, "FunctionGenerator"))
90                    parser <- parser(...)                    parser <- parser(...)
91    
92                object.filelist <- unlist(lapply(object, function(x) {as.character(URI(x))[2]}))                object.filelist <- unlist(lapply(object, function(x) {as.character(URI(x))[2]}))
# Line 95  Line 95 
95                for (filename in new.files) {                for (filename in new.files) {
96                    elem <- list(content = readLines(filename),                    elem <- list(content = readLines(filename),
97                                 uri = substitute(file(filename)))                                 uri = substitute(file(filename)))
98                    object <- append_doc(object, parser(elem, TRUE, origin@Load, filename), NA)                    object <- appendElem(object, parser(elem, load, filename))
99                }                }
100    
101                return(object)                return(object)
102            })            })
103    
104  setGeneric("tm_map", function(object, FUN, ...) standardGeneric("tm_map"))  setGeneric("tmMap", function(object, FUN, ...) standardGeneric("tmMap"))
105  setMethod("tm_map",  setMethod("tmMap",
106            signature(object = "TextDocCol", FUN = "function"),            signature(object = "TextDocCol", FUN = "function"),
107            function(object, FUN, ...) {            function(object, FUN, ...) {
108                result <- object                result <- object
# Line 111  Line 111 
111                return(result)                return(result)
112            })            })
113    
114  setGeneric("as.plain", function(object, FUN, ...) standardGeneric("as.plain"))  setGeneric("asPlain", function(object, FUN, ...) standardGeneric("asPlain"))
115  setMethod("as.plain",  setMethod("asPlain",
116            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
117            function(object, FUN, ...) {            function(object, FUN, ...) {
118                return(object)                return(object)
119            })            })
120  setMethod("as.plain",  setMethod("asPlain",
121            signature(object = "XMLTextDocument", FUN = "function"),            signature(object = "XMLTextDocument", FUN = "function"),
122            function(object, FUN, ...) {            function(object, FUN, ...) {
123                corpus <- Corpus(object)                corpus <- Corpus(object)
# Line 129  Line 129 
129                return(FUN(xmlRoot(corpus), ...))                return(FUN(xmlRoot(corpus), ...))
130            })            })
131    
132  setGeneric("tm_tolower", function(object, ...) standardGeneric("tm_tolower"))  setGeneric("tmTolower", function(object, ...) standardGeneric("tmTolower"))
133  setMethod("tm_tolower",  setMethod("tmTolower",
134            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
135            function(object, ...) {            function(object, ...) {
136                Corpus(object) <- tolower(object)                Corpus(object) <- tolower(object)
137                return(object)                return(object)
138            })            })
139    
140  setGeneric("strip_whitespace", function(object, ...) standardGeneric("strip_whitespace"))  setGeneric("stripWhitespace", function(object, ...) standardGeneric("stripWhitespace"))
141  setMethod("strip_whitespace",  setMethod("stripWhitespace",
142            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
143            function(object, ...) {            function(object, ...) {
144                Corpus(object) <- gsub("[[:space:]]+", " ", object)                Corpus(object) <- gsub("[[:space:]]+", " ", object)
145                return(object)                return(object)
146            })            })
147    
148  setGeneric("stem_doc", function(object, ...) standardGeneric("stem_doc"))  setGeneric("stemDoc", function(object, ...) standardGeneric("stemDoc"))
149  setMethod("stem_doc",  setMethod("stemDoc",
150            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
151            function(object, ...) {            function(object, ...) {
152                require("Rstem")                require("Rstem")
# Line 156  Line 156 
156                return(object)                return(object)
157            })            })
158    
159  setGeneric("remove_words", function(object, stopwords, ...) standardGeneric("remove_words"))  setGeneric("removeWords", function(object, stopwords, ...) standardGeneric("removeWords"))
160  setMethod("remove_words",  setMethod("removeWords",
161            signature(object = "PlainTextDocument", stopwords = "character"),            signature(object = "PlainTextDocument", stopwords = "character"),
162            function(object, stopwords, ...) {            function(object, stopwords, ...) {
163                require("Rstem")                require("Rstem")
# Line 167  Line 167 
167                return(object)                return(object)
168            })            })
169    
170  setGeneric("tm_filter", function(object, ..., FUN = s_filter, doclevel = FALSE) standardGeneric("tm_filter"))  setGeneric("tmFilter", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmFilter"))
171  setMethod("tm_filter",  setMethod("tmFilter",
172            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
173            function(object, ..., FUN = s_filter, doclevel = FALSE) {            function(object, ..., FUN = sFilter, doclevel = FALSE) {
174                if (doclevel)                if (doclevel)
175                    return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))])                    return(object[sapply(object, FUN, ..., DMetaData = DMetaData(object))])
176                else                else
177                    return(object[FUN(object, ...)])                    return(object[FUN(object, ...)])
178            })            })
179    
180  setGeneric("tm_index", function(object, ..., FUN = s_filter, doclevel = FALSE) standardGeneric("tm_index"))  setGeneric("tmIndex", function(object, ..., FUN = sFilter, doclevel = FALSE) standardGeneric("tmIndex"))
181  setMethod("tm_index",  setMethod("tmIndex",
182            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
183            function(object, ..., FUN = s_filter, doclevel = FALSE) {            function(object, ..., FUN = sFilter, doclevel = FALSE) {
184                if (doclevel)                if (doclevel)
185                    return(sapply(object, FUN, ..., DMetaData = DMetaData(object)))                    return(sapply(object, FUN, ..., DMetaData = DMetaData(object)))
186                else                else
187                    return(FUN(object, ...))                    return(FUN(object, ...))
188            })            })
189    
190  s_filter <- function(object, s, ...) {  sFilter <- function(object, s, ...) {
191      query.df <- DMetaData(object)      query.df <- DMetaData(object)
192      con <- textConnection(s)      con <- textConnection(s)
193      tokens <- scan(con, "character")      tokens <- scan(con, "character")
# Line 245  Line 245 
245      return(result)      return(result)
246  }  }
247    
248  setGeneric("search_fulltext", function(object, pattern, ...) standardGeneric("search_fulltext"))  setGeneric("searchFullText", function(object, pattern, ...) standardGeneric("searchFullText"))
249  setMethod("search_fulltext",  setMethod("searchFullText",
250            signature(object = "PlainTextDocument", pattern = "character"),            signature(object = "PlainTextDocument", pattern = "character"),
251            function(object, pattern, ...) {            function(object, pattern, ...) {
252                return(any(grep(pattern, Corpus(object))))                return(any(grep(pattern, Corpus(object))))
253            })            })
254    
255  setGeneric("append_elem", function(object, data, meta = NULL) standardGeneric("append_elem"))  setGeneric("appendElem", function(object, data, meta = NULL) standardGeneric("appendElem"))
256  setMethod("append_elem",  setMethod("appendElem",
257            signature(object = "TextDocCol", data = "TextDocument"),            signature(object = "TextDocCol", data = "TextDocument"),
258            function(object, data, meta = NULL) {            function(object, data, meta = NULL) {
259                object@.Data[[length(object)+1]] <- data                object@.Data[[length(object)+1]] <- data
260                object@DMetaData <- rbind(object@DMetaData, c(MetaID = DCMetaData(object)@NodeID, meta))                object@DMetaData <- rbind(object@DMetaData, c(MetaID = CMetaData(object)@NodeID, meta))
261                return(object)                return(object)
262            })            })
263    
264  setGeneric("append_meta", function(object, dcmeta = NULL, dmeta = NULL) standardGeneric("append_meta"))  setGeneric("appendMeta", function(object, cmeta = NULL, dmeta = NULL) standardGeneric("appendMeta"))
265  setMethod("append_meta",  setMethod("appendMeta",
266            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
267            function(object, dcmeta = NULL, dmeta = NULL) {            function(object, cmeta = NULL, dmeta = NULL) {
268                object@DCMetaData@MetaData <- c(object@DCMetaData@MetaData, dcmeta)                object@CMetaData@MetaData <- c(object@CMetaData@MetaData, cmeta)
269                if (!is.null(dcmeta))                if (!is.null(cmeta))
270                    object@DMetaData <- cbind(object@DMetaData, dmeta)                    object@DMetaData <- cbind(object@DMetaData, dmeta)
271                return(object)                return(object)
272            })            })
273    
274  setGeneric("remove_meta", function(object, dcname = NULL, dname = NULL) standardGeneric("remove_meta"))  setGeneric("removeMeta", function(object, cname = NULL, dname = NULL) standardGeneric("removeMeta"))
275  setMethod("remove_meta",  setMethod("removeMeta",
276            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
277            function(object, dcname = NULL, dname = NULL) {            function(object, cname = NULL, dname = NULL) {
278                if (!is.null(dcname)) {                if (!is.null(cname)) {
279                    object@DCMetaData@MetaData <- DCMetaData(object)@MetaData[names(DCMetaData(object)@MetaData) != dcname]                    object@CMetaData@MetaData <- CMetaData(object)@MetaData[names(CMetaData(object)@MetaData) != cname]
280                }                }
281                if (!is.null(dname)) {                if (!is.null(dname)) {
282                    object@DMetaData <- DMetaData(object)[names(DMetaData(object)) != dname]                    object@DMetaData <- DMetaData(object)[names(DMetaData(object)) != dname]
# Line 284  Line 284 
284                return(object)                return(object)
285            })            })
286    
287  setGeneric("prescind_meta", function(object, meta) standardGeneric("prescind_meta"))  setGeneric("prescindMeta", function(object, meta) standardGeneric("prescindMeta"))
288  setMethod("prescind_meta",  setMethod("prescindMeta",
289            signature(object = "TextDocCol", meta = "character"),            signature(object = "TextDocCol", meta = "character"),
290            function(object, meta) {            function(object, meta) {
291                for (m in meta) {                for (m in meta) {
# Line 336  Line 336 
336  setMethod("[[",  setMethod("[[",
337            signature(x = "TextDocCol", i = "ANY", j = "ANY"),            signature(x = "TextDocCol", i = "ANY", j = "ANY"),
338            function(x, i, j, ...) {            function(x, i, j, ...) {
339                return(load_doc(x@.Data[[i]]))                return(loadDoc(x@.Data[[i]]))
340            })            })
341    
342  setMethod("[[<-",  setMethod("[[<-",
# Line 347  Line 347 
347                return(object)                return(object)
348            })            })
349    
350  # Update \code{NodeID}s of a DCMetaData tree  # Update \code{NodeID}s of a CMetaData tree
351  update_id <- function(object, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {  update_id <- function(object, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
352      # Traversal of (binary) DCMetaData tree with setup of \code{NodeID}s      # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s
353      set_id <- function(object) {      set_id <- function(object) {
354          object@NodeID <- id          object@NodeID <- id
355          id <<- id + 1          id <<- id + 1
# Line 399  Line 399 
399                # Concatenate data slots                # Concatenate data slots
400                object@.Data <- c(as(x, "list"), as(y, "list"))                object@.Data <- c(as(x, "list"), as(y, "list"))
401    
402                # Update the DCMetaData tree                # Update the CMetaData tree
403                dcmeta <- new("MetaDataNode", NodeID = 0, MetaData = meta, children = list(DCMetaData(x), DCMetaData(y)))                cmeta <- new("MetaDataNode", NodeID = 0, MetaData = meta, children = list(CMetaData(x), CMetaData(y)))
404                update.struct <- update_id(dcmeta)                update.struct <- update_id(cmeta)
405                object@DCMetaData <- update.struct$root                object@CMetaData <- update.struct$root
406    
407                # Find indices to be updated for the left tree                # Find indices to be updated for the left tree
408                indices.mapping <- NULL                indices.mapping <- NULL
# Line 453  Line 453 
453                    return(x)                    return(x)
454    
455                dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)                dmeta.df <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
456                dcmeta.node <- new("MetaDataNode",                cmeta.node <- new("MetaDataNode",
457                              NodeID = 0,                              NodeID = 0,
458                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),                              MetaData = list(create_date = Sys.time(), creator = Sys.getenv("LOGNAME")),
459                              children = list())                              children = list())
460    
461                return(new("TextDocCol", .Data = list(x, ...), DMetaData = dmeta.df, DCMetaData = dcmeta.node))                return(new("TextDocCol", .Data = list(x, ...), DMetaData = dmeta.df, CMetaData = cmeta.node))
462            })            })
463    
464  setMethod("length",  setMethod("length",
# Line 481  Line 481 
481            function(object){            function(object){
482                show(object)                show(object)
483                if (length(DMetaData(object)) > 0) {                if (length(DMetaData(object)) > 0) {
484                    cat(sprintf(ngettext(length(DCMetaData(object)@MetaData),                    cat(sprintf(ngettext(length(CMetaData(object)@MetaData),
485                                                "\nThe metadata consists of %d tag-value pair and a data frame\n",                                                "\nThe metadata consists of %d tag-value pair and a data frame\n",
486                                                "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                                                "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
487                                         length(DCMetaData(object)@MetaData)))                                         length(CMetaData(object)@MetaData)))
488                    cat("Available tags are:\n")                    cat("Available tags are:\n")
489                    cat(names(DCMetaData(object)@MetaData), "\n")                    cat(names(CMetaData(object)@MetaData), "\n")
490                    cat("Available variables in the data frame are:\n")                    cat("Available variables in the data frame are:\n")
491                    cat(names(DMetaData(object)), "\n")                    cat(names(DMetaData(object)), "\n")
492                }                }

Legend:
Removed from v.697  
changed lines
  Added in v.698

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge