SCM

SCM Repository

[tm] Diff of /trunk/tm/R/textdoccol.R
ViewVC logotype

Diff of /trunk/tm/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 65, Tue Oct 31 17:10:24 2006 UTC revision 66, Tue Oct 31 22:03:33 2006 UTC
# Line 11  Line 11 
11                tdl <- list()                tdl <- list()
12                counter <- 1                counter <- 1
13                while (!eoi(object)) {                while (!eoi(object)) {
14                    object <- stepNext(object)                    object <- step_next(object)
15                    elem <- getElem(object)                    elem <- get_elem(object)
16                    # If there is no Load on Demand support                    # If there is no Load on Demand support
17                    # we need to load the corpus into memory at startup                    # we need to load the corpus into memory at startup
18                    if (object@LoDSupport)                    if (object@LoDSupport)
# Line 63  Line 63 
63                    Content = content, Position = 0)                    Content = content, Position = 0)
64            })            })
65    
66  setGeneric("stepNext", function(object) standardGeneric("stepNext"))  setGeneric("step_next", function(object) standardGeneric("step_next"))
67  setMethod("stepNext",  setMethod("step_next",
68            signature(object = "DirSource"),            signature(object = "DirSource"),
69            function(object) {            function(object) {
70                object@Position <- object@Position + 1                object@Position <- object@Position + 1
71                object                object
72            })            })
73  setMethod("stepNext",  setMethod("step_next",
74            signature(object = "CSVSource"),            signature(object = "CSVSource"),
75            function(object) {            function(object) {
76                object@Position <- object@Position + 1                object@Position <- object@Position + 1
77                object                object
78            })            })
79  setMethod("stepNext",  setMethod("step_next",
80            signature(object = "ReutersSource"),            signature(object = "ReutersSource"),
81            function(object) {            function(object) {
82                object@Position <- object@Position + 1                object@Position <- object@Position + 1
83                object                object
84            })            })
85    
86  setGeneric("getElem", function(object) standardGeneric("getElem"))  setGeneric("get_elem", function(object) standardGeneric("get_elem"))
87  setMethod("getElem",  setMethod("get_elem",
88            signature(object = "DirSource"),            signature(object = "DirSource"),
89            function(object) {            function(object) {
90                list(content = readLines(object@FileList[object@Position]),                list(content = readLines(object@FileList[object@Position]),
91                     uri = paste('file("', object@FileList[object@Position], '")', sep = ""))                     uri = paste('file("', object@FileList[object@Position], '")', sep = ""))
92            })            })
93  setMethod("getElem",  setMethod("get_elem",
94            signature(object = "CSVSource"),            signature(object = "CSVSource"),
95            function(object) {            function(object) {
96                list(content = object@Content[object@Position],                list(content = object@Content[object@Position],
97                     uri = object@URI)                     uri = object@URI)
98            })            })
99  setMethod("getElem",  setMethod("get_elem",
100            signature(object = "ReutersSource"),            signature(object = "ReutersSource"),
101            function(object) {            function(object) {
102                # Construct a character representation from the XMLNode                # Construct a character representation from the XMLNode
# Line 149  Line 149 
149  }  }
150  class(plaintext_parser) <- "function_generator"  class(plaintext_parser) <- "function_generator"
151    
152  reuters21578xml_parser <- function(...) {  reut21578xml_parser <- function(...) {
153      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
154          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
155          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 189  Line 189 
189          return(doc)          return(doc)
190      }      }
191  }  }
192  class(reuters21578xml_parser) <- "function_generator"  class(reut21578xml_parser) <- "function_generator"
193    
194  rcv1_parser <- function(...) {  rcv1_parser <- function(...) {
195      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
# Line 219  Line 219 
219  }  }
220  class(rcv1_parser) <- "function_generator"  class(rcv1_parser) <- "function_generator"
221    
222  uci_kdd_newsgroup_parser <- function(...) {  newsgroup_parser <- function(...) {
223      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
224          mail <- elem$content          mail <- elem$content
225          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
# Line 249  Line 249 
249          return(doc)          return(doc)
250      }      }
251  }  }
252  class(uci_kdd_newsgroup_parser) <- "function_generator"  class(newsgroup_parser) <- "function_generator"
253    
254  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
255  rcv1_to_plain <- function(node, ...) {  rcv1_to_plain <- function(node, ...) {
# Line 264  Line 264 
264  }  }
265    
266  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
267  reuters21578xml_to_plain <- function(node, ...) {  reut21578xml_to_plain <- function(node, ...) {
268      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
269      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
270          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 295  Line 295 
295          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
296  }  }
297    
298  setGeneric("loadFileIntoMem", function(object, ...) standardGeneric("loadFileIntoMem"))  setGeneric("load_doc", function(object, ...) standardGeneric("load_doc"))
299  setMethod("loadFileIntoMem",  setMethod("load_doc",
300            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
301            function(object, ...) {            function(object, ...) {
302                if (!Cached(object)) {                if (!Cached(object)) {
# Line 310  Line 310 
310                    return(object)                    return(object)
311                }                }
312            })            })
313  setMethod("loadFileIntoMem",  setMethod("load_doc",
314            signature(object =  "XMLTextDocument"),            signature(object =  "XMLTextDocument"),
315            function(object, ...) {            function(object, ...) {
316                if (!Cached(object)) {                if (!Cached(object)) {
# Line 326  Line 326 
326                    return(object)                    return(object)
327                }                }
328            })            })
329  setMethod("loadFileIntoMem",  setMethod("load_doc",
330            signature(object = "NewsgroupDocument"),            signature(object = "NewsgroupDocument"),
331            function(object, ...) {            function(object, ...) {
332                if (!Cached(object)) {                if (!Cached(object)) {
# Line 354  Line 354 
354                return(result)                return(result)
355            })            })
356    
357  setGeneric("toPlainTextDocument", function(object, FUN, ...) standardGeneric("toPlainTextDocument"))  setGeneric("as.plaintext_doc", function(object, FUN, ...) standardGeneric("as.plaintext_doc"))
358  setMethod("toPlainTextDocument",  setMethod("as.plaintext_doc",
359            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
360            function(object, FUN, ...) {            function(object, FUN, ...) {
361                return(object)                return(object)
362            })            })
363  setMethod("toPlainTextDocument",  setMethod("as.plaintext_doc",
364            signature(object = "XMLTextDocument", FUN = "function"),            signature(object = "XMLTextDocument", FUN = "function"),
365            function(object, FUN, ...) {            function(object, FUN, ...) {
366                if (!Cached(object))                if (!Cached(object))
367                    object <- loadFileIntoMem(object)                    object <- load_doc(object)
368    
369                corpus <- Corpus(object)                corpus <- Corpus(object)
370    
# Line 375  Line 375 
375                return(FUN(xmlRoot(corpus), ...))                return(FUN(xmlRoot(corpus), ...))
376            })            })
377    
378  setGeneric("stemTextDocument", function(object, ...) standardGeneric("stemTextDocument"))  setGeneric("stem_doc", function(object, ...) standardGeneric("stem_doc"))
379  setMethod("stemTextDocument",  setMethod("stem_doc",
380            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
381            function(object, ...) {            function(object, ...) {
382                if (!Cached(object))                if (!Cached(object))
383                    object <- loadFileIntoMem(object)                    object <- load_doc(object)
384    
385                require(Rstem)                require(Rstem)
386                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
# Line 389  Line 389 
389                return(object)                return(object)
390            })            })
391    
392  setGeneric("removeStopWords", function(object, stopwords, ...) standardGeneric("removeStopWords"))  setGeneric("remove_words", function(object, stopwords, ...) standardGeneric("remove_words"))
393  setMethod("removeStopWords",  setMethod("remove_words",
394            signature(object = "PlainTextDocument", stopwords = "character"),            signature(object = "PlainTextDocument", stopwords = "character"),
395            function(object, stopwords, ...) {            function(object, stopwords, ...) {
396                if (!Cached(object))                if (!Cached(object))
397                    object <- loadFileIntoMem(object)                    object <- load_doc(object)
398    
399                require(Rstem)                require(Rstem)
400                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))                splittedCorpus <- unlist(strsplit(object, " ", fixed = TRUE))
# Line 436  Line 436 
436            signature(object = "PlainTextDocument", pattern = "character"),            signature(object = "PlainTextDocument", pattern = "character"),
437            function(object, pattern, ...) {            function(object, pattern, ...) {
438                if (!Cached(object))                if (!Cached(object))
439                    object <- loadFileIntoMem(object)                    object <- load_doc(object)
440    
441                return(any(grep(pattern, Corpus(object))))                return(any(grep(pattern, Corpus(object))))
442            })            })
443    
444  setGeneric("attachData", function(object, data) standardGeneric("attachData"))  setGeneric("attach_data", function(object, data) standardGeneric("attach_data"))
445  setMethod("attachData",  setMethod("attach_data",
446            signature(object = "TextDocCol", data = "TextDocument"),            signature(object = "TextDocCol", data = "TextDocument"),
447            function(object, data) {            function(object, data) {
448                data <- as(list(data), "TextDocCol")                data <- as(list(data), "TextDocCol")
# Line 450  Line 450 
450                return(object)                return(object)
451            })            })
452    
453  setGeneric("attachMetaData", function(object, name, metadata) standardGeneric("attachMetaData"))  setGeneric("attach_metadata", function(object, name, metadata) standardGeneric("attach_metadata"))
454  setMethod("attachMetaData",  setMethod("attach_metadata",
455            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
456            function(object, name, metadata) {            function(object, name, metadata) {
457                object@GlobalMetaData <- c(GlobalMetaData(object), new = list(metadata))                object@GlobalMetaData <- c(GlobalMetaData(object), new = list(metadata))
# Line 459  Line 459 
459                return(object)                return(object)
460            })            })
461    
462  setGeneric("setSubscriptable", function(object, name) standardGeneric("setSubscriptable"))  setGeneric("set_subscriptable", function(object, name) standardGeneric("set_subscriptable"))
463  setMethod("setSubscriptable",  setMethod("set_subscriptable",
464            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
465            function(object, name) {            function(object, name) {
466                if (!is.character(GlobalMetaData(object)$subscriptable))                if (!is.character(GlobalMetaData(object)$subscriptable))
467                    object <- attachMetaData(object, "subscriptable", name)                    object <- attach_metadata(object, "subscriptable", name)
468                else                else
469                    object@GlobalMetaData$subscriptable <- c(GlobalMetaData(object)$subscriptable, name)                    object@GlobalMetaData$subscriptable <- c(GlobalMetaData(object)$subscriptable, name)
470                return(object)                return(object)

Legend:
Removed from v.65  
changed lines
  Added in v.66

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge