SCM

SCM Repository

[tm] Diff of /trunk/R/textmin/R/textdoccol.R
ViewVC logotype

Diff of /trunk/R/textmin/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 60, Sun Oct 22 17:57:47 2006 UTC revision 61, Mon Oct 23 20:07:05 2006 UTC
# Line 23  Line 23 
23      return(doc)      return(doc)
24  }  }
25    
26  reuter21578xml.parser <- function(file, lod) {  reuters21578xml.parser <- function(file, lod) {
27      tree <- xmlTreeParse(file)      tree <- xmlTreeParse(file)
28      node <- xmlRoot(tree)      node <- xmlRoot(tree)
29    
# Line 94  Line 94 
94  }  }
95    
96  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
 # TODO: Check if it works with example  
97  rcv1.to.plain <- function(node) {  rcv1.to.plain <- function(node) {
98      datetimestamp <- xmlAttrs(node)[["date"]]      datetimestamp <- xmlAttrs(node)[["date"]]
99      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
# Line 107  Line 106 
106  }  }
107    
108  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
 # TODO: Ensure it works  
109  reuters21578xml.to.plain <- function(node) {  reuters21578xml.to.plain <- function(node) {
110      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
111      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
# Line 139  Line 137 
137          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
138  }  }
139    
140  setGeneric("loadFileIntoMem", function(object, ...) standardGeneric("loadFileIntoMem"))  setGeneric("loadFileIntoMem", function(object) standardGeneric("loadFileIntoMem"))
141  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
142            c("PlainTextDocument"),            c("PlainTextDocument"),
143            function(object, ...) {            function(object) {
144                if (Cached(object) == FALSE) {                if (!Cached(object)) {
145                    corpus <- readLines(FileName(object))                    corpus <- readLines(FileName(object))
146                    Corpus(object) <- corpus                    Corpus(object) <- corpus
147                    Cached(object) <- TRUE                    Cached(object) <- TRUE
# Line 154  Line 152 
152            })            })
153  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
154            c("XMLTextDocument"),            c("XMLTextDocument"),
155            function(object, ...) {            function(object) {
156                if (Cached(object) == FALSE) {                if (!Cached(object)) {
157                    file <- FileName(object)                    file <- FileName(object)
158                    doc <- xmlTreeParse(file)                    doc <- xmlTreeParse(file)
159                    class(doc) <- "list"                    class(doc) <- "list"
# Line 168  Line 166 
166            })            })
167  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
168            c("NewsgroupDocument"),            c("NewsgroupDocument"),
169            function(object, ...) {            function(object) {
170                if (Cached(object) == FALSE) {                if (!Cached(object)) {
171                    mail <- readLines(FileName(object))                    mail <- readLines(FileName(object))
172                    Cached(object) <- TRUE                    Cached(object) <- TRUE
173                    index <- grep("^Lines:", mail)                    index <- grep("^Lines:", mail)
# Line 198  Line 196 
196  setMethod("toPlainTextDocument",  setMethod("toPlainTextDocument",
197            c("XMLTextDocument"),            c("XMLTextDocument"),
198            function(object, FUN, ...) {            function(object, FUN, ...) {
199                if (Cached(object) == FALSE)                if (!Cached(object))
200                    object <- loadFileIntoMem(object)                    object <- loadFileIntoMem(object)
201    
202                corpus <- Corpus(object)                corpus <- Corpus(object)
# Line 207  Line 205 
205                class(corpus) <- "XMLDocument"                class(corpus) <- "XMLDocument"
206                names(corpus) <- c("doc","dtd")                names(corpus) <- c("doc","dtd")
207    
208                return(FUN(xmlRoot(corpus), ...)))                return(FUN(xmlRoot(corpus), ...))
209            })            })
210    
211  setGeneric("stemTextDocument", function(object, ...) standardGeneric("stemTextDocument"))  setGeneric("stemTextDocument", function(object, ...) standardGeneric("stemTextDocument"))
212  setMethod("stemTextDocument",  setMethod("stemTextDocument",
213            c("PlainTextDocument"),            c("PlainTextDocument"),
214            function(object) {            function(object, ...) {
215                if (Cached(object) == FALSE)                if (!Cached(object))
216                    object <- loadFileIntoMem(object)                    object <- loadFileIntoMem(object)
217    
218                require(Rstem)                require(Rstem)
# Line 227  Line 225 
225  setGeneric("removeStopWords", function(object, stopwords, ...) standardGeneric("removeStopWords"))  setGeneric("removeStopWords", function(object, stopwords, ...) standardGeneric("removeStopWords"))
226  setMethod("removeStopWords",  setMethod("removeStopWords",
227            signature(object = "PlainTextDocument", stopwords = "character"),            signature(object = "PlainTextDocument", stopwords = "character"),
228            function(object, stopwords) {            function(object, stopwords, ...) {
229                if (Cached(object) == FALSE)                if (!Cached(object))
230                    object <- loadFileIntoMem(object)                    object <- loadFileIntoMem(object)
231    
232                require(Rstem)                require(Rstem)
# Line 238  Line 236 
236                return(object)                return(object)
237            })            })
238    
239  setGeneric("tm_filter", function(object, FUN, ...) standardGeneric("tm_filter"))  setGeneric("tm_filter", function(object, ..., FUN = s.filter) standardGeneric("tm_filter"))
240  setMethod("tm_filter",  setMethod("tm_filter",
241            c("TextDocCol"),            signature(object = "TextDocCol"),
242            function(object, FUN, ...) {            function(object, ..., FUN = s.filter) {
243                  object[tm_index(object, ..., FUN)]
244              })
245    
246    setGeneric("tm_index", function(object, ..., FUN = s.filter) standardGeneric("tm_index"))
247    setMethod("tm_index",
248              signature(object = "TextDocCol"),
249              function(object, ..., FUN = s.filter) {
250                sapply(object, FUN, ..., GlobalMetaData = GlobalMetaData(object))                sapply(object, FUN, ..., GlobalMetaData = GlobalMetaData(object))
251            })            })
252    
253  setGeneric("filterREUT21578Topics", function(object, topics, ...) standardGeneric("filterREUT21578Topics"))  s.filter <- function(object, s, ..., GlobalMetaData) {
254  setMethod("filterREUT21578Topics",      b <- TRUE
255        for (tag in names(s)) {
256            if (tag %in% names(LocalMetaData(object))) {
257                b <- b && any(grep(s[[tag]], LocalMetaData(object)[[tag]]))
258            } else if (tag %in% names(GlobalMetaData)){
259                b <- b && any(grep(s[[tag]], GlobalMetaData[[tag]]))
260            } else {
261                b <- b && any(grep(s[[tag]], eval(call(tag, object))))
262            }
263        }
264        return(b)
265    }
266    
267    setGeneric("fulltext.search.filter", function(object, pattern, ...) standardGeneric("fulltext.search.filter"))
268    setMethod("fulltext.search.filter",
269              signature(object = "PlainTextDocument", pattern = "character"),
270              function(object, pattern, ...) {
271                  if (!Cached(object))
272                      object <- loadFileIntoMem(object)
273    
274                  return(any(grep(pattern, Corpus(object))))
275              })
276    
277    setGeneric("reuters21578.topic.filter", function(object, topics, ...) standardGeneric("reuters21578.topic.filter"))
278    setMethod("reuters21578.topic.filter",
279            c("PlainTextDocument", "character"),            c("PlainTextDocument", "character"),
280            function(object, topics) {            function(object, topics, ...) {
281                if (Cached(object) == FALSE)                if (!Cached(object))
282                    object <- loadFileIntoMem(object)                    object <- loadFileIntoMem(object)
283    
284                if (any(LocalMetaData(object)$Topics %in% topics))                if (any(LocalMetaData(object)$Topics %in% topics))
# Line 258  Line 287 
287                    return(FALSE)                    return(FALSE)
288            })            })
289    
290  setGeneric("filterIDs", function(object, IDs, ...) standardGeneric("filterIDs"))  setGeneric("id.filter", function(object, IDs, ...) standardGeneric("id.filter"))
291  setMethod("filterIDs",  setMethod("id.filter",
292            c("TextDocument", "numeric"),            c("TextDocument", "numeric"),
293            function(object, IDs) {            function(object, IDs, ...) {
294                if (ID(object) %in% IDs)                if (ID(object) %in% IDs)
295                    return(TRUE)                    return(TRUE)
296                else                else

Legend:
Removed from v.60  
changed lines
  Added in v.61

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge