SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 62, Tue Oct 24 10:08:58 2006 UTC revision 63, Thu Oct 26 14:59:09 2006 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  setGeneric("TextDocCol", function(object, parser = plaintext.parser, lod = FALSE) standardGeneric("TextDocCol"))  # ... fuer Argumente des Funktionengenerators
4    setGeneric("TextDocCol", function(object, parser = plaintext_parser, ...) standardGeneric("TextDocCol"))
5  setMethod("TextDocCol",  setMethod("TextDocCol",
6            signature(object = "character"),            signature(object = "Source"),
7            function(object, parser = plaintext.parser, lod = FALSE) {            function(object, parser = plaintext_parser) {
8                filelist <- dir(object, full.names = TRUE)                if (inherits(parser, "function_generator"))
9                tdl <- lapply(filelist, parser, lod)                    parser <- parser(...)
10    
11                  tdl <- list()
12                  counter <- 1
13                  while (!eoi(object)) {
14                      object <- stepNext(object)
15                      elem <- getElem(object)
16                      # If there is no Load on Demand support
17                      # we need to load the corpus into memory at startup
18                      if (object@LoDSupport)
19                          load <- object@Load
20                      else
21                          load <- TRUE
22                      tdl <- c(tdl, list(parser(elem, object@LoDSupport, load, as.character(counter))))
23                      counter <- counter + 1
24                  }
25    
26                return(new("TextDocCol", .Data = tdl))                return(new("TextDocCol", .Data = tdl))
27            })            })
28    
29  plaintext.parser <- function(file, lod) {  setGeneric("DirSource", function(directory, load = FALSE) standardGeneric("DirSource"))
30      id <- file  setMethod("DirSource",
31      origin <- dirname(file)            signature(directory = "character"),
32              function(directory, load = FALSE) {
33                  new("DirSource", LoDSupport = TRUE, FileList = dir(directory, full.names = TRUE),
34                      Position = 0, Load = load)
35              })
36    
37    setGeneric("CSVSource", function(file) standardGeneric("CSVSource"))
38    setMethod("CSVSource",
39              signature(file = "character"),
40              function(file) {
41                  new("CSVSource", LoDSupport = FALSE, FileName = file,
42                      Content = scan(file, what = "character"), Position = 0)
43              })
44    
45    setGeneric("Reuters21578XMLSource", function(file) standardGeneric("Reuters21578XMLSource"))
46    setMethod("Reuters21578XMLSource",
47              signature(file = "character"),
48              function(file) {
49                  tree <- xmlTreeParse(file)
50                  content <- xmlRoot(tree)$children
51                  new("Reuters21578XMLSource", LoDSupport = FALSE, FileName = file,
52                      Content = content, Position = 0)
53              })
54    
55      doc <- new("PlainTextDocument", FileName = file, Cached = FALSE, Author = "Unknown",  setGeneric("stepNext", function(object) standardGeneric("stepNext"))
56                 DateTimeStamp = date(), Description = "", ID = id, Origin = origin, Heading = "")  setMethod("stepNext",
57              signature(object = "DirSource"),
58              function(object) {
59                  object@Position <- object@Position + 1
60                  object
61              })
62    setMethod("stepNext",
63              signature(object = "CSVSource"),
64              function(object) {
65                  object@Position <- object@Position + 1
66                  object
67              })
68    setMethod("stepNext",
69              signature(object = "Reuters21578XMLSource"),
70              function(object) {
71                  object@Position <- object@Position + 1
72                  object
73              })
74    
75      if (lod) {  setGeneric("getElem", function(object) standardGeneric("getElem"))
76          doc <- loadFileIntoMem(doc)  setMethod("getElem",
77              signature(object = "DirSource"),
78              function(object) {
79                  list(content = readLines(object@FileList[object@Position]),
80                       filename = object@FileList[object@Position])
81              })
82    setMethod("getElem",
83              signature(object = "CSVSource"),
84              function(object) {
85                  list(content = object@Content[object@Position],
86                       filename = object@FileName)
87              })
88    setMethod("getElem",
89              signature(object = "Reuters21578XMLSource"),
90              function(object) {
91                  list(content = object@Content[object@Position],
92                       filename = object@FileName)
93              })
94    
95    setGeneric("eoi", function(object) standardGeneric("eoi"))
96    setMethod("eoi",
97              signature(object = "DirSource"),
98              function(object) {
99                  if (length(object@FileList) <= object@Position)
100                      return(TRUE)
101                  else
102                      return(FALSE)
103              })
104    setMethod("eoi",
105              signature(object = "CSVSource"),
106              function(object) {
107                  if (length(object@Content) <= object@Position)
108                      return(TRUE)
109                  else
110                      return(FALSE)
111              })
112    setMethod("eoi",
113              signature(object = "Reuters21578XMLSource"),
114              function(object) {
115                  if (length(object@Content) <= object@Position)
116                      return(TRUE)
117                  else
118                      return(FALSE)
119              })
120    
121    plaintext_parser <- function(...) {
122        function(elem, lodsupport, load, id) {
123            if (!lodsupport || (lodsupport && load)) {
124                doc <- new("PlainTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE,
125                           Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")
126            }
127            else {
128                doc <- new("PlainTextDocument", FileName = elem$filename, Cached = FALSE,
129                           Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")
130      }      }
131    
132      return(doc)      return(doc)
133  }  }
134    }
135    class(plaintext_parser) <- "function_generator"
136    
137  reuters21578xml.parser <- function(file, lod) {  reuters21578xml_parser <- function(...) {
138      tree <- xmlTreeParse(file)      function(elem, lodsupport, load, id) {
139            tree <- xmlTreeParse(elem$filename)
140      node <- xmlRoot(tree)      node <- xmlRoot(tree)
141    
142      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
# Line 45  Line 157 
157    
158      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
159    
160      doc <- new("XMLTextDocument", FileName = file, Cached = FALSE, Author = author,          if (!lodsupport || (lodsupport && load)) {
161                doc <- new("XMLTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE, Author = author,
162                           DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
163                           Heading = heading, LocalMetaData = list(Topics = topics))
164            } else {
165                doc <- new("XMLTextDocument", FileName = elem$filename, Cached = FALSE, Author = author,
166                 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
167                 Heading = heading, LocalMetaData = list(Topics = topics))                 Heading = heading, LocalMetaData = list(Topics = topics))
   
     if (lod) {  
         doc <- loadFileIntoMem(doc)  
168      }      }
169    
170      return(doc)      return(doc)
171  }  }
172    }
173    class(reuters21578xml_parser) <- "function_generator"
174    
175  rcv1.parser <- function(file, lod) {  rcv1_parser <- function(...) {
176      tree <- xmlTreeParse(file)      function(elem, lodsupport, load, id) {
177            tree <- xmlTreeParse(elem$filename)
178      node <- xmlRoot(tree)      node <- xmlRoot(tree)
179    
180      datetimestamp <- xmlAttrs(node)[["date"]]      datetimestamp <- xmlAttrs(node)[["date"]]
181      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
182      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
183    
184      doc <- new("XMLTextDocument", FileName = file, Cached = FALSE, Author = "",          if (!lodsupport || (lodsupport && load)) {
185                doc <- new("XMLTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE, Author = "",
186                           DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
187                           Heading = heading)
188            } else {
189                doc <- new("XMLTextDocument", FileName = elem$filename, Cached = FALSE, Author = "",
190                 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
191                 Heading = heading)                 Heading = heading)
   
     if (lod) {  
         doc <- loadFileIntoMem(doc)  
192      }      }
193    
194      return(doc)      return(doc)
195  }  }
196    }
197    class(rcv1_parser) <- "function_generator"
198    
199  uci.kdd.newsgroup.parser <-  function(file, lod) {  uci_kdd_newsgroup_parser <- function(...) {
200      mail <- readLines(file)      function(elem, lodsupport, load, id) {
201            mail <- readLines(elem$filename)
202      author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))      author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
203      datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))      datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))
204      origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))      origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
205      heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))      heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
206      newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))      newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
207    
208      new("NewsgroupDocument", FileName = file, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,          if (!lodsupport || (lodsupport && load)) {
209          Description = "", ID = file, Origin = origin, Heading = heading, Newsgroup = newsgroup)              index <- grep("^Lines:", mail)
210                content <- mail[(index + 1):length(mail)]
211    
212      if (lod) {              doc <- new("NewsgroupDocument", .Data = content, FileName = elem$filename, Cached = TRUE,
213          doc <- loadFileIntoMem(doc)                         Author = author, DateTimeStamp = datetimestamp,
214                           Description = "", ID = elem$filename, Origin = origin,
215                           Heading = heading, Newsgroup = newsgroup)
216            } else {
217                doc <- new("NewsgroupDocument", FileName = elem$filename, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
218                           Description = "", ID = elem$filename, Origin = origin, Heading = heading, Newsgroup = newsgroup)
219      }      }
220    
221      return(doc)      return(doc)
222  }  }
223    }
224    class(uci_kdd_newsgroup_parser) <- "function_generator"
225    
226  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
227  rcv1.to.plain <- function(node) {  rcv1_to_plain <- function(node) {
228      datetimestamp <- xmlAttrs(node)[["date"]]      datetimestamp <- xmlAttrs(node)[["date"]]
229      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
230      origin <- "Reuters Corpus Volume 1 XML"      origin <- "Reuters Corpus Volume 1 XML"
# Line 106  Line 236 
236  }  }
237    
238  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
239  reuters21578xml.to.plain <- function(node) {  reuters21578xml_to_plain <- function(node) {
240      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
241      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
242          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 243  Line 373 
373                object[tm_index(object, ..., FUN)]                object[tm_index(object, ..., FUN)]
374            })            })
375    
376  setGeneric("tm_index", function(object, ..., FUN = s.filter) standardGeneric("tm_index"))  setGeneric("tm_index", function(object, ..., FUN = s_filter) standardGeneric("tm_index"))
377  setMethod("tm_index",  setMethod("tm_index",
378            signature(object = "TextDocCol"),            signature(object = "TextDocCol"),
379            function(object, ..., FUN = s.filter) {            function(object, ..., FUN = s.filter) {
380                sapply(object, FUN, ..., GlobalMetaData = GlobalMetaData(object))                sapply(object, FUN, ..., GlobalMetaData = GlobalMetaData(object))
381            })            })
382    
383  s.filter <- function(object, s, ..., GlobalMetaData) {  s_filter <- function(object, s, ..., GlobalMetaData) {
384      b <- TRUE      b <- TRUE
385      for (tag in names(s)) {      for (tag in names(s)) {
386          if (tag %in% names(LocalMetaData(object))) {          if (tag %in% names(LocalMetaData(object))) {
# Line 264  Line 394 
394      return(b)      return(b)
395  }  }
396    
397  setGeneric("fulltext.search.filter", function(object, pattern, ...) standardGeneric("fulltext.search.filter"))  setGeneric("fulltext_search_filter", function(object, pattern, ...) standardGeneric("fulltext_search_filter"))
398  setMethod("fulltext.search.filter",  setMethod("fulltext_search_filter",
399            signature(object = "PlainTextDocument", pattern = "character"),            signature(object = "PlainTextDocument", pattern = "character"),
400            function(object, pattern, ...) {            function(object, pattern, ...) {
401                if (!Cached(object))                if (!Cached(object))
# Line 274  Line 404 
404                return(any(grep(pattern, Corpus(object))))                return(any(grep(pattern, Corpus(object))))
405            })            })
406    
 setGeneric("reuters21578.topic.filter", function(object, topics, ...) standardGeneric("reuters21578.topic.filter"))  
 setMethod("reuters21578.topic.filter",  
           signature(object = "PlainTextDocument", topics = "character"),  
           function(object, topics, ...) {  
               if (!Cached(object))  
                   object <- loadFileIntoMem(object)  
   
               if (any(LocalMetaData(object)$Topics %in% topics))  
                   return(TRUE)  
               else  
                   return(FALSE)  
           })  
   
 setGeneric("id.filter", function(object, IDs, ...) standardGeneric("id.filter"))  
 setMethod("id.filter",  
           signature(object = "TextDocument", IDs = "numeric"),  
           function(object, IDs, ...) {  
               if (ID(object) %in% IDs)  
                   return(TRUE)  
               else  
                   return(FALSE)  
           })  
   
407  setGeneric("attachData", function(object, data) standardGeneric("attachData"))  setGeneric("attachData", function(object, data) standardGeneric("attachData"))
408  setMethod("attachData",  setMethod("attachData",
409            signature(object = "TextDocCol", data = "TextDocument"),            signature(object = "TextDocCol", data = "TextDocument"),
# Line 342  Line 449 
449                return(object)                return(object)
450            })            })
451    
452    setMethod("[<-",
453              signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"),
454              function(x, i, j, ... , value) {
455                  object <- x
456                  object@.Data[i, ...] <- value
457                  return(object)
458              })
459    
460    setMethod("[[",
461              signature(x = "TextDocCol", i = "ANY", j = "ANY"),
462              function(x, i, j, ...) {
463                  return(x@.Data[[i, ...]])
464              })
465    
466    setMethod("[[<-",
467              signature(x = "TextDocCol", i = "ANY", j = "ANY", value = "ANY"),
468              function(x, i, j, ..., value) {
469                  object <- x
470                  object@.Data[[i, ...]] <- value
471                  return(object)
472              })
473    
474  setMethod("c",  setMethod("c",
475            signature(x = "TextDocCol"),            signature(x = "TextDocCol"),
476            function(x, ..., recursive = TRUE){            function(x, ..., recursive = TRUE){

Legend:
Removed from v.62  
changed lines
  Added in v.63

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge