SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory | Revision Log | View Patch

revision 697, Fri Jan 5 23:09:12 2007 UTC | revision 698, Sat Jan 6 17:05:44 2007 UTC
# Line 2  Line 2 
2    
3  # Reader  # Reader
4    
5  read_plain <- function(...) {  readPlain <- function(...) {
6      function(elem, load, id) {      function(elem, load, id) {
7          doc <- if (load) {          doc <- if (load) {
8              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
# Line 16  Line 16 
16          return(doc)          return(doc)
17      }      }
18  }  }
19  class(read_plain) <- "function_generator"  class(readPlain) <- "FunctionGenerator"
20    
21  read_reut21578xml <- function(...) {  readReut21578XML <- function(...) {
22      function(elem, load, id) {      function(elem, load, id) {
23          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
24          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 57  Line 57 
57          return(doc)          return(doc)
58      }      }
59  }  }
60  class(read_reut21578xml) <- "function_generator"  class(readReut21578XML) <- "FunctionGenerator"
61    
62  read_rcv1 <- function(...) {  readRCV1 <- function(...) {
63      function(elem, load, id) {      function(elem, load, id) {
64          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
65          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 85  Line 85 
85          return(doc)          return(doc)
86      }      }
87  }  }
88  class(read_rcv1) <- "function_generator"  class(readRCV1) <- "FunctionGenerator"
89    
90  read_newsgroup <- function(...) {  readNewsgroup <- function(...) {
91      function(elem, load, id) {      function(elem, load, id) {
92          mail <- elem$content          mail <- elem$content
93          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
# Line 117  Line 117 
117          return(doc)          return(doc)
118      }      }
119  }  }
120  class(read_newsgroup) <- "function_generator"  class(readNewsgroup) <- "FunctionGenerator"
121    
122  read_gmane <- function(...) {  readGmane <- function(...) {
123      function(elem, load, id) {      function(elem, load, id) {
124          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
125          # Remove namespaces          # Remove namespaces
# Line 150  Line 150 
150          return(doc)          return(doc)
151      }      }
152  }  }
153  class(read_gmane) <- "function_generator"  class(readGmane) <- "FunctionGenerator"
154    
155  # Converter  # Converter
156    
157  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
158  convert_rcv1_plain <- function(node, ...) {  convertRCV1Plain <- function(node, ...) {
159      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
160      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
161      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
# Line 166  Line 166 
166  }  }
167    
168  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
169  convert_reut21578xml_plain <- function(node, ...) {  convertReut21578XMLPlain <- function(node, ...) {
170      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
171      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
172          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])

Legend:
Removed from v.697
Changed lines
Added in v.698

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge