SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory | Revision Log | View Patch

trunk/textmin/R/reader.R revision 689, Fri Dec 8 14:21:46 2006 UTC | trunk/tm/R/reader.R revision 693, Fri Dec 22 13:21:30 2006 UTC
# Line 2  Line 2 
2    
3  # Reader  # Reader
4    
5  plaintext_parser <- function(...) {  read_plain <- function(...) {
6      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
7          if (!lodsupport || (lodsupport && load)) {          if (!lodsupport || (lodsupport && load)) {
8              doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
# Line 16  Line 16 
16          return(doc)          return(doc)
17      }      }
18  }  }
19  class(plaintext_parser) <- "function_generator"  class(read_plain) <- "function_generator"
20    
21  reut21578xml_parser <- function(...) {  read_reut21578xml <- function(...) {
22      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
23          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
24          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 58  Line 58 
58          return(doc)          return(doc)
59      }      }
60  }  }
61  class(reut21578xml_parser) <- "function_generator"  class(read_reut21578xml) <- "function_generator"
62    
63  rcv1_parser <- function(...) {  read_rcv1 <- function(...) {
64      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
65          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
66          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 86  Line 86 
86          return(doc)          return(doc)
87      }      }
88  }  }
89  class(rcv1_parser) <- "function_generator"  class(read_rcv1) <- "function_generator"
90    
91  newsgroup_parser <- function(...) {  read_newsgroup <- function(...) {
92      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
93          mail <- elem$content          mail <- elem$content
94          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
# Line 118  Line 118 
118          return(doc)          return(doc)
119      }      }
120  }  }
121  class(newsgroup_parser) <- "function_generator"  class(read_newsgroup) <- "function_generator"
122    
123  gmane_r_reader <- function(...) {  read_gmane_r <- function(...) {
124      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
125          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
126          # Remove namespaces          # Remove namespaces
# Line 151  Line 151 
151          return(doc)          return(doc)
152      }      }
153  }  }
154  class(gmane_r_reader) <- "function_generator"  class(read_gmane_r) <- "function_generator"
155    
156    # Converter
 # Parser  
157    
158  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
159  rcv1_to_plain <- function(node, ...) {  convert_rcv1_plain <- function(node, ...) {
160      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
161      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
162      origin <- "Reuters Corpus Volume 1 XML"      origin <- "Reuters Corpus Volume 1 XML"
# Line 169  Line 168 
168  }  }
169    
170  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
171  reut21578xml_to_plain <- function(node, ...) {  convert_reut21578xml_plain <- function(node, ...) {
172      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
173      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
174          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])

Legend:
Removed from v.689  
changed lines
  Added in v.693

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business | Powered By FusionForge