SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/textmin/R/reader.R revision 689, Fri Dec 8 14:21:46 2006 UTC trunk/tm/R/reader.R revision 697, Fri Jan 5 23:09:12 2007 UTC
# Line 2  Line 2 
2    
3  # Reader  # Reader
4    
5  plaintext_parser <- function(...) {  read_plain <- function(...) {
6      function(elem, lodsupport, load, id) {      function(elem, load, id) {
7          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
8              doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
9                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
10          }          }
11          else {          else {
12              doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,              new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
13                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
14          }          }
15    
16          return(doc)          return(doc)
17      }      }
18  }  }
19  class(plaintext_parser) <- "function_generator"  class(read_plain) <- "function_generator"
20    
21  reut21578xml_parser <- function(...) {  read_reut21578xml <- function(...) {
22      function(elem, lodsupport, load, id) {      function(elem, load, id) {
23          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
24          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
25          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 28  Line 28 
28          class(tree) <- "list"          class(tree) <- "list"
29    
30          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
31          if (!is.null(node[["TEXT"]][["AUTHOR"]]))          author <- if (!is.null(node[["TEXT"]][["AUTHOR"]]))
32              author <- xmlValue(node[["TEXT"]][["AUTHOR"]])              xmlValue(node[["TEXT"]][["AUTHOR"]])
33          else          else
34              author <- ""              ""
35    
36          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
         description <- ""  
37          id <- xmlAttrs(node)[["NEWID"]]          id <- xmlAttrs(node)[["NEWID"]]
38    
39          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
40          if (!is.null(node[["TEXT"]][["TITLE"]]))          heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
41              heading <- xmlValue(node[["TEXT"]][["TITLE"]])              xmlValue(node[["TEXT"]][["TITLE"]])
42          else          else
43              heading <- ""              ""
44    
45          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
46    
47          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
48              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
49                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
50                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
51          } else {          } else {
52              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
53                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
54                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
55          }          }
# Line 58  Line 57 
57          return(doc)          return(doc)
58      }      }
59  }  }
60  class(reut21578xml_parser) <- "function_generator"  class(read_reut21578xml) <- "function_generator"
61    
62  rcv1_parser <- function(...) {  read_rcv1 <- function(...) {
63      function(elem, lodsupport, load, id) {      function(elem, load, id) {
64          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
65          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
66          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 73  Line 72 
72          id <- xmlAttrs(node)[["itemid"]]          id <- xmlAttrs(node)[["itemid"]]
73          heading <- xmlValue(node[["title"]])          heading <- xmlValue(node[["title"]])
74    
75          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
76              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
77                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
78                         Heading = heading)                         Heading = heading)
79          } else {          } else {
80              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
81                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
82                         Heading = heading)                         Heading = heading)
83          }          }
# Line 86  Line 85 
85          return(doc)          return(doc)
86      }      }
87  }  }
88  class(rcv1_parser) <- "function_generator"  class(read_rcv1) <- "function_generator"
89    
90  newsgroup_parser <- function(...) {  read_newsgroup <- function(...) {
91      function(elem, lodsupport, load, id) {      function(elem, load, id) {
92          mail <- elem$content          mail <- elem$content
93          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
94          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))
# Line 97  Line 96 
96          heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))          heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
97          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
98    
99          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
100              # The header is separated from the body by a blank line.              # The header is separated from the body by a blank line.
101              # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}              # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
102              for (index in seq(along = mail)) {              for (index in seq(along = mail)) {
# Line 106  Line 105 
105              }              }
106              content <- mail[(index + 1):length(mail)]              content <- mail[(index + 1):length(mail)]
107    
108              doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
109                         Author = author, DateTimeStamp = datetimestamp,                         Author = author, DateTimeStamp = datetimestamp,
110                         Description = "", ID = id, Origin = origin,                         Description = "", ID = id, Origin = origin,
111                         Heading = heading, Newsgroup = newsgroup)                         Heading = heading, Newsgroup = newsgroup)
112          } else {          } else {
113              doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
114                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
115          }          }
116    
117          return(doc)          return(doc)
118      }      }
119  }  }
120  class(newsgroup_parser) <- "function_generator"  class(read_newsgroup) <- "function_generator"
121    
122  gmane_r_reader <- function(...) {  read_gmane <- function(...) {
123      function(elem, lodsupport, load, id) {      function(elem, load, id) {
124          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
125          # Remove namespaces          # Remove namespaces
126          corpus <- gsub("dc:date", "date", corpus)          corpus <- gsub("dc:date", "date", corpus)
# Line 134  Line 133 
133          heading <- xmlValue(node[["title"]])          heading <- xmlValue(node[["title"]])
134          id <- xmlValue(node[["link"]])          id <- xmlValue(node[["link"]])
135          newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))          newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))
136          origin <- "Gmane R Mailing Lists Archive"          origin <- "Gmane Mailing List Archive"
137    
138          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
139              content <- xmlValue(node[["description"]])              content <- xmlValue(node[["description"]])
140    
141              doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
142                         Author = author, DateTimeStamp = datetimestamp,                         Author = author, DateTimeStamp = datetimestamp,
143                         Description = "", ID = id, Origin = origin,                         Description = "", ID = id, Origin = origin,
144                         Heading = heading, Newsgroup = newsgroup)                         Heading = heading, Newsgroup = newsgroup)
145          } else {          } else {
146              doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
147                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
148          }          }
149    
150          return(doc)          return(doc)
151      }      }
152  }  }
153  class(gmane_r_reader) <- "function_generator"  class(read_gmane) <- "function_generator"
   
154    
155  # Parser  # Converter
156    
157  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
158  rcv1_to_plain <- function(node, ...) {  convert_rcv1_plain <- function(node, ...) {
159      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
160      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
     origin <- "Reuters Corpus Volume 1 XML"  
161      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
162      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
163    
# Line 169  Line 166 
166  }  }
167    
168  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
169  reut21578xml_to_plain <- function(node, ...) {  convert_reut21578xml_plain <- function(node, ...) {
170      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
171      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
172          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 180  Line 177 
177      description <- ""      description <- ""
178      id <- xmlAttrs(node)[["NEWID"]]      id <- xmlAttrs(node)[["NEWID"]]
179    
     origin <- "Reuters-21578 XML"  
   
180      # The <BODY></BODY> tag is unfortunately NOT obligatory!      # The <BODY></BODY> tag is unfortunately NOT obligatory!
181      if (!is.null(node[["TEXT"]][["BODY"]]))      corpus <- if (!is.null(node[["TEXT"]][["BODY"]]))
182          corpus <- xmlValue(node[["TEXT"]][["BODY"]])          xmlValue(node[["TEXT"]][["BODY"]])
183      else      else
184          corpus <- ""          ""
185    
186      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
187      if (!is.null(node[["TEXT"]][["TITLE"]]))      heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
188          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          xmlValue(node[["TEXT"]][["TITLE"]])
189      else      else
190          heading <- ""          ""
191    
192      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
193    
194      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
195          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))          Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, LocalMetaData = list(Topics = topics))
196  }  }

Legend:
Removed from v.689  
changed lines
  Added in v.697

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge