SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/textmin/R/reader.R revision 689, Fri Dec 8 14:21:46 2006 UTC trunk/tm/R/reader.R revision 694, Sun Dec 31 14:47:46 2006 UTC
# Line 2  Line 2 
2    
3  # Reader  # Reader
4    
5  plaintext_parser <- function(...) {  read_plain <- function(...) {
6      function(elem, lodsupport, load, id) {      function(elem, load, id) {
7          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
8              doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
9                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
10          }          }
11          else {          else {
12              doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,              new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
13                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
14          }          }
15    
16          return(doc)          return(doc)
17      }      }
18  }  }
19  class(plaintext_parser) <- "function_generator"  class(read_plain) <- "function_generator"
20    
21  reut21578xml_parser <- function(...) {  read_reut21578xml <- function(...) {
22      function(elem, lodsupport, load, id) {      function(elem, load, id) {
23          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
24          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
25          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 28  Line 28 
28          class(tree) <- "list"          class(tree) <- "list"
29    
30          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
31          if (!is.null(node[["TEXT"]][["AUTHOR"]]))          author <- if (!is.null(node[["TEXT"]][["AUTHOR"]]))
32              author <- xmlValue(node[["TEXT"]][["AUTHOR"]])              xmlValue(node[["TEXT"]][["AUTHOR"]])
33          else          else
34              author <- ""              ""
35    
36          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
37          description <- ""          description <- ""
38          id <- xmlAttrs(node)[["NEWID"]]          id <- xmlAttrs(node)[["NEWID"]]
39    
40          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
41          if (!is.null(node[["TEXT"]][["TITLE"]]))          heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
42              heading <- xmlValue(node[["TEXT"]][["TITLE"]])              xmlValue(node[["TEXT"]][["TITLE"]])
43          else          else
44              heading <- ""              ""
45    
46          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
47    
48          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
49              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
50                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
51                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
52          } else {          } else {
53              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
54                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
55                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
56          }          }
# Line 58  Line 58 
58          return(doc)          return(doc)
59      }      }
60  }  }
61  class(reut21578xml_parser) <- "function_generator"  class(read_reut21578xml) <- "function_generator"
62    
63  rcv1_parser <- function(...) {  read_rcv1 <- function(...) {
64      function(elem, lodsupport, load, id) {      function(elem, load, id) {
65          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
66          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
67          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 73  Line 73 
73          id <- xmlAttrs(node)[["itemid"]]          id <- xmlAttrs(node)[["itemid"]]
74          heading <- xmlValue(node[["title"]])          heading <- xmlValue(node[["title"]])
75    
76          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
77              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
78                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
79                         Heading = heading)                         Heading = heading)
80          } else {          } else {
81              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
82                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
83                         Heading = heading)                         Heading = heading)
84          }          }
# Line 86  Line 86 
86          return(doc)          return(doc)
87      }      }
88  }  }
89  class(rcv1_parser) <- "function_generator"  class(read_rcv1) <- "function_generator"
90    
91  newsgroup_parser <- function(...) {  read_newsgroup <- function(...) {
92      function(elem, lodsupport, load, id) {      function(elem, load, id) {
93          mail <- elem$content          mail <- elem$content
94          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
95          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))
# Line 97  Line 97 
97          heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))          heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
98          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
99    
100          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
101              # The header is separated from the body by a blank line.              # The header is separated from the body by a blank line.
102              # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}              # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
103              for (index in seq(along = mail)) {              for (index in seq(along = mail)) {
# Line 106  Line 106 
106              }              }
107              content <- mail[(index + 1):length(mail)]              content <- mail[(index + 1):length(mail)]
108    
109              doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
110                         Author = author, DateTimeStamp = datetimestamp,                         Author = author, DateTimeStamp = datetimestamp,
111                         Description = "", ID = id, Origin = origin,                         Description = "", ID = id, Origin = origin,
112                         Heading = heading, Newsgroup = newsgroup)                         Heading = heading, Newsgroup = newsgroup)
113          } else {          } else {
114              doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
115                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
116          }          }
117    
118          return(doc)          return(doc)
119      }      }
120  }  }
121  class(newsgroup_parser) <- "function_generator"  class(read_newsgroup) <- "function_generator"
122    
123  gmane_r_reader <- function(...) {  read_gmane <- function(...) {
124      function(elem, lodsupport, load, id) {      function(elem, load, id) {
125          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
126          # Remove namespaces          # Remove namespaces
127          corpus <- gsub("dc:date", "date", corpus)          corpus <- gsub("dc:date", "date", corpus)
# Line 134  Line 134 
134          heading <- xmlValue(node[["title"]])          heading <- xmlValue(node[["title"]])
135          id <- xmlValue(node[["link"]])          id <- xmlValue(node[["link"]])
136          newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))          newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))
137          origin <- "Gmane R Mailing Lists Archive"          origin <- "Gmane Mailing List Archive"
138    
139          if (!lodsupport || (lodsupport && load)) {          doc <- if (load) {
140              content <- xmlValue(node[["description"]])              content <- xmlValue(node[["description"]])
141    
142              doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
143                         Author = author, DateTimeStamp = datetimestamp,                         Author = author, DateTimeStamp = datetimestamp,
144                         Description = "", ID = id, Origin = origin,                         Description = "", ID = id, Origin = origin,
145                         Heading = heading, Newsgroup = newsgroup)                         Heading = heading, Newsgroup = newsgroup)
146          } else {          } else {
147              doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
148                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
149          }          }
150    
151          return(doc)          return(doc)
152      }      }
153  }  }
154  class(gmane_r_reader) <- "function_generator"  class(read_gmane) <- "function_generator"
   
155    
156  # Parser  # Converter
157    
158  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
159  rcv1_to_plain <- function(node, ...) {  convert_rcv1_plain <- function(node, ...) {
160      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])      datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
161      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
162      origin <- "Reuters Corpus Volume 1 XML"      origin <- "Reuters Corpus Volume 1 XML"
# Line 169  Line 168 
168  }  }
169    
170  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
171  reut21578xml_to_plain <- function(node, ...) {  convert_reut21578xml_plain <- function(node, ...) {
172      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
173      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
174          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 183  Line 182 
182      origin <- "Reuters-21578 XML"      origin <- "Reuters-21578 XML"
183    
184      # The <BODY></BODY> tag is unfortunately NOT obligatory!      # The <BODY></BODY> tag is unfortunately NOT obligatory!
185      if (!is.null(node[["TEXT"]][["BODY"]]))      corpus <- if (!is.null(node[["TEXT"]][["BODY"]]))
186          corpus <- xmlValue(node[["TEXT"]][["BODY"]])          xmlValue(node[["TEXT"]][["BODY"]])
187      else      else
188          corpus <- ""          ""
189    
190      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
191      if (!is.null(node[["TEXT"]][["TITLE"]]))      heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
192          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          xmlValue(node[["TEXT"]][["TITLE"]])
193      else      else
194          heading <- ""          ""
195    
196      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
197    

Legend:
Removed from v.689  
changed lines
  Added in v.694

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge