SCM

SCM Repository

[tm] Diff of /pkg/R/reader.R
ViewVC logotype

Diff of /pkg/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 885, Thu Jan 29 09:34:44 2009 UTC revision 886, Thu Jan 29 22:47:34 2009 UTC
# Line 21  Line 21 
21    
22  readReut21578XML <- FunctionGenerator(function(...) {  readReut21578XML <- FunctionGenerator(function(...) {
23      function(elem, load, language, id) {      function(elem, load, language, id) {
24            require("XML")
25    
26          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
27          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- XML::xmlTreeParse(corpus, asText = TRUE)
28          node <- xmlRoot(tree)          node <- XML::xmlRoot(tree)
29    
30          # Mask as list to bypass S4 checks          # Mask as list to bypass S4 checks
31          class(tree) <- "list"          class(tree) <- "list"
32    
33          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
34          author <- if (!is.null(node[["TEXT"]][["AUTHOR"]]))          author <- if (!is.null(node[["TEXT"]][["AUTHOR"]]))
35              xmlValue(node[["TEXT"]][["AUTHOR"]])              XML::xmlValue(node[["TEXT"]][["AUTHOR"]])
36          else          else
37              ""              ""
38    
39          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
40          id <- xmlAttrs(node)[["NEWID"]]          id <- XML::xmlAttrs(node)[["NEWID"]]
41    
42          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!          # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
43          heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))          heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
44              xmlValue(node[["TEXT"]][["TITLE"]])              XML::xmlValue(node[["TEXT"]][["TITLE"]])
45          else          else
46              ""              ""
47    
48          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)          topics <- unlist(XML::xmlApply(node[["TOPICS"]], function(x) XML::xmlValue(x)), use.names = FALSE)
49    
50          doc <- if (load) {          doc <- if (load) {
51              new("Reuters21578Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,              new("Reuters21578Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
# Line 61  Line 63 
63    
64  readRCV1 <- FunctionGenerator(function(...) {  readRCV1 <- FunctionGenerator(function(...) {
65      function(elem, load, language, id) {      function(elem, load, language, id) {
66            require("XML")
67    
68          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
69          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- XML::xmlTreeParse(corpus, asText = TRUE)
70          node <- xmlRoot(tree)          node <- XML::xmlRoot(tree)
71    
72          # Mask as list to bypass S4 checks          # Mask as list to bypass S4 checks
73          class(tree) <- "list"          class(tree) <- "list"
74    
75          datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])          datetimestamp <- as.POSIXct(XML::xmlAttrs(node)[["date"]])
76          id <- xmlAttrs(node)[["itemid"]]          id <- XML::xmlAttrs(node)[["itemid"]]
77          heading <- xmlValue(node[["title"]])          heading <- XML::xmlValue(node[["title"]])
78    
79          doc <- if (load) {          doc <- if (load) {
80              new("RCV1Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",              new("RCV1Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
# Line 86  Line 90 
90          if (!is.null(node[["metadata"]])) {          if (!is.null(node[["metadata"]])) {
91              # Get simple Dublin Core meta data              # Get simple Dublin Core meta data
92              dc <- node[["metadata"]][names(node[["metadata"]]) == "dc"]              dc <- node[["metadata"]][names(node[["metadata"]]) == "dc"]
93              dc <- lapply(dc, xmlAttrs)              dc <- lapply(dc, XML::xmlAttrs)
94              elements <- sapply(dc, "[[", "element")              elements <- sapply(dc, "[[", "element")
95              values <- sapply(dc, "[[", "value")              values <- sapply(dc, "[[", "value")
96              if ("dc.publisher" %in% elements)              if ("dc.publisher" %in% elements)
# Line 94  Line 98 
98    
99              # Get topic codes              # Get topic codes
100              codes <- node[["metadata"]][names(node[["metadata"]]) == "codes"]              codes <- node[["metadata"]][names(node[["metadata"]]) == "codes"]
101              topics <- codes[sapply(codes, xmlAttrs) == "bip:topics:1.0"]              topics <- codes[sapply(codes, XML::xmlAttrs) == "bip:topics:1.0"]
102              if (length(topics) > 0)              if (length(topics) > 0)
103                  meta(doc, "Topics") <- unlist(xmlApply(topics[[1]], xmlAttrs), use.names = FALSE)                  meta(doc, "Topics") <- unlist(XML::xmlApply(topics[[1]], XML::xmlAttrs), use.names = FALSE)
104          }          }
105    
106          return(doc)          return(doc)
# Line 137  Line 141 
141    
142  readGmane <- FunctionGenerator(function(...) {  readGmane <- FunctionGenerator(function(...) {
143      function(elem, load, language, id) {      function(elem, load, language, id) {
144            require("XML")
145    
146          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
147          # Remove namespaces          # Remove namespaces
148          corpus <- gsub("dc:date", "date", corpus)          corpus <- gsub("dc:date", "date", corpus)
149          corpus <- gsub("dc:creator", "creator", corpus)          corpus <- gsub("dc:creator", "creator", corpus)
150          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- XML::xmlTreeParse(corpus, asText = TRUE)
151          node <- xmlRoot(tree)          node <- XML::xmlRoot(tree)
152    
153          author <- xmlValue(node[["creator"]])          author <- XML::xmlValue(node[["creator"]])
154          datetimestamp <- as.POSIXct(strptime(xmlValue(node[["date"]]), format = "%Y-%m-%dT%H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["date"]]), format = "%Y-%m-%dT%H:%M:%S"))
155          heading <- xmlValue(node[["title"]])          heading <- XML::xmlValue(node[["title"]])
156          id <- xmlValue(node[["link"]])          id <- XML::xmlValue(node[["link"]])
157          newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))          newsgroup <- gsub("[0-9]+", "", XML::xmlValue(node[["link"]]))
158          origin <- "Gmane Mailing List Archive"          origin <- "Gmane Mailing List Archive"
159    
160          doc <- if (load) {          doc <- if (load) {
161              content <- xmlValue(node[["description"]])              content <- XML::xmlValue(node[["description"]])
162    
163              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
164                  Author = author, DateTimeStamp = datetimestamp,                  Author = author, DateTimeStamp = datetimestamp,
# Line 205  Line 211 
211    
212  readHTML <- FunctionGenerator(function(...) {  readHTML <- FunctionGenerator(function(...) {
213      function(elem, load, language, id) {      function(elem, load, language, id) {
214          tree <- xmlTreeParse(elem$content, asText = TRUE)          require("XML")
215          root <- xmlRoot(tree)  
216            tree <- XML::xmlTreeParse(elem$content, asText = TRUE)
217            root <- XML::xmlRoot(tree)
218    
219          head <- root[["head"]]          head <- root[["head"]]
220          heading <- xmlValue(head[["title"]])          heading <- XML::xmlValue(head[["title"]])
221    
222          meta <- lapply(xmlChildren(head)[names(xmlChildren(head)) == "meta"], xmlAttrs)          meta <- lapply(XML::xmlChildren(head)[names(XML::xmlChildren(head)) == "meta"], XML::xmlAttrs)
223          metaNames <- sapply(meta, "[[", "name")          metaNames <- sapply(meta, "[[", "name")
224          metaContents <- sapply(meta, "[[", "content")          metaContents <- sapply(meta, "[[", "content")
225    
# Line 227  Line 235 
235    
236          content <- list("Prologue" = NULL)          content <- list("Prologue" = NULL)
237          i <- 1          i <- 1
238          for (child in xmlChildren(root[["body"]])) {          for (child in XML::xmlChildren(root[["body"]])) {
239              if (tolower(xmlName(child)) == "h1") {              if (tolower(XML::xmlName(child)) == "h1") {
240                  content <- c(content, structure(list(NULL), names = xmlValue(child)))                  content <- c(content, structure(list(NULL), names = XML::xmlValue(child)))
241                  i <- i + 1                  i <- i + 1
242              }              }
243              else {              else {
244                  # We remove remaining HTML tags                  # We remove remaining HTML tags
245                  content[[i]] <- c(content[[i]], toString(xmlApply(child, xmlValue)))                  content[[i]] <- c(content[[i]], toString(XML::xmlApply(child, XML::xmlValue)))
246              }              }
247          }          }
248    
# Line 247  Line 255 
255  # Converter  # Converter
256    
257  convertRCV1Plain <- function(node, ...) {  convertRCV1Plain <- function(node, ...) {
258        require("XML")
259    
260      content <- Content(node)      content <- Content(node)
261      # As XMLDocument is no native S4 class, restore valid information      # As XMLDocument is no native S4 class, restore valid information
262      class(content) <- "XMLDocument"      class(content) <- "XMLDocument"
263      names(content) <- c("doc", "dtd")      names(content) <- c("doc", "dtd")
264      content <- unlist(xmlApply(xmlRoot(content)[["text"]], xmlValue), use.names = FALSE)      content <- unlist(XML::xmlApply(XML::xmlRoot(content)[["text"]], XML::xmlValue), use.names = FALSE)
265    
266      new("PlainTextDocument", .Data = content, Cached = TRUE, URI = NULL,      new("PlainTextDocument", .Data = content, Cached = TRUE, URI = NULL,
267          Author = Author(node), DateTimeStamp = DateTimeStamp(node),          Author = Author(node), DateTimeStamp = DateTimeStamp(node),
# Line 262  Line 272 
272    
273  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
274  convertReut21578XMLPlain <- function(node, ...) {  convertReut21578XMLPlain <- function(node, ...) {
275        require("XML")
276    
277      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
278      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
279          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- XML::xmlValue(node[["TEXT"]][["AUTHOR"]])
280      else      else
281          author <- ""          author <- ""
282    
283      datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))      datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
284      description <- ""      description <- ""
285      id <- xmlAttrs(node)[["NEWID"]]      id <- XML::xmlAttrs(node)[["NEWID"]]
286    
287      # The <BODY></BODY> tag is unfortunately NOT obligatory!      # The <BODY></BODY> tag is unfortunately NOT obligatory!
288      corpus <- if (!is.null(node[["TEXT"]][["BODY"]]))      corpus <- if (!is.null(node[["TEXT"]][["BODY"]]))
289          xmlValue(node[["TEXT"]][["BODY"]])          XML::xmlValue(node[["TEXT"]][["BODY"]])
290      else      else
291          ""          ""
292    
293      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
294      heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))      heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
295          xmlValue(node[["TEXT"]][["TITLE"]])          XML::xmlValue(node[["TEXT"]][["TITLE"]])
296      else      else
297          ""          ""
298    
299      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(XML::xmlApply(node[["TOPICS"]], function(x) XML::xmlValue(x)), use.names = FALSE)
300    
301      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = NULL, Author = author, DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = NULL, Author = author, DateTimeStamp = datetimestamp,
302          Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, Language = "en_US",          Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, Language = "en_US",

Legend:
Removed from v.885  
changed lines
  Added in v.886

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge