SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory | Revision Log | View Patch

revision 698, Sat Jan 6 17:05:44 2007 UTC | revision 717, Fri Mar 16 11:13:04 2007 UTC
# Line 3  Line 3 
3  # Reader  # Reader
4    
5  readPlain <- function(...) {  readPlain <- function(...) {
6      function(elem, load, id) {      function(elem, load, language, id) {
7          doc <- if (load) {          doc <- if (load) {
8              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
9                  Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                  Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "", Language = language)
10          }          }
11          else {          else {
12              new("PlainTextDocument", URI = elem$uri, Cached = FALSE,              new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
13                  Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")                  Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "", Language = language)
14          }          }
15    
16          return(doc)          return(doc)
# Line 19  Line 19 
19  class(readPlain) <- "FunctionGenerator"  class(readPlain) <- "FunctionGenerator"
20    
21  readReut21578XML <- function(...) {  readReut21578XML <- function(...) {
22      function(elem, load, id) {      function(elem, load, language, id) {
23          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
24          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
25          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 47  Line 47 
47          doc <- if (load) {          doc <- if (load) {
48              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
49                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
50                  Heading = heading, LocalMetaData = list(Topics = topics))                  Heading = heading, Language = language, LocalMetaData = list(Topics = topics))
51          } else {          } else {
52              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
53                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
54                  Heading = heading, LocalMetaData = list(Topics = topics))                  Heading = heading, Language = language, LocalMetaData = list(Topics = topics))
55          }          }
56    
57          return(doc)          return(doc)
# Line 60  Line 60 
60  class(readReut21578XML) <- "FunctionGenerator"  class(readReut21578XML) <- "FunctionGenerator"
61    
62  readRCV1 <- function(...) {  readRCV1 <- function(...) {
63      function(elem, load, id) {      function(elem, load, language, id) {
64          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
65          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
66          node <- xmlRoot(tree)          node <- xmlRoot(tree)
# Line 75  Line 75 
75          doc <- if (load) {          doc <- if (load) {
76              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",              new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
77                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
78                  Heading = heading)                  Heading = heading, Language = language)
79          } else {          } else {
80              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",              new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
81                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                  DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
82                  Heading = heading)                  Heading = heading, Language = language)
83          }          }
84    
85          return(doc)          return(doc)
# Line 88  Line 88 
88  class(readRCV1) <- "FunctionGenerator"  class(readRCV1) <- "FunctionGenerator"
89    
90  readNewsgroup <- function(...) {  readNewsgroup <- function(...) {
91      function(elem, load, id) {      function(elem, load, language, id) {
92          mail <- elem$content          mail <- elem$content
93          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
94          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))          datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))
# Line 108  Line 108 
108              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
109                  Author = author, DateTimeStamp = datetimestamp,                  Author = author, DateTimeStamp = datetimestamp,
110                  Description = "", ID = id, Origin = origin,                  Description = "", ID = id, Origin = origin,
111                  Heading = heading, Newsgroup = newsgroup)                  Heading = heading, Language = language, Newsgroup = newsgroup)
112          } else {          } else {
113              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
114                  Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                  Description = "", ID = id, Origin = origin, Heading = heading, Language = language, Newsgroup = newsgroup)
115          }          }
116    
117          return(doc)          return(doc)
# Line 120  Line 120 
120  class(readNewsgroup) <- "FunctionGenerator"  class(readNewsgroup) <- "FunctionGenerator"
121    
122  readGmane <- function(...) {  readGmane <- function(...) {
123      function(elem, load, id) {      function(elem, load, language, id) {
124          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
125          # Remove namespaces          # Remove namespaces
126          corpus <- gsub("dc:date", "date", corpus)          corpus <- gsub("dc:date", "date", corpus)
# Line 141  Line 141 
141              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,              new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
142                  Author = author, DateTimeStamp = datetimestamp,                  Author = author, DateTimeStamp = datetimestamp,
143                  Description = "", ID = id, Origin = origin,                  Description = "", ID = id, Origin = origin,
144                  Heading = heading, Newsgroup = newsgroup)                  Heading = heading, Language = language, Newsgroup = newsgroup)
145          } else {          } else {
146              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
147                  Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)                  Description = "", ID = id, Origin = origin, Heading = heading, Language = language, Newsgroup = newsgroup)
148          }          }
149    
150          return(doc)          return(doc)
# Line 162  Line 162 
162      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
163    
164      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,
165          Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)          Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading, Language = "en_US")
166  }  }
167    
168  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
# Line 192  Line 192 
192      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
193    
194      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
195          Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, LocalMetaData = list(Topics = topics))          Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, Language = "en_US",
196            LocalMetaData = list(Topics = topics))
197  }  }

Legend:
Removed from v.698  
changed lines
  Added in v.717

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business | Powered By FusionForge