SCM

SCM Repository

[tm] Annotation of /trunk/tm/R/reader.R
ViewVC logotype

Annotation of /trunk/tm/R/reader.R

Parent Directory | Revision Log


Revision 694 - (view) (download)

1 : feinerer 689 # Author: Ingo Feinerer
2 :    
3 :     # Reader
4 :    
# Reader generator for plain text sources.
#
# Arguments:
#   ...  accepted but not used by this generator.
# Value: a function(elem, load, id) that builds a "PlainTextDocument".
#   With load = TRUE the document carries elem$content as its data and is
#   flagged Cached = TRUE; otherwise no content is attached and the flag is
#   Cached = FALSE. In both cases URI is elem$uri, ID is id, DateTimeStamp
#   is the current time and the remaining metadata are empty strings.
read_plain <- function(...) {
    function(elem, load, id) {
        if (load) {
            return(new("PlainTextDocument",
                       .Data = elem$content,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = "",
                       DateTimeStamp = Sys.time(),
                       Description = "",
                       ID = id,
                       Origin = "",
                       Heading = ""))
        }

        # Not loading: same metadata, but without any content attached.
        new("PlainTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = "",
            DateTimeStamp = Sys.time(),
            Description = "",
            ID = id,
            Origin = "",
            Heading = "")
    }
}
# Tag the generator itself with the class "function_generator".
class(read_plain) <- "function_generator"
20 : feinerer 689
# Reader generator for documents in Reuters-21578 XML format.
#
# Arguments:
#   ...  accepted but not used by this generator.
# Value: a function(elem, load, id) that joins elem$content into a single
#   string, parses it as XML and builds an "XMLTextDocument". Metadata are
#   taken from the root node: AUTHOR and TITLE (children of TEXT, "" when
#   absent), DATE, the NEWID attribute (used as ID; the id argument is not
#   used) and the values of the TOPICS children (LocalMetaData$Topics).
#   With load = TRUE the parsed tree is attached as data (Cached = TRUE);
#   otherwise only the metadata are kept (Cached = FALSE).
read_reut21578xml <- function(...) {
    function(elem, load, id) {
        xml_text <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(xml_text, asText = TRUE)
        root <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        # Value of a child of <TEXT>, or "" when that tag is missing.
        # Neither <AUTHOR> nor <TITLE> is obligatory.
        text_child <- function(tag) {
            child <- root[["TEXT"]][[tag]]
            if (is.null(child)) "" else xmlValue(child)
        }

        author <- text_child("AUTHOR")
        stamp <- as.POSIXct(strptime(xmlValue(root[["DATE"]]),
                                     format = "%d-%B-%Y %H:%M:%S"))
        newid <- xmlAttrs(root)[["NEWID"]]
        heading <- text_child("TITLE")
        topics <- unlist(xmlApply(root[["TOPICS"]], xmlValue),
                         use.names = FALSE)

        if (load) {
            return(new("XMLTextDocument",
                       .Data = tree,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = author,
                       DateTimeStamp = stamp,
                       Description = "",
                       ID = newid,
                       Origin = "Reuters-21578 XML",
                       Heading = heading,
                       LocalMetaData = list(Topics = topics)))
        }

        new("XMLTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = author,
            DateTimeStamp = stamp,
            Description = "",
            ID = newid,
            Origin = "Reuters-21578 XML",
            Heading = heading,
            LocalMetaData = list(Topics = topics))
    }
}
# Tag the generator itself with the class "function_generator".
class(read_reut21578xml) <- "function_generator"
62 : feinerer 689
# Reader generator for documents in Reuters Corpus Volume 1 XML format.
#
# Arguments:
#   ...  accepted but not used by this generator.
# Value: a function(elem, load, id) that joins elem$content into a single
#   string, parses it as XML and builds an "XMLTextDocument". The time stamp
#   comes from the root's "date" attribute, the ID from its "itemid"
#   attribute (the id argument is not used) and the heading from the
#   <title> child. With load = TRUE the parsed tree is attached as data
#   (Cached = TRUE); otherwise only the metadata are kept (Cached = FALSE).
read_rcv1 <- function(...) {
    function(elem, load, id) {
        xml_text <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(xml_text, asText = TRUE)
        root <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        stamp <- as.POSIXct(xmlAttrs(root)[["date"]])
        itemid <- xmlAttrs(root)[["itemid"]]
        heading <- xmlValue(root[["title"]])

        if (load) {
            return(new("XMLTextDocument",
                       .Data = tree,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = "",
                       DateTimeStamp = stamp,
                       Description = "",
                       ID = itemid,
                       Origin = "Reuters Corpus Volume 1 XML",
                       Heading = heading))
        }

        new("XMLTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = "",
            DateTimeStamp = stamp,
            Description = "",
            ID = itemid,
            Origin = "Reuters Corpus Volume 1 XML",
            Heading = heading)
    }
}
# Tag the generator itself with the class "function_generator".
class(read_rcv1) <- "function_generator"
90 : feinerer 689
# Reader generator for newsgroup postings given as a vector of lines
# (header lines, a blank separator line, then the body).
#
# Arguments:
#   ...  accepted but not used by this generator.
# Value: a function(elem, load, id) that builds a "NewsgroupDocument".
#   Author, DateTimeStamp, Origin, Heading and Newsgroup are read from the
#   "From:", "Date:", "Path:", "Subject:" and "Newsgroups:" header lines of
#   elem$content; the date is parsed with format "%d %B %Y %H:%M:%S".
#   A field that does not occur yields a zero-length value.
#   With load = TRUE the lines after the first blank line are attached as
#   data (Cached = TRUE); otherwise no content is attached (Cached = FALSE).
#   A message without a blank line, or whose blank line is the last line,
#   has an empty body.
read_newsgroup <- function(...) {
    function(elem, load, id) {
        mail <- elem$content

        # The header is separated from the body by a blank line.
        # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
        # Without a blank line the whole message is treated as header.
        blank <- which(mail == "")
        header_end <- if (length(blank) > 0L) blank[1L] - 1L else length(mail)
        header <- mail[seq_len(header_end)]

        # Values of the header lines for one field, with the leading
        # "<name>:" and any following whitespace removed. Only the header is
        # searched so that body lines (e.g. a quoted "From: ...") cannot
        # leak into the metadata, and only the prefix is stripped so that
        # the same text later in the line is left intact.
        header_field <- function(name) {
            prefix <- paste0("^", name, ":")
            sub(paste0(prefix, "[[:space:]]*"), "",
                grep(prefix, header, value = TRUE))
        }

        author <- header_field("From")
        datetimestamp <- as.POSIXct(strptime(header_field("Date"),
                                             format = "%d %B %Y %H:%M:%S"))
        origin <- header_field("Path")
        heading <- header_field("Subject")
        newsgroup <- header_field("Newsgroups")

        if (load) {
            # Everything after the separator line. A logical index is used
            # because (i + 1):length(mail) would count downwards, and so
            # return NA plus the last line, when nothing follows the header.
            content <- mail[seq_along(mail) > header_end + 1L]

            new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
                Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin,
                Heading = heading, Newsgroup = newsgroup)
        } else {
            new("NewsgroupDocument", URI = elem$uri, Cached = FALSE,
                Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin,
                Heading = heading, Newsgroup = newsgroup)
        }
    }
}
# Tag the generator itself with the class "function_generator".
class(read_newsgroup) <- "function_generator"
122 : feinerer 689
# Reader generator for Gmane mailing list archive items given as XML.
#
# Arguments:
#   ...  accepted but not used by this generator.
# Value: a function(elem, load, id) that joins elem$content into a single
#   string, rewrites "dc:date" and "dc:creator" to "date" and "creator",
#   parses the result as XML and builds a "NewsgroupDocument". Author, time
#   stamp and heading come from the <creator>, <date> and <title> children
#   of the root; the <link> value is used as ID (the id argument is not
#   used) and, with all digit runs removed, as Newsgroup.
#   With load = TRUE the <description> value is attached as data
#   (Cached = TRUE); otherwise no content is attached (Cached = FALSE).
read_gmane <- function(...) {
    function(elem, load, id) {
        xml_text <- paste(elem$content, "\n", collapse = "")
        # Remove namespaces
        xml_text <- gsub("dc:creator", "creator",
                         gsub("dc:date", "date", xml_text))
        root <- xmlRoot(xmlTreeParse(xml_text, asText = TRUE))

        author <- xmlValue(root[["creator"]])
        stamp <- as.POSIXct(strptime(xmlValue(root[["date"]]),
                                     format = "%Y-%m-%dT%H:%M:%S"))
        heading <- xmlValue(root[["title"]])
        link <- xmlValue(root[["link"]])
        newsgroup <- gsub("[0-9]+", "", link)

        if (load) {
            return(new("NewsgroupDocument",
                       .Data = xmlValue(root[["description"]]),
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = author,
                       DateTimeStamp = stamp,
                       Description = "",
                       ID = link,
                       Origin = "Gmane Mailing List Archive",
                       Heading = heading,
                       Newsgroup = newsgroup))
        }

        new("NewsgroupDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = author,
            DateTimeStamp = stamp,
            Description = "",
            ID = link,
            Origin = "Gmane Mailing List Archive",
            Heading = heading,
            Newsgroup = newsgroup)
    }
}
# Tag the generator itself with the class "function_generator".
class(read_gmane) <- "function_generator"
155 : feinerer 689
156 : feinerer 690 # Converter
157 : feinerer 689
# Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
# into a "PlainTextDocument".
#
# Arguments:
#   node  XML node; its "date" and "itemid" attributes and its <text> and
#         <title> children are read.
#   ...   accepted but not used.
# Value: a "PlainTextDocument" with Cached = TRUE whose data are the values
#   of the children of <text>, with empty URI, Author and Description.
convert_rcv1_plain <- function(node, ...) {
    attrs <- xmlAttrs(node)
    stamp <- as.POSIXct(attrs[["date"]])
    itemid <- attrs[["itemid"]]
    paragraphs <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
    title <- xmlValue(node[["title"]])

    new("PlainTextDocument",
        .Data = paragraphs,
        Cached = TRUE,
        URI = "",
        Author = "",
        DateTimeStamp = stamp,
        Description = "",
        ID = itemid,
        Origin = "Reuters Corpus Volume 1 XML",
        Heading = title)
}
169 :    
# Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML
# file into a "PlainTextDocument".
#
# Arguments:
#   node  XML node; its NEWID attribute and its TEXT, DATE and TOPICS
#         children are read.
#   ...   accepted but not used.
# Value: a "PlainTextDocument" with Cached = TRUE whose data are the value
#   of TEXT/BODY. Author and Heading come from TEXT/AUTHOR and TEXT/TITLE;
#   each of these three is "" when its tag is absent. The values of the
#   TOPICS children are stored in LocalMetaData$Topics.
convert_reut21578xml_plain <- function(node, ...) {
    # Value of a child of <TEXT>, or "" when that tag is missing.
    # None of <AUTHOR>, <BODY> and <TITLE> is obligatory.
    text_child <- function(tag) {
        child <- node[["TEXT"]][[tag]]
        if (is.null(child)) "" else xmlValue(child)
    }

    author <- text_child("AUTHOR")
    stamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]),
                                 format = "%d-%B-%Y %H:%M:%S"))
    newid <- xmlAttrs(node)[["NEWID"]]
    body <- text_child("BODY")
    title <- text_child("TITLE")
    topics <- unlist(xmlApply(node[["TOPICS"]], xmlValue), use.names = FALSE)

    new("PlainTextDocument",
        .Data = body,
        Cached = TRUE,
        URI = "",
        Author = author,
        DateTimeStamp = stamp,
        Description = "",
        ID = newid,
        Origin = "Reuters-21578 XML",
        Heading = title,
        LocalMetaData = list(Topics = topics))
}

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge