SCM

SCM Repository

[tm] Annotation of /trunk/tm/R/reader.R
ViewVC logotype

Annotation of /trunk/tm/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 689 - (view) (download)
Original Path: trunk/textmin/R/reader.R

1 : feinerer 689 # Author: Ingo Feinerer
2 :    
3 :     # Reader
4 :    
5 :     plaintext_parser <- function(...) {
6 :     function(elem, lodsupport, load, id) {
7 :     if (!lodsupport || (lodsupport && load)) {
8 :     doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
9 :     Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
10 :     }
11 :     else {
12 :     doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
13 :     Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
14 :     }
15 :    
16 :     return(doc)
17 :     }
18 :     }
19 :     class(plaintext_parser) <- "function_generator"
20 :    
21 :     reut21578xml_parser <- function(...) {
22 :     function(elem, lodsupport, load, id) {
23 :     corpus <- paste(elem$content, "\n", collapse = "")
24 :     tree <- xmlTreeParse(corpus, asText = TRUE)
25 :     node <- xmlRoot(tree)
26 :    
27 :     # Mask as list to bypass S4 checks
28 :     class(tree) <- "list"
29 :    
30 :     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
31 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
32 :     author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
33 :     else
34 :     author <- ""
35 :    
36 :     datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
37 :     description <- ""
38 :     id <- xmlAttrs(node)[["NEWID"]]
39 :    
40 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
41 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
42 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
43 :     else
44 :     heading <- ""
45 :    
46 :     topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
47 :    
48 :     if (!lodsupport || (lodsupport && load)) {
49 :     doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
50 :     DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
51 :     Heading = heading, LocalMetaData = list(Topics = topics))
52 :     } else {
53 :     doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
54 :     DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
55 :     Heading = heading, LocalMetaData = list(Topics = topics))
56 :     }
57 :    
58 :     return(doc)
59 :     }
60 :     }
61 :     class(reut21578xml_parser) <- "function_generator"
62 :    
63 :     rcv1_parser <- function(...) {
64 :     function(elem, lodsupport, load, id) {
65 :     corpus <- paste(elem$content, "\n", collapse = "")
66 :     tree <- xmlTreeParse(corpus, asText = TRUE)
67 :     node <- xmlRoot(tree)
68 :    
69 :     # Mask as list to bypass S4 checks
70 :     class(tree) <- "list"
71 :    
72 :     datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
73 :     id <- xmlAttrs(node)[["itemid"]]
74 :     heading <- xmlValue(node[["title"]])
75 :    
76 :     if (!lodsupport || (lodsupport && load)) {
77 :     doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
78 :     DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
79 :     Heading = heading)
80 :     } else {
81 :     doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
82 :     DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
83 :     Heading = heading)
84 :     }
85 :    
86 :     return(doc)
87 :     }
88 :     }
89 :     class(rcv1_parser) <- "function_generator"
90 :    
91 :     newsgroup_parser <- function(...) {
92 :     function(elem, lodsupport, load, id) {
93 :     mail <- elem$content
94 :     author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
95 :     datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))
96 :     origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
97 :     heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
98 :     newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
99 :    
100 :     if (!lodsupport || (lodsupport && load)) {
101 :     # The header is separated from the body by a blank line.
102 :     # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
103 :     for (index in seq(along = mail)) {
104 :     if (mail[index] == "")
105 :     break
106 :     }
107 :     content <- mail[(index + 1):length(mail)]
108 :    
109 :     doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
110 :     Author = author, DateTimeStamp = datetimestamp,
111 :     Description = "", ID = id, Origin = origin,
112 :     Heading = heading, Newsgroup = newsgroup)
113 :     } else {
114 :     doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
115 :     Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
116 :     }
117 :    
118 :     return(doc)
119 :     }
120 :     }
121 :     class(newsgroup_parser) <- "function_generator"
122 :    
123 :     gmane_r_reader <- function(...) {
124 :     function(elem, lodsupport, load, id) {
125 :     corpus <- paste(elem$content, "\n", collapse = "")
126 :     # Remove namespaces
127 :     corpus <- gsub("dc:date", "date", corpus)
128 :     corpus <- gsub("dc:creator", "creator", corpus)
129 :     tree <- xmlTreeParse(corpus, asText = TRUE)
130 :     node <- xmlRoot(tree)
131 :    
132 :     author <- xmlValue(node[["creator"]])
133 :     datetimestamp <- as.POSIXct(strptime(xmlValue(node[["date"]]), format = "%Y-%m-%dT%H:%M:%S"))
134 :     heading <- xmlValue(node[["title"]])
135 :     id <- xmlValue(node[["link"]])
136 :     newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))
137 :     origin <- "Gmane R Mailing Lists Archive"
138 :    
139 :     if (!lodsupport || (lodsupport && load)) {
140 :     content <- xmlValue(node[["description"]])
141 :    
142 :     doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
143 :     Author = author, DateTimeStamp = datetimestamp,
144 :     Description = "", ID = id, Origin = origin,
145 :     Heading = heading, Newsgroup = newsgroup)
146 :     } else {
147 :     doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
148 :     Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
149 :     }
150 :    
151 :     return(doc)
152 :     }
153 :     }
154 :     class(gmane_r_reader) <- "function_generator"
155 :    
156 :    
157 :     # Parser
158 :    
159 :     # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
160 :     rcv1_to_plain <- function(node, ...) {
161 :     datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
162 :     id <- xmlAttrs(node)[["itemid"]]
163 :     origin <- "Reuters Corpus Volume 1 XML"
164 :     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
165 :     heading <- xmlValue(node[["title"]])
166 :    
167 :     new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,
168 :     Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)
169 :     }
170 :    
171 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
172 :     reut21578xml_to_plain <- function(node, ...) {
173 :     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
174 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
175 :     author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
176 :     else
177 :     author <- ""
178 :    
179 :     datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
180 :     description <- ""
181 :     id <- xmlAttrs(node)[["NEWID"]]
182 :    
183 :     origin <- "Reuters-21578 XML"
184 :    
185 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
186 :     if (!is.null(node[["TEXT"]][["BODY"]]))
187 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
188 :     else
189 :     corpus <- ""
190 :    
191 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
192 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
193 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
194 :     else
195 :     heading <- ""
196 :    
197 :     topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
198 :    
199 :     new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
200 :     Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
201 :     }

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge