SCM

SCM Repository

[tm] Annotation of /pkg/R/reader.R
ViewVC logotype

Annotation of /pkg/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 886 - (view) (download)

1 : feinerer 689 # Author: Ingo Feinerer
2 :     # Reader
3 :    
4 : feinerer 875 getReaders <- function()
5 : feinerer 806 c("readDOC", "readGmane", "readHTML", "readNewsgroup", "readPDF", "readReut21578XML", "readPlain", "readRCV1")
6 : feinerer 805
7 : feinerer 777 readPlain <- FunctionGenerator(function(...) {
8 : feinerer 717 function(elem, load, language, id) {
9 : feinerer 694 doc <- if (load) {
10 :     new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
11 : feinerer 717 Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "", Language = language)
12 : feinerer 689 }
13 :     else {
14 : feinerer 694 new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
15 : feinerer 717 Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "", Language = language)
16 : feinerer 689 }
17 :    
18 :     return(doc)
19 :     }
20 : feinerer 777 })
21 : feinerer 689
22 : feinerer 777 readReut21578XML <- FunctionGenerator(function(...) {
23 : feinerer 717 function(elem, load, language, id) {
24 : feinerer 886 require("XML")
25 :    
26 : feinerer 689 corpus <- paste(elem$content, "\n", collapse = "")
27 : feinerer 886 tree <- XML::xmlTreeParse(corpus, asText = TRUE)
28 :     node <- XML::xmlRoot(tree)
29 : feinerer 689
30 :     # Mask as list to bypass S4 checks
31 :     class(tree) <- "list"
32 :    
33 :     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
34 : feinerer 694 author <- if (!is.null(node[["TEXT"]][["AUTHOR"]]))
35 : feinerer 886 XML::xmlValue(node[["TEXT"]][["AUTHOR"]])
36 : feinerer 689 else
37 : feinerer 694 ""
38 : feinerer 689
39 : feinerer 886 datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
40 :     id <- XML::xmlAttrs(node)[["NEWID"]]
41 : feinerer 689
42 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
43 : feinerer 694 heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
44 : feinerer 886 XML::xmlValue(node[["TEXT"]][["TITLE"]])
45 : feinerer 689 else
46 : feinerer 694 ""
47 : feinerer 689
48 : feinerer 886 topics <- unlist(XML::xmlApply(node[["TOPICS"]], function(x) XML::xmlValue(x)), use.names = FALSE)
49 : feinerer 689
50 : feinerer 694 doc <- if (load) {
51 : feinerer 757 new("Reuters21578Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
52 : feinerer 694 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
53 : feinerer 717 Heading = heading, Language = language, LocalMetaData = list(Topics = topics))
54 : feinerer 689 } else {
55 : feinerer 757 new("Reuters21578Document", URI = elem$uri, Cached = FALSE, Author = author,
56 : feinerer 694 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
57 : feinerer 717 Heading = heading, Language = language, LocalMetaData = list(Topics = topics))
58 : feinerer 689 }
59 :    
60 :     return(doc)
61 :     }
62 : feinerer 777 })
63 : feinerer 689
64 : feinerer 777 readRCV1 <- FunctionGenerator(function(...) {
65 : feinerer 717 function(elem, load, language, id) {
66 : feinerer 886 require("XML")
67 :    
68 : feinerer 689 corpus <- paste(elem$content, "\n", collapse = "")
69 : feinerer 886 tree <- XML::xmlTreeParse(corpus, asText = TRUE)
70 :     node <- XML::xmlRoot(tree)
71 : feinerer 689
72 :     # Mask as list to bypass S4 checks
73 :     class(tree) <- "list"
74 :    
75 : feinerer 886 datetimestamp <- as.POSIXct(XML::xmlAttrs(node)[["date"]])
76 :     id <- XML::xmlAttrs(node)[["itemid"]]
77 :     heading <- XML::xmlValue(node[["title"]])
78 : feinerer 689
79 : feinerer 694 doc <- if (load) {
80 : feinerer 757 new("RCV1Document", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
81 : feinerer 694 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
82 : feinerer 717 Heading = heading, Language = language)
83 : feinerer 689 } else {
84 : feinerer 757 new("RCV1Document", URI = elem$uri, Cached = FALSE, Author = "",
85 : feinerer 694 DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
86 : feinerer 717 Heading = heading, Language = language)
87 : feinerer 689 }
88 :    
89 : feinerer 856 # Extract meta data
90 :     if (!is.null(node[["metadata"]])) {
91 :     # Get simple Dublin Core meta data
92 :     dc <- node[["metadata"]][names(node[["metadata"]]) == "dc"]
93 : feinerer 886 dc <- lapply(dc, XML::xmlAttrs)
94 : feinerer 856 elements <- sapply(dc, "[[", "element")
95 :     values <- sapply(dc, "[[", "value")
96 :     if ("dc.publisher" %in% elements)
97 :     DublinCore(doc, "Publisher") <- values[elements == "dc.publisher"]
98 :    
99 :     # Get topic codes
100 :     codes <- node[["metadata"]][names(node[["metadata"]]) == "codes"]
101 : feinerer 886 topics <- codes[sapply(codes, XML::xmlAttrs) == "bip:topics:1.0"]
102 : feinerer 856 if (length(topics) > 0)
103 : feinerer 886 meta(doc, "Topics") <- unlist(XML::xmlApply(topics[[1]], XML::xmlAttrs), use.names = FALSE)
104 : feinerer 856 }
105 :    
106 : feinerer 689 return(doc)
107 :     }
108 : feinerer 777 })
109 : feinerer 689
110 : feinerer 882 readNewsgroup <- FunctionGenerator(function(DateFormat = "%d %B %Y %H:%M:%S", ...) {
111 :     format <- DateFormat
112 : feinerer 717 function(elem, load, language, id) {
113 : feinerer 689 mail <- elem$content
114 :     author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
115 : feinerer 882 datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = format))
116 : feinerer 689 origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
117 :     heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
118 :     newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
119 :    
120 : feinerer 694 doc <- if (load) {
121 : feinerer 689 # The header is separated from the body by a blank line.
122 :     # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
123 : feinerer 744 for (index in seq_along(mail)) {
124 : feinerer 689 if (mail[index] == "")
125 :     break
126 :     }
127 :     content <- mail[(index + 1):length(mail)]
128 :    
129 : feinerer 694 new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
130 :     Author = author, DateTimeStamp = datetimestamp,
131 :     Description = "", ID = id, Origin = origin,
132 : feinerer 717 Heading = heading, Language = language, Newsgroup = newsgroup)
133 : feinerer 689 } else {
134 : feinerer 694 new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
135 : feinerer 717 Description = "", ID = id, Origin = origin, Heading = heading, Language = language, Newsgroup = newsgroup)
136 : feinerer 689 }
137 :    
138 :     return(doc)
139 :     }
140 : feinerer 777 })
141 : feinerer 689
142 : feinerer 777 readGmane <- FunctionGenerator(function(...) {
143 : feinerer 717 function(elem, load, language, id) {
144 : feinerer 886 require("XML")
145 :    
146 : feinerer 689 corpus <- paste(elem$content, "\n", collapse = "")
147 :     # Remove namespaces
148 :     corpus <- gsub("dc:date", "date", corpus)
149 :     corpus <- gsub("dc:creator", "creator", corpus)
150 : feinerer 886 tree <- XML::xmlTreeParse(corpus, asText = TRUE)
151 :     node <- XML::xmlRoot(tree)
152 : feinerer 689
153 : feinerer 886 author <- XML::xmlValue(node[["creator"]])
154 :     datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["date"]]), format = "%Y-%m-%dT%H:%M:%S"))
155 :     heading <- XML::xmlValue(node[["title"]])
156 :     id <- XML::xmlValue(node[["link"]])
157 :     newsgroup <- gsub("[0-9]+", "", XML::xmlValue(node[["link"]]))
158 : feinerer 694 origin <- "Gmane Mailing List Archive"
159 : feinerer 689
160 : feinerer 694 doc <- if (load) {
161 : feinerer 886 content <- XML::xmlValue(node[["description"]])
162 : feinerer 689
163 : feinerer 694 new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
164 :     Author = author, DateTimeStamp = datetimestamp,
165 :     Description = "", ID = id, Origin = origin,
166 : feinerer 717 Heading = heading, Language = language, Newsgroup = newsgroup)
167 : feinerer 689 } else {
168 : feinerer 694 new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
169 : feinerer 717 Description = "", ID = id, Origin = origin, Heading = heading, Language = language, Newsgroup = newsgroup)
170 : feinerer 689 }
171 :    
172 :     return(doc)
173 :     }
174 : feinerer 777 })
175 : feinerer 689
176 : feinerer 789 # readDOC needs antiword installed to be able to extract the text
177 :     readDOC <- FunctionGenerator(function(...) {
178 :     function(elem, load, language, id) {
179 :     if (!load)
180 :     warning("load on demand not supported for DOC documents")
181 :    
182 : feinerer 875 corpus <- paste(system(paste("antiword", shQuote(summary(eval(elem$uri))$description)), intern = TRUE), sep = "\n", collapse = "")
183 : feinerer 789
184 :     new("PlainTextDocument", .Data = corpus, URI = elem$uri, Cached = TRUE,
185 :     Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id,
186 :     Origin = "", Heading = "", Language = language)
187 :     }
188 :     })
189 :    
190 : feinerer 776 # readPDF needs pdftotext and pdfinfo installed to be able to extract the text and meta information
191 : feinerer 777 readPDF <- FunctionGenerator(function(...) {
192 : feinerer 766 function(elem, load, language, id) {
193 : feinerer 875 meta <- system(paste("pdfinfo", shQuote(summary(eval(elem$uri))$description)), intern = TRUE)
194 : feinerer 766 heading <- gsub("Title:[[:space:]]*", "", grep("Title:", meta, value = TRUE))
195 :     author <- gsub("Author:[[:space:]]*", "", grep("Author:", meta, value = TRUE))
196 :     datetimestamp <- as.POSIXct(strptime(gsub("CreationDate:[[:space:]]*", "",
197 :     grep("CreationDate:", meta, value = TRUE)),
198 :     format = "%a %b %d %H:%M:%S %Y"))
199 :     description <- gsub("Subject:[[:space:]]*", "", grep("Subject:", meta, value = TRUE))
200 :     origin <- gsub("Creator:[[:space:]]*", "", grep("Creator:", meta, value = TRUE))
201 :    
202 : feinerer 767 if (!load)
203 :     warning("load on demand not supported for PDF documents")
204 : feinerer 766
205 : feinerer 875 corpus <- paste(system(paste("pdftotext", shQuote(summary(eval(elem$uri))$description), "-"), intern = TRUE), sep = "\n", collapse = "")
206 : feinerer 767 new("PlainTextDocument", .Data = corpus, URI = elem$uri, Cached = TRUE,
207 :     Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,
208 :     Origin = origin, Heading = heading, Language = language)
209 : feinerer 766 }
210 : feinerer 777 })
211 : feinerer 766
212 : feinerer 777 readHTML <- FunctionGenerator(function(...) {
213 : feinerer 767 function(elem, load, language, id) {
214 : feinerer 886 require("XML")
215 : feinerer 767
216 : feinerer 886 tree <- XML::xmlTreeParse(elem$content, asText = TRUE)
217 :     root <- XML::xmlRoot(tree)
218 :    
219 : feinerer 767 head <- root[["head"]]
220 : feinerer 886 heading <- XML::xmlValue(head[["title"]])
221 : feinerer 767
222 : feinerer 886 meta <- lapply(XML::xmlChildren(head)[names(XML::xmlChildren(head)) == "meta"], XML::xmlAttrs)
223 : feinerer 767 metaNames <- sapply(meta, "[[", "name")
224 :     metaContents <- sapply(meta, "[[", "content")
225 :    
226 :     # See http://dublincore.org/documents/dcmi-terms/ and http://dublincore.org/documents/dcq-html/
227 :     author <- paste(metaContents[metaNames == "DC.creator"])
228 :     description <- paste(metaContents[metaNames == "DC.description"])
229 :     datetimestamp <- as.POSIXct(paste(metaContents[metaNames == "DC.date"]))
230 :     origin <- paste(metaContents[metaNames == "DC.publisher"])
231 :     language <- paste(metaContents[metaNames == "DC.language"])
232 :    
233 :     if (!load)
234 :     warning("load on demand not supported for StructuredTextDocuments using HTML")
235 :    
236 :     content <- list("Prologue" = NULL)
237 :     i <- 1
238 : feinerer 886 for (child in XML::xmlChildren(root[["body"]])) {
239 :     if (tolower(XML::xmlName(child)) == "h1") {
240 :     content <- c(content, structure(list(NULL), names = XML::xmlValue(child)))
241 : feinerer 767 i <- i + 1
242 :     }
243 :     else {
244 :     # We remove remaining HTML tags
245 : feinerer 886 content[[i]] <- c(content[[i]], toString(XML::xmlApply(child, XML::xmlValue)))
246 : feinerer 767 }
247 :     }
248 :    
249 :     new("StructuredTextDocument", .Data = content, URI = elem$uri, Cached = TRUE,
250 :     Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,
251 :     Origin = origin, Heading = heading, Language = language)
252 :     }
253 : feinerer 777 })
254 : feinerer 767
255 : feinerer 690 # Converter
256 : feinerer 689
257 : feinerer 698 convertRCV1Plain <- function(node, ...) {
258 : feinerer 886 require("XML")
259 :    
260 : feinerer 856 content <- Content(node)
261 :     # As XMLDocument is no native S4 class, restore valid information
262 :     class(content) <- "XMLDocument"
263 :     names(content) <- c("doc", "dtd")
264 : feinerer 886 content <- unlist(XML::xmlApply(XML::xmlRoot(content)[["text"]], XML::xmlValue), use.names = FALSE)
265 : feinerer 689
266 : feinerer 875 new("PlainTextDocument", .Data = content, Cached = TRUE, URI = NULL,
267 : feinerer 856 Author = Author(node), DateTimeStamp = DateTimeStamp(node),
268 :     Description = Description(node), ID = ID(node), Origin =
269 :     Origin(node), Heading = Heading(node), Language = Language(node),
270 :     LocalMetaData = LocalMetaData(node))
271 : feinerer 689 }
272 :    
273 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
274 : feinerer 698 convertReut21578XMLPlain <- function(node, ...) {
275 : feinerer 886 require("XML")
276 :    
277 : feinerer 689 # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
278 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
279 : feinerer 886 author <- XML::xmlValue(node[["TEXT"]][["AUTHOR"]])
280 : feinerer 689 else
281 :     author <- ""
282 :    
283 : feinerer 886 datetimestamp <- as.POSIXct(strptime(XML::xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
284 : feinerer 689 description <- ""
285 : feinerer 886 id <- XML::xmlAttrs(node)[["NEWID"]]
286 : feinerer 689
287 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
288 : feinerer 694 corpus <- if (!is.null(node[["TEXT"]][["BODY"]]))
289 : feinerer 886 XML::xmlValue(node[["TEXT"]][["BODY"]])
290 : feinerer 689 else
291 : feinerer 694 ""
292 : feinerer 689
293 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
294 : feinerer 694 heading <- if (!is.null(node[["TEXT"]][["TITLE"]]))
295 : feinerer 886 XML::xmlValue(node[["TEXT"]][["TITLE"]])
296 : feinerer 689 else
297 : feinerer 694 ""
298 : feinerer 689
299 : feinerer 886 topics <- unlist(XML::xmlApply(node[["TOPICS"]], function(x) XML::xmlValue(x)), use.names = FALSE)
300 : feinerer 689
301 : feinerer 875 new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = NULL, Author = author, DateTimeStamp = datetimestamp,
302 : feinerer 717 Description = description, ID = id, Origin = "Reuters-21578 XML", Heading = heading, Language = "en_US",
303 :     LocalMetaData = list(Topics = topics))
304 : feinerer 689 }

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge