SCM

SCM Repository

[tm] Annotation of /trunk/tm/R/reader.R
ViewVC logotype

Annotation of /trunk/tm/R/reader.R

Parent Directory | Revision Log


Revision 722 - (view) (download)

1 : feinerer 689 # Author: Ingo Feinerer
2 :    
3 :     # Reader
4 :    
# Reader generator for plain text input.
#
# The arguments passed to readPlain() itself are not used. The generated
# reader takes
#   elem     - object providing the text in elem$content and its source
#              reference in elem$uri
#   load     - if true, the text is stored in the document (Cached = TRUE);
#              otherwise only the meta data is set (Cached = FALSE)
#   language - value for the Language slot
#   id       - value for the ID slot
# and returns a "PlainTextDocument".
readPlain <- function(...) {
    function(elem, load, language, id) {
        if (load) {
            return(new("PlainTextDocument",
                       .Data = elem$content,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = "",
                       DateTimeStamp = Sys.time(),
                       Description = "",
                       ID = id,
                       Origin = "",
                       Heading = "",
                       Language = language))
        }

        # Meta data only: the content is left out.
        new("PlainTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = "",
            DateTimeStamp = Sys.time(),
            Description = "",
            ID = id,
            Origin = "",
            Heading = "",
            Language = language)
    }
}
# Mark readPlain as a generator of reader functions.
attr(readPlain, "FunctionGenerator") <- TRUE
20 : feinerer 689
# Reader generator for single Reuters-21578 XML documents.
#
# The generated reader takes
#   elem     - object providing the XML lines in elem$content and the source
#              reference in elem$uri
#   load     - if true, the parsed tree is stored in the document
#              (Cached = TRUE); otherwise only the meta data is set
#   language - value for the Language slot
#   id       - overwritten by the NEWID attribute of the root node
# and returns an "XMLTextDocument" with the topics in LocalMetaData.
readReut21578XML <- function(...) {
    function(elem, load, language, id) {
        # Value of an optional tag, or "" when the tag is absent.
        valueOrEmpty <- function(tag) {
            if (is.null(tag)) "" else xmlValue(tag)
        }

        xmlText <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(xmlText, asText = TRUE)
        root <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        # Neither <AUTHOR> nor <TITLE> is obligatory.
        author <- valueOrEmpty(root[["TEXT"]][["AUTHOR"]])
        datetimestamp <- as.POSIXct(strptime(xmlValue(root[["DATE"]]),
                                             format = "%d-%B-%Y %H:%M:%S"))
        id <- xmlAttrs(root)[["NEWID"]]
        heading <- valueOrEmpty(root[["TEXT"]][["TITLE"]])
        topics <- unlist(xmlApply(root[["TOPICS"]], xmlValue), use.names = FALSE)

        if (load) {
            return(new("XMLTextDocument",
                       .Data = tree,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = author,
                       DateTimeStamp = datetimestamp,
                       Description = "",
                       ID = id,
                       Origin = "Reuters-21578 XML",
                       Heading = heading,
                       Language = language,
                       LocalMetaData = list(Topics = topics)))
        }

        new("XMLTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = author,
            DateTimeStamp = datetimestamp,
            Description = "",
            ID = id,
            Origin = "Reuters-21578 XML",
            Heading = heading,
            Language = language,
            LocalMetaData = list(Topics = topics))
    }
}
# Mark readReut21578XML as a generator of reader functions.
attr(readReut21578XML, "FunctionGenerator") <- TRUE
61 : feinerer 689
# Reader generator for single Reuters Corpus Volume 1 XML documents.
#
# The generated reader takes
#   elem     - object providing the XML lines in elem$content and the source
#              reference in elem$uri
#   load     - if true, the parsed tree is stored in the document
#              (Cached = TRUE); otherwise only the meta data is set
#   language - value for the Language slot
#   id       - overwritten by the itemid attribute of the root node
# and returns an "XMLTextDocument".
readRCV1 <- function(...) {
    function(elem, load, language, id) {
        xmlText <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(xmlText, asText = TRUE)
        root <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        # Date and identifier are attributes of the root node,
        # the heading is its <title> child.
        datetimestamp <- as.POSIXct(xmlAttrs(root)[["date"]])
        id <- xmlAttrs(root)[["itemid"]]
        heading <- xmlValue(root[["title"]])

        if (load) {
            return(new("XMLTextDocument",
                       .Data = tree,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = "",
                       DateTimeStamp = datetimestamp,
                       Description = "",
                       ID = id,
                       Origin = "Reuters Corpus Volume 1 XML",
                       Heading = heading,
                       Language = language))
        }

        new("XMLTextDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = "",
            DateTimeStamp = datetimestamp,
            Description = "",
            ID = id,
            Origin = "Reuters Corpus Volume 1 XML",
            Heading = heading,
            Language = language)
    }
}
# Mark readRCV1 as a generator of reader functions.
attr(readRCV1, "FunctionGenerator") <- TRUE
89 : feinerer 689
# Reader generator for newsgroup postings given as raw message lines
# (header lines, a blank separator line, then the body).
#
# The generated reader takes
#   elem     - object providing the message lines in elem$content and the
#              source reference in elem$uri
#   load     - if true, the message body is stored in the document
#              (Cached = TRUE); otherwise only the meta data is set
#   language - value for the Language slot
#   id       - value for the ID slot
# and returns a "NewsgroupDocument".
readNewsgroup <- function(...) {
    function(elem, load, language, id) {
        mail <- elem$content

        # Meta data is taken from the lines starting with the respective
        # header name.
        author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
        datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)),
                                             format = "%d %B %Y %H:%M:%S"))
        origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
        heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
        newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))

        doc <- if (load) {
            # The header is separated from the body by a blank line.
            # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
            #
            # match() gives the position of the first blank line, or NA if
            # there is none. Dropping the first `separator` lines by negative
            # index yields an empty body when the blank line is the last line;
            # a range of the form (separator + 1):length(mail) would count
            # backwards in that case and return c(NA, <last line>).
            separator <- match("", mail)
            content <- if (is.na(separator)) {
                # No separator at all: there is no body.
                character(0)
            } else {
                mail[-seq_len(separator)]
            }

            new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
                Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin,
                Heading = heading, Language = language, Newsgroup = newsgroup)
        } else {
            new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
                Description = "", ID = id, Origin = origin, Heading = heading, Language = language, Newsgroup = newsgroup)
        }

        return(doc)
    }
}
# Mark readNewsgroup as a generator of reader functions.
attr(readNewsgroup, "FunctionGenerator") <- TRUE
121 : feinerer 689
# Reader generator for single items of a Gmane mailing list archive feed.
#
# The generated reader takes
#   elem     - object providing the XML lines in elem$content and the source
#              reference in elem$uri
#   load     - if true, the text of the <description> tag is stored in the
#              document (Cached = TRUE); otherwise only the meta data is set
#   language - value for the Language slot
#   id       - overwritten by the text of the <link> tag
# and returns a "NewsgroupDocument".
readGmane <- function(...) {
    function(elem, load, language, id) {
        rawXml <- paste(elem$content, "\n", collapse = "")
        # Drop the dc: prefix from the date and creator tags before parsing.
        rawXml <- gsub("dc:creator", "creator", gsub("dc:date", "date", rawXml))
        root <- xmlRoot(xmlTreeParse(rawXml, asText = TRUE))

        author <- xmlValue(root[["creator"]])
        datetimestamp <- as.POSIXct(strptime(xmlValue(root[["date"]]),
                                             format = "%Y-%m-%dT%H:%M:%S"))
        heading <- xmlValue(root[["title"]])
        id <- xmlValue(root[["link"]])
        # The newsgroup is the link with all digits removed.
        newsgroup <- gsub("[0-9]+", "", xmlValue(root[["link"]]))
        origin <- "Gmane Mailing List Archive"

        if (load) {
            content <- xmlValue(root[["description"]])
            return(new("NewsgroupDocument",
                       .Data = content,
                       URI = elem$uri,
                       Cached = TRUE,
                       Author = author,
                       DateTimeStamp = datetimestamp,
                       Description = "",
                       ID = id,
                       Origin = origin,
                       Heading = heading,
                       Language = language,
                       Newsgroup = newsgroup))
        }

        new("NewsgroupDocument",
            URI = elem$uri,
            Cached = FALSE,
            Author = author,
            DateTimeStamp = datetimestamp,
            Description = "",
            ID = id,
            Origin = origin,
            Heading = heading,
            Language = language,
            Newsgroup = newsgroup)
    }
}
# Mark readGmane as a generator of reader functions.
attr(readGmane, "FunctionGenerator") <- TRUE
154 : feinerer 689
155 : feinerer 690 # Converter
156 : feinerer 689
157 :     # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
# Arguments:
#   node - the <newsitem> node; its date and itemid attributes and its
#          <title> and <text> children are read
#   ...  - not used
# Returns a "PlainTextDocument" holding the values of the children of <text>.
convertRCV1Plain <- function(node, ...) {
    attrs <- xmlAttrs(node)
    datetimestamp <- as.POSIXct(attrs[["date"]])
    id <- attrs[["itemid"]]

    # One element per child of <text>.
    paragraphs <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
    heading <- xmlValue(node[["title"]])

    new("PlainTextDocument",
        .Data = paragraphs,
        Cached = TRUE,
        URI = "",
        Author = "",
        DateTimeStamp = datetimestamp,
        Description = "",
        ID = id,
        Origin = "Reuters Corpus Volume 1 XML",
        Heading = heading,
        Language = "en_US")
}
167 :    
168 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
# Arguments:
#   node - the <REUTERS> node; its NEWID attribute and its <DATE>, <TEXT>
#          and <TOPICS> children are read
#   ...  - not used
# Returns a "PlainTextDocument" with the topics in LocalMetaData.
convertReut21578XMLPlain <- function(node, ...) {
    # Value of an optional tag, or "" when the tag is absent.
    valueOrEmpty <- function(tag) {
        if (is.null(tag)) "" else xmlValue(tag)
    }

    # None of <AUTHOR>, <BODY> and <TITLE> is obligatory.
    author <- valueOrEmpty(node[["TEXT"]][["AUTHOR"]])
    datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]),
                                         format = "%d-%B-%Y %H:%M:%S"))
    id <- xmlAttrs(node)[["NEWID"]]
    body <- valueOrEmpty(node[["TEXT"]][["BODY"]])
    heading <- valueOrEmpty(node[["TEXT"]][["TITLE"]])
    topics <- unlist(xmlApply(node[["TOPICS"]], xmlValue), use.names = FALSE)

    new("PlainTextDocument",
        .Data = body,
        Cached = TRUE,
        URI = "",
        Author = author,
        DateTimeStamp = datetimestamp,
        Description = "",
        ID = id,
        Origin = "Reuters-21578 XML",
        Heading = heading,
        Language = "en_US",
        LocalMetaData = list(Topics = topics))
}

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge