SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 41 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 : feinerer 21 setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
4 :     setMethod("textdoccol",
5 : feinerer 32 c("character", "character", "logical", "logical"),
6 : feinerer 37 function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {
7 : feinerer 22 # Add a new type for each unique input source format
8 : feinerer 40 type <- match.arg(inputType,c("CSV", "RCV1", "REUT21578", "RIS"))
9 : feinerer 22 switch(type,
10 : feinerer 37 # Text in a special CSV format
11 :     # For details on the file format see the R documentation file
12 :     # The first argument is a directory with .csv files
13 :     "CSV" = {
14 : feinerer 41 filelist <- dir(object, pattern = ".csv", full.names = TRUE)
15 : feinerer 39 tdl <- sapply(filelist,
16 : feinerer 37 function(file) {
17 :     m <- as.matrix(read.csv(file, header = FALSE))
18 :     l <- vector("list", dim(m)[1])
19 :     for (i in 1:dim(m)[1]) {
20 :     author <- ""
21 :     timestamp <- date()
22 :     description <- ""
23 :     id <- as.integer(m[i,1])
24 :     corpus <- as.character(m[i,2:dim(m)[2]])
25 :     if (stripWhiteSpace)
26 :     corpus <- gsub("[[:space:]]+", " ", corpus)
27 :     if (toLower)
28 :     corpus <- tolower(corpus)
29 :     origin <- "CSV"
30 :     heading <- ""
31 :    
32 :     l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
33 :     description = description, id = id, origin = origin, heading = heading)
34 :     }
35 :     l
36 :     })
37 : feinerer 39 if (length(filelist) > 1)
38 :     tdcl <- new("textdoccol", .Data = unlist(tdl, recursive = FALSE))
39 :     else
40 :     tdcl <- new("textdoccol", .Data = tdl)
41 : feinerer 37 },
42 : feinerer 22 # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
43 : feinerer 37 # The first argument is a directory with the RCV1 XML files
44 : feinerer 22 "RCV1" = {
45 : feinerer 40 filelist <- dir(object, pattern = ".xml", full.names = TRUE)
46 : feinerer 39 tdl <- sapply(filelist,
47 : feinerer 37 function(file) {
48 :     tree <- xmlTreeParse(file)
49 :     xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower)
50 :     })
51 : feinerer 39 if (length(filelist) > 1)
52 :     tdcl <- new("textdoccol", .Data = unlist(tdl, recursive = FALSE))
53 :     else
54 :     tdcl <- new("textdoccol", .Data = tdl)
55 : feinerer 22 },
56 : feinerer 23 # Read in text documents in Reuters-21578 XML (not SGML) format
57 :     # Typically the first argument will be a directory where we can
58 :     # find the files reut2-000.xml ... reut2-021.xml
59 :     "REUT21578" = {
60 : feinerer 40 filelist <- dir(object, pattern = ".xml", full.names = TRUE)
61 : feinerer 39 tdl <- sapply(filelist,
62 : feinerer 24 function(file) {
63 :     tree <- xmlTreeParse(file)
64 :     xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
65 :     })
66 : feinerer 39 if (length(filelist) > 1)
67 :     tdcl <- new("textdoccol", .Data = unlist(tdl, recursive = FALSE))
68 :     else
69 :     tdcl <- new("textdoccol", .Data = tdl)
70 : feinerer 40 },
71 :     # Read in HTML documents as used by http://ris.bka.gv.at/vwgh
72 :     "RIS" = {
73 :     filelist <- dir(object, pattern = ".html", full.names = TRUE)
74 :     tdl <- sapply(filelist,
75 :     function(file) {
76 : feinerer 41 # Ignore warnings from misformed HTML documents
77 :     suppressWarnings(RISDoc <- parseHTML(file, stripWhiteSpace, toLower))
78 :     if (!is.null(RISDoc)) {
79 :     l <- list()
80 :     l[[length(l) + 1]] <- RISDoc
81 :     l
82 :     }
83 : feinerer 40 })
84 :     tdcl <- new("textdoccol", .Data = tdl)
85 : feinerer 24 })
86 : feinerer 21 tdcl
87 :     })
88 :    
89 : feinerer 40 # Parse an HTML document
90 :     parseHTML <- function(file, stripWhiteSpace = FALSE, toLower = FALSE) {
91 :     author <- ""
92 :     timestamp <- date()
93 :     description <- ""
94 :    
95 :     tree <- htmlTreeParse(file)
96 :     htmlElem <- unlist(tree$children$html$children)
97 : feinerer 41
98 :     if (is.null(htmlElem))
99 :     stop(paste("Empty document", file, "cannot be processed."))
100 :    
101 : feinerer 40 textElem <- htmlElem[which(regexpr("text.value", names(htmlElem)) > 0)]
102 :     names(textElem) <- NULL
103 :    
104 :     corpus <- paste(textElem, collapse = " ")
105 : feinerer 41
106 :     year <- substring(corpus, regexpr("..../../", corpus), regexpr("..../../", corpus) + 3)
107 :     senat <- substring(corpus, regexpr("..../../", corpus) + 5, regexpr("..../../", corpus) + 6)
108 :     number <- substring(corpus, regexpr("..../../", corpus) + 8, regexpr("..../../", corpus) + 11)
109 :    
110 :     id <- as.integer(paste(year, senat, number, sep = ""))
111 :    
112 :     if (is.na(id))
113 :     stop(paste("Cannot extract 'Geschaeftszahl' out of malformed document", file))
114 : feinerer 40 origin <- ""
115 :    
116 :     if (stripWhiteSpace)
117 :     corpus <- gsub("[[:space:]]+", " ", corpus)
118 :     if (toLower)
119 :     corpus <- tolower(corpus)
120 :    
121 :     heading <- ""
122 :    
123 :     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
124 :     description = description, id = id, origin = origin, heading = heading)
125 :     }
126 :    
127 : feinerer 37 # TODO: Implement lacking fields as soon I have access to the original RCV1
128 : feinerer 23 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
129 : feinerer 21 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
130 : feinerer 17 author <- "Not yet implemented"
131 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
132 : feinerer 17 description <- "Not yet implemented"
133 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
134 : feinerer 36 origin <- "Reuters Corpus Volume 1 XML"
135 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
136 : feinerer 21
137 :     if (stripWhiteSpace)
138 :     corpus <- gsub("[[:space:]]+", " ", corpus)
139 :     if (toLower)
140 :     corpus <- tolower(corpus)
141 :    
142 : feinerer 18 heading <- xmlValue(node[["title"]])
143 : feinerer 17
144 : feinerer 21 new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
145 :     description = description, id = id, origin = origin, heading = heading)
146 : feinerer 19 }
147 : feinerer 23
148 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
149 :     parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
150 : feinerer 36 # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
151 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
152 :     author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
153 :     else
154 :     author <- ""
155 :    
156 : feinerer 23 timestamp <- xmlValue(node[["DATE"]])
157 : feinerer 36 description <- ""
158 : feinerer 23 id <- as.integer(xmlAttrs(node)[["NEWID"]])
159 :    
160 : feinerer 36 origin <- "Reuters-21578 XML"
161 : feinerer 23
162 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
163 :     if (!is.null(node[["TEXT"]][["BODY"]]))
164 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
165 :     else
166 :     corpus <- ""
167 :    
168 :     if (stripWhiteSpace)
169 :     corpus <- gsub("[[:space:]]+", " ", corpus)
170 :     if (toLower)
171 :     corpus <- tolower(corpus)
172 :    
173 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
174 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
175 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
176 :     else
177 :     heading <- ""
178 :    
179 :     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
180 :     description = description, id = id, origin = origin, heading = heading)
181 :     }

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge