16 |
} |
} |
# Accessor method: returns the contents of the `docs` slot of a "textdoccol"
# object.
setMethod("docs", "textdoccol", function(object) object@docs)

# S4 generic for building a "textdoccol" from `object`; the concrete behaviour
# is supplied by the methods registered for it (dispatch on the class of
# `object`).
setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))
# Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format.
#
# `object` is passed unchanged to XML::xmlTreeParse(). parseNewsItem() is then
# applied, via XML::xmlApply(), to the root of the parsed tree, and the
# resulting list becomes the `docs` slot of a new "textdoccol". The `tdm` slot
# is filled with the default `matrix()` placeholder.
setMethod("textdoccol", "character", function(object) {
  # require() merely returns FALSE when the package is missing, which would
  # surface later as an unrelated "could not find function" error. Check
  # explicitly and stop with a clear message instead.
  if (!requireNamespace("XML", quietly = TRUE)) {
    stop("package 'XML' is required to read RCV1 documents", call. = FALSE)
  }

  tree <- XML::xmlTreeParse(object)
  new("textdoccol",
      docs = XML::xmlApply(XML::xmlRoot(tree), parseNewsItem),
      tdm = matrix())
})

# Build a "textdocument" from a single XML node.
#
# Reads the `date` and `itemid` attributes of `node` (the latter converted
# with as.integer()), the value of its `title` child, and the values of the
# children of its `text` child. XML functions are namespace-qualified so this
# helper does not depend on the caller having attached the XML package.
#
# TODO: Implement lacking fields.
# For this we need the full RCV1 XML set to know where to find those things
parseNewsItem <- function(node) {
  # Look the attributes up once; both `date` and `itemid` come from here.
  attrs <- XML::xmlAttrs(node)

  author <- "Not yet implemented"
  timestamp <- attrs[["date"]]
  description <- "Not yet implemented"
  id <- as.integer(attrs[["itemid"]])
  origin <- "Not yet implemented"

  # TODO: Concatenate list elements (= XML paragraphs) to a single string
  # For now `corpus` stays an unnamed vector with one entry per child of the
  # `text` node.
  corpus <- unlist(XML::xmlApply(node[["text"]], XML::xmlValue),
                   use.names = FALSE)

  heading <- XML::xmlValue(node[["title"]])

  new("textdocument",
      author = author, timestamp = timestamp, description = description,
      id = id, origin = origin, corpus = corpus, heading = heading)
}
|