SCM

SCM Repository

[tm] View of /pkg/R/corpus.R
ViewVC logotype

View of /pkg/R/corpus.R

Parent Directory | Revision Log


Revision 18 - (download) (annotate)
Sat Nov 5 19:00:05 2005 UTC (13 years, 3 months ago) by feinerer
Original Path: trunk/R/trunk/R/textdoccol.R
File size: 1500 byte(s)
# Author: Ingo Feinerer

# S4 class "textdoccol": a text document collection.
#   docs - list holding the documents of the collection
#   tdm  - term-document matrix, kept as a plain "matrix" for now
# TODO: Define proper S4 term-document matrix
setClass(
    "textdoccol",
    representation = representation(
        docs = "list",
        tdm = "matrix"
    )
)

# Accessor function
# Generic `docs(object)` plus the "textdoccol" method returning the `docs`
# slot. A generic is only created when none is registered yet. An already
# existing plain function called `docs` is promoted to the generic (and so
# stays reachable as the default method) provided its only formal argument is
# named `object`, i.e. it is compatible with the method defined below;
# otherwise a fresh standard generic is created.
if (!isGeneric("docs")) {
    # The lookup has to go through the name: is.function("docs") would test
    # the character string itself and is therefore always FALSE.
    if (exists("docs", mode = "function") &&
        identical(names(formals(get("docs", mode = "function"))), "object")) {
        fun <- get("docs", mode = "function")
    } else {
        fun <- function(object) standardGeneric("docs")
    }
    setGeneric("docs", fun)
}
setMethod("docs", "textdoccol", function(object) object@docs)

# Generic constructor `textdoccol(docs)`; methods are selected on `docs`.
setGeneric(
    name = "textdoccol",
    def = function(docs) standardGeneric("textdoccol")
)
# Read in XML text documents
# Reuters Corpus Volume 1 (RCV1)
#
# Method textdoccol(docs) for a character argument. `docs` is passed to
# xmlTreeParse() unchanged; the root node of the parsed tree is treated as one
# news item and converted into a "textdocument". The result is a "textdoccol"
# whose `docs` slot is a one-element list holding that document and whose
# `tdm` slot is the placeholder matrix().
setMethod("textdoccol", "character", function(docs) {
    # library() stops right here with a clear error when XML is not
    # installed; require() would merely warn and return FALSE, deferring the
    # failure to the first XML call below.
    library(XML)

    tree <- xmlTreeParse(docs)
    root <- xmlRoot(tree)

    # TODO: At each loop node points to the current newsitem
    node <- root
    attrs <- xmlAttrs(node)

    # TODO: Implement lacking fields.
    # For this we need the full RCV1 XML set to know where to find those things
    author <- "Not yet implemented"
    description <- "Not yet implemented"
    origin <- "Not yet implemented"

    # Fields taken from the attributes of the news item node
    timestamp <- attrs[["date"]]
    id <- as.integer(attrs[["itemid"]])

    # Text content: values of the children of <text>, flattened to a vector
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)

    # Heading: value of the <title> child
    heading <- xmlValue(node[["title"]])

    doc <- new("textdocument",
               author = author,
               timestamp = timestamp,
               description = description,
               id = id,
               origin = origin,
               corpus = corpus,
               heading = heading)

    new("textdoccol", docs = list(doc), tdm = matrix())
})

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge