SCM

SCM Repository

[tm] View of /pkg/R/corpus.R
ViewVC logotype

View of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 36 - (download) (annotate)
Wed Jan 11 15:42:56 2006 UTC (13 years, 1 month ago) by feinerer
Original Path: trunk/R/trunk/R/textdoccol.R
File size: 5143 byte(s)
See ChangeLog
# Author: Ingo Feinerer

setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
setMethod("textdoccol",
          c("character", "character", "logical", "logical"),
          function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {

              # Add a new type for each unique input source format
              type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
              switch(type,
                     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
                     # For the moment the first argument is still a single file
                     # This will be changed to a directory as soon as we have the full RCV1 data set
                     "RCV1" = {
                         tree <- xmlTreeParse(object)
                         tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
                     },
                     # Text in a special CSV format (as e.g. exported from an Excel sheet)
                     # For details on the file format see data/Umfrage.csv
                     # The first argument has to be a single file
                     "CSV" = {
                         m <- as.matrix(read.csv(object))
                         l <- vector("list", dim(m)[1])
                         for (i in 1:dim(m)[1]) {
                             author <- "Not yet implemented"
                             timestamp <- date()
                             description <- "Not yet implemented"
                             id <- i
                             corpus <- as.character(m[i,2:dim(m)[2]])
                             if (stripWhiteSpace)
                                 corpus <- gsub("[[:space:]]+", " ", corpus)
                             if (toLower)
                                 corpus <- tolower(corpus)
                             origin <- "Not yet implemented"
                             heading <- "Not yet implemented"

                             l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
                                 description = description, id = id, origin = origin, heading = heading)
                         }
                         tdcl <- new("textdoccol", .Data = l)
                     },
                     # Read in text documents in Reuters-21578 XML (not SGML) format
                     # Typically the first argument will be a directory where we can
                     # find the files reut2-000.xml ... reut2-021.xml
                     "REUT21578" = {
                         tdl <- sapply(dir(object,
                                           pattern = ".xml",
                                           full.names = TRUE),
                                       function(file) {
                                           tree <- xmlTreeParse(file)
                                           xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
                                       })

                         tdcl <- new("textdoccol", .Data = tdl)
                     })
              tdcl
          })

# Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    author <- "Not yet implemented"
    timestamp <- xmlAttrs(node)[["date"]]
    description <- "Not yet implemented"
    id <- as.integer(xmlAttrs(node)[["itemid"]])
    origin <- "Reuters Corpus Volume 1 XML"
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)

    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    heading <- xmlValue(node[["title"]])

    new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
        description = description, id = id, origin = origin, heading = heading)
}

# Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["AUTHOR"]]))
        author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
    else
        author <- ""

    timestamp <- xmlValue(node[["DATE"]])
    description <- ""
    id <- as.integer(xmlAttrs(node)[["NEWID"]])

    origin <- "Reuters-21578 XML"

    # The <BODY></BODY> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["BODY"]]))
        corpus <- xmlValue(node[["TEXT"]][["BODY"]])
    else
        corpus <- ""

    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["TITLE"]]))
        heading <- xmlValue(node[["TEXT"]][["TITLE"]])
    else
        heading <- ""

    new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
        description = description, id = id, origin = origin, heading = heading)
}

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge