SCM

SCM Repository

[tm] View of /trunk/tm/R/reader.R
ViewVC logotype

View of /trunk/tm/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 689 - (download) (annotate)
Fri Dec 8 14:21:46 2006 UTC (12 years, 8 months ago) by feinerer
Original Path: trunk/textmin/R/reader.R
File size: 8368 byte(s)
Implemented changes as proposed at the Forschungsklausur on 01.12.2006.
# Author: Ingo Feinerer

# Reader

plaintext_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        if (!lodsupport || (lodsupport && load)) {
            doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
                       Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
        }
        else {
            doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
                       Author = "", DateTimeStamp = Sys.time(), Description = "", ID = id, Origin = "", Heading = "")
        }

        return(doc)
    }
}
class(plaintext_parser) <- "function_generator"

reut21578xml_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        corpus <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(corpus, asText = TRUE)
        node <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
        if (!is.null(node[["TEXT"]][["AUTHOR"]]))
            author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
        else
            author <- ""

        datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
        description <- ""
        id <- xmlAttrs(node)[["NEWID"]]

        # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
        if (!is.null(node[["TEXT"]][["TITLE"]]))
            heading <- xmlValue(node[["TEXT"]][["TITLE"]])
        else
            heading <- ""

        topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)

        if (!lodsupport || (lodsupport && load)) {
            doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
                       DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
                       Heading = heading, LocalMetaData = list(Topics = topics))
        } else {
            doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
                       DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
                       Heading = heading, LocalMetaData = list(Topics = topics))
        }

        return(doc)
    }
}
class(reut21578xml_parser) <- "function_generator"

rcv1_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        corpus <- paste(elem$content, "\n", collapse = "")
        tree <- xmlTreeParse(corpus, asText = TRUE)
        node <- xmlRoot(tree)

        # Mask as list to bypass S4 checks
        class(tree) <- "list"

        datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
        id <- xmlAttrs(node)[["itemid"]]
        heading <- xmlValue(node[["title"]])

        if (!lodsupport || (lodsupport && load)) {
            doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
                       DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
                       Heading = heading)
        } else {
            doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
                       DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
                       Heading = heading)
        }

        return(doc)
    }
}
class(rcv1_parser) <- "function_generator"

newsgroup_parser <- function(...) {
    function(elem, lodsupport, load, id) {
        mail <- elem$content
        author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
        datetimestamp <- as.POSIXct(strptime(gsub("Date: ", "", grep("^Date:", mail, value = TRUE)), format = "%d %B %Y %H:%M:%S"))
        origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
        heading <- gsub("Subject: ", "", grep("^Subject:", mail, value = TRUE))
        newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))

        if (!lodsupport || (lodsupport && load)) {
            # The header is separated from the body by a blank line.
            # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
            for (index in seq(along = mail)) {
                if (mail[index] == "")
                    break
            }
            content <- mail[(index + 1):length(mail)]

            doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
                       Author = author, DateTimeStamp = datetimestamp,
                       Description = "", ID = id, Origin = origin,
                       Heading = heading, Newsgroup = newsgroup)
        } else {
            doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
                       Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
        }

        return(doc)
    }
}
class(newsgroup_parser) <- "function_generator"

gmane_r_reader <- function(...) {
    function(elem, lodsupport, load, id) {
        corpus <- paste(elem$content, "\n", collapse = "")
        # Remove namespaces
        corpus <- gsub("dc:date", "date", corpus)
        corpus <- gsub("dc:creator", "creator", corpus)
        tree <- xmlTreeParse(corpus, asText = TRUE)
        node <- xmlRoot(tree)

        author <- xmlValue(node[["creator"]])
        datetimestamp <- as.POSIXct(strptime(xmlValue(node[["date"]]), format = "%Y-%m-%dT%H:%M:%S"))
        heading <- xmlValue(node[["title"]])
        id <- xmlValue(node[["link"]])
        newsgroup <- gsub("[0-9]+", "", xmlValue(node[["link"]]))
        origin <- "Gmane R Mailing Lists Archive"

        if (!lodsupport || (lodsupport && load)) {
            content <- xmlValue(node[["description"]])

            doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
                       Author = author, DateTimeStamp = datetimestamp,
                       Description = "", ID = id, Origin = origin,
                       Heading = heading, Newsgroup = newsgroup)
        } else {
            doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
                       Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
        }

        return(doc)
    }
}
class(gmane_r_reader) <- "function_generator"


# Parser

# Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
rcv1_to_plain <- function(node, ...) {
    datetimestamp <- as.POSIXct(xmlAttrs(node)[["date"]])
    id <- xmlAttrs(node)[["itemid"]]
    origin <- "Reuters Corpus Volume 1 XML"
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
    heading <- xmlValue(node[["title"]])

    new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,
        Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)
}

# Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
reut21578xml_to_plain <- function(node, ...) {
    # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["AUTHOR"]]))
        author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
    else
        author <- ""

    datetimestamp <- as.POSIXct(strptime(xmlValue(node[["DATE"]]), format = "%d-%B-%Y %H:%M:%S"))
    description <- ""
    id <- xmlAttrs(node)[["NEWID"]]

    origin <- "Reuters-21578 XML"

    # The <BODY></BODY> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["BODY"]]))
        corpus <- xmlValue(node[["TEXT"]][["BODY"]])
    else
        corpus <- ""

    # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["TITLE"]]))
        heading <- xmlValue(node[["TEXT"]][["TITLE"]])
    else
        heading <- ""

    topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)

    new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
        Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
}

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge