SCM

SCM Repository

[tm] View of /pkg/R/reader.R
ViewVC logotype

View of /pkg/R/reader.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1061 - (download) (annotate)
Fri Mar 19 11:41:37 2010 UTC (9 years, 5 months ago) by feinerer
File size: 9248 byte(s)
Extract TOPICS, LEWISSPLIT, CGISPLIT, and OLDID meta tags from Reuters-21578 documents
## Author: Ingo Feinerer
## Reader

getReaders <- function()
    c("readDOC", "readGmane", "readPDF", "readReut21578XML", "readReut21578XMLasPlain", "readPlain", "readRCV1", "readRCV1asPlain", "readTabular")

prepareReader <- function(readerControl, defaultReader = NULL, ...) {
    if (is.null(readerControl$reader))
        readerControl$reader <- defaultReader
    if (inherits(readerControl$reader, "FunctionGenerator"))
        readerControl$reader <- readerControl$reader(...)
    if (is.null(readerControl$language))
        readerControl$language <- "eng"
    readerControl
}

readPlain <- FunctionGenerator(function(...) {
    function(elem, language, id) PlainTextDocument(elem$content, id = id, language = language)
})

readXML <- FunctionGenerator(function(spec, doc, ...) {
    spec <- spec
    doc <- doc
    function(elem, language, id) {
        tree <- XML::xmlInternalTreeParse(elem$content, asText = TRUE)
        Content(doc) <- if ("Content" %in% names(spec))
            .xml_content(tree, spec[["Content"]])
        else
            XML::xmlTreeParse(elem$content, asText = TRUE)
        for (n in setdiff(names(spec), "Content"))
            meta(doc, n) <- .xml_content(tree, spec[[n]])
        XML::free(tree)
        attr(doc, "Language") <- language
        doc
    }
})

readGmane <- readXML(spec = list(Author = list("node", "/item/creator"),
                     Content = list("node", "/item/description"),
                     DateTimeStamp = list("function", function(node)
                     strptime(sapply(XML::getNodeSet(node, "/item/date"), XML::xmlValue),
                              format = "%Y-%m-%dT%H:%M:%S",
                              tz = "GMT")),
                     Description = list("unevaluated", ""),
                     Heading = list("node", "/item/title"),
                     ID = list("node", "/item/link"),
                     Origin = list("unevaluated", "Gmane Mailing List Archive")),
                     doc = PlainTextDocument())

readReut21578XML <- readXML(spec = list(Author = list("node", "/REUTERS/TEXT/AUTHOR"),
                            DateTimeStamp = list("function", function(node)
                            strptime(sapply(XML::getNodeSet(node, "/REUTERS/DATE"), XML::xmlValue),
                                     format = "%d-%B-%Y %H:%M:%S",
                                     tz = "GMT")),
                            Description = list("unevaluated", ""),
                            Heading = list("node", "/REUTERS/TEXT/TITLE"),
                            ID = list("attribute", "/REUTERS/@NEWID"),
                            TOPICS = list("attribute", "/REUTERS/@TOPICS"),
                            LEWISSPLIT = list("attribute", "/REUTERS/@LEWISSPLIT"),
                            CGISPLIT = list("attribute", "/REUTERS/@CGISPLIT"),
                            OLDID = list("attribute", "/REUTERS/@OLDID"),
                            Origin = list("unevaluated", "Reuters-21578 XML"),
                            Topics = list("node", "/REUTERS/TOPICS/D"),
                            Places = list("node", "/REUTERS/PLACES/D"),
                            People = list("node", "/REUTERS/PEOPLE/D"),
                            Orgs = list("node", "/REUTERS/ORGS/D"),
                            Exchanges = list("node", "/REUTERS/EXCHANGES/D")),
                            doc = Reuters21578Document())

readReut21578XMLasPlain <- readXML(spec = list(Author = list("node", "/REUTERS/TEXT/AUTHOR"),
                                   Content = list("node", "/REUTERS/TEXT/BODY"),
                                   DateTimeStamp = list("function", function(node)
                                   strptime(sapply(XML::getNodeSet(node, "/REUTERS/DATE"), XML::xmlValue),
                                            format = "%d-%B-%Y %H:%M:%S",
                                            tz = "GMT")),
                                   Description = list("unevaluated", ""),
                                   Heading = list("node", "/REUTERS/TEXT/TITLE"),
                                   ID = list("attribute", "/REUTERS/@NEWID"),
                                   TOPICS = list("attribute", "/REUTERS/@TOPICS"),
                                   LEWISSPLIT = list("attribute", "/REUTERS/@LEWISSPLIT"),
                                   CGISPLIT = list("attribute", "/REUTERS/@CGISPLIT"),
                                   OLDID = list("attribute", "/REUTERS/@OLDID"),
                                   Origin = list("unevaluated", "Reuters-21578 XML"),
                                   Topics = list("node", "/REUTERS/TOPICS/D"),
                                   Places = list("node", "/REUTERS/PLACES/D"),
                                   People = list("node", "/REUTERS/PEOPLE/D"),
                                   Orgs = list("node", "/REUTERS/ORGS/D"),
                                   Exchanges = list("node", "/REUTERS/EXCHANGES/D")),
                                   doc = PlainTextDocument())

readRCV1 <- readXML(spec = list(Author = list("unevaluated", ""),
                    DateTimeStamp = list("function", function(node)
                    as.POSIXlt(as.character(XML::getNodeSet(node, "/newsitem/@date")), tz = "GMT")),
                    Description = list("unevaluated", ""),
                    Heading = list("node", "/newsitem/title"),
                    ID = list("attribute", "/newsitem/@itemid"),
                    Origin = list("unevaluated", "Reuters Corpus Volume 1"),
                    Publisher = list("attribute", "/newsitem/metadata/dc[@element='dc.publisher']/@value"),
                    Topics = list("attribute", "/newsitem/metadata/codes[@class='bip:topics:1.0']/code/@code"),
                    Industries = list("attribute", "/newsitem/metadata/codes[@class='bip:industries:1.0']/code/@code"),
                    Countries = list("attribute", "/newsitem/metadata/codes[@class='bip:countries:1.0']/code/@code")),
                    doc = RCV1Document())

readRCV1asPlain <- readXML(spec = list(Author = list("unevaluated", ""),
                           Content = list("node", "/newsitem/text"),
                           DateTimeStamp = list("function", function(node)
                           as.POSIXlt(as.character(XML::getNodeSet(node, "/newsitem/@date")), tz = "GMT")),
                           Description = list("unevaluated", ""),
                           Heading = list("node", "/newsitem/title"),
                           ID = list("attribute", "/newsitem/@itemid"),
                           Origin = list("unevaluated", "Reuters Corpus Volume 1"),
                           Publisher = list("attribute", "/newsitem/metadata/dc[@element='dc.publisher']/@value"),
                           Topics = list("attribute", "/newsitem/metadata/codes[@class='bip:topics:1.0']/code/@code"),
                           Industries = list("attribute", "/newsitem/metadata/codes[@class='bip:industries:1.0']/code/@code"),
                           Countries = list("attribute", "/newsitem/metadata/codes[@class='bip:countries:1.0']/code/@code")),
                           doc = PlainTextDocument())

# # readDOC needs antiword installed to be able to extract the text
readDOC <- FunctionGenerator(function(AntiwordOptions = "", ...) {
    AntiwordOptions <- AntiwordOptions
    function(elem, language, id) {
        content <- system(paste("antiword", AntiwordOptions, shQuote(eval(elem$uri))), intern = TRUE)
        PlainTextDocument(content, id = id, language = language)
    }
})

# readPDF needs pdftotext and pdfinfo installed to be able to extract the text and meta information
readPDF <- FunctionGenerator(function(PdfinfoOptions = "", PdftotextOptions = "", ...) {
    PdfinfoOptions <- PdfinfoOptions
    PdftotextOptions <- PdftotextOptions
    function(elem, language, id) {
        meta <- system(paste("pdfinfo", PdfinfoOptions, shQuote(eval(elem$uri))), intern = TRUE)
        heading <- gsub("Title:[[:space:]]*", "", grep("Title:", meta, value = TRUE))
        author <- gsub("Author:[[:space:]]*", "", grep("Author:", meta, value = TRUE))
        datetimestamp <- strptime(gsub("CreationDate:[[:space:]]*", "",
                                       grep("CreationDate:", meta, value = TRUE)),
                                  format = "%a %b %d %H:%M:%S %Y",
                                   tz = "GMT")
        description <- gsub("Subject:[[:space:]]*", "", grep("Subject:", meta, value = TRUE))
        origin <- gsub("Creator:[[:space:]]*", "", grep("Creator:", meta, value = TRUE))

        content <- system(paste("pdftotext", PdftotextOptions, shQuote(eval(elem$uri)), "-"), intern = TRUE)
        PlainTextDocument(content, author, datetimestamp, description, heading, id, origin, language)
     }
})

readTabular <- FunctionGenerator(function(mapping, ...) {
    mapping <- mapping
    function(elem, language, id) {
        doc <- PlainTextDocument(id = id, language = language)
        for (n in setdiff(names(mapping), "Content"))
            meta(doc, n) <- elem$content[, mapping[[n]]]
        if ("Content" %in% names(mapping))
            Content(doc) <- elem$content[, mapping[["Content"]]]
        doc
    }
})

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge