SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 19, Sun Nov 6 15:38:48 2005 UTC revision 32, Thu Dec 15 13:13:54 2005 UTC
# Line 2  Line 2 
2    
3  # S4 class definition  # S4 class definition
4  # Text document collection  # Text document collection
5  # TODO: Define proper S4 term-document matrix  setClass("textdoccol",
6  setClass("textdoccol", representation(docs = "list",           contains = c("list"))
                                       tdm = "matrix"))  
   
 # Accessor function  
 if (!isGeneric("docs")) {  
     if (is.function("docs"))  
         fun <- docs  
     else  
         fun <- function(object) standardGeneric("docs")  
     setGeneric("docs", fun)  
 }  
 setMethod("docs", "textdoccol", function(object) object@docs)  
7    
8  setGeneric("textdoccol", function(object) standardGeneric("textdoccol"))  # Constructors
 # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  
 setMethod("textdoccol", "character", function(object) {  
     require(XML)  
9    
10    setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
11    setMethod("textdoccol",
12              c("character", "character", "logical", "logical"),
13              function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {
14    
15                  # Add a new type for each unique input source format
16                  type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
17                  switch(type,
18                         # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
19                         # For the moment the first argument is still a single file
20                         # This will be changed to a directory as soon as we have the full RCV1 data set
21                         "RCV1" = {
22      tree <- xmlTreeParse(object)      tree <- xmlTreeParse(object)
23      new("textdoccol", docs = xmlApply(xmlRoot(tree), parseNewsItem), tdm = matrix())                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
24                         },
25                         # Text in a special CSV format (as e.g. exported from an Excel sheet)
26                         # For details on the file format see data/Umfrage.csv
27                         # The first argument has to be a single file
28                         "CSV" = {
29                             m <- as.matrix(read.csv(object))
30                             l <- vector("list", dim(m)[1])
31                             for (i in 1:dim(m)[1]) {
32                                 author <- "Not yet implemented"
33                                 timestamp <- date()
34                                 description <- "Not yet implemented"
35                                 id <- i
36                                 corpus <- as.character(m[i,2:dim(m)[2]])
37                                 if (stripWhiteSpace)
38                                     corpus <- gsub("[[:space:]]+", " ", corpus)
39                                 if (toLower)
40                                     corpus <- tolower(corpus)
41                                 origin <- "Not yet implemented"
42                                 heading <- "Not yet implemented"
43    
44                                 l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
45                                     description = description, id = id, origin = origin, heading = heading)
46                             }
47                             tdcl <- new("textdoccol", .Data = l)
48                         },
49                         # Read in text documents in Reuters-21578 XML (not SGML) format
50                         # Typically the first argument will be a directory where we can
51                         # find the files reut2-000.xml ... reut2-021.xml
52                         "REUT21578" = {
53                             tdl <- sapply(dir(object,
54                                               pattern = ".xml",
55                                               full.names = TRUE),
56                                           function(file) {
57                                               tree <- xmlTreeParse(file)
58                                               xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
59                                           })
60    
61                             tdcl <- new("textdoccol", .Data = tdl)
62                         })
63                  tdcl
64  })  })
65    
66  # TODO: Implement lacking fields.  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
67  # For this we need the full RCV1 XML set to know where to find those things  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
 parseNewsItem <- function(node) {  
68      author <- "Not yet implemented"      author <- "Not yet implemented"
69      timestamp <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]
70      description <- "Not yet implemented"      description <- "Not yet implemented"
71      id <- as.integer(xmlAttrs(node)[["itemid"]])      id <- as.integer(xmlAttrs(node)[["itemid"]])
72      origin <- "Not yet implemented"      origin <- "Not yet implemented"
     # TODO: Concatenate list elements (= XML paragraphs) to a single string  
73      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
74    
75        if (stripWhiteSpace)
76            corpus <- gsub("[[:space:]]+", " ", corpus)
77        if (toLower)
78            corpus <- tolower(corpus)
79    
80      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
81    
82      new("textdocument", author = author, timestamp = timestamp, description = description,      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
83          id = id, origin = origin, corpus = corpus, heading = heading)          description = description, id = id, origin = origin, heading = heading)
84    }
85    
86    # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
87    parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
88        author <- "Not yet implemented"
89        timestamp <- xmlValue(node[["DATE"]])
90        description <- "Not yet implemented"
91        id <- as.integer(xmlAttrs(node)[["NEWID"]])
92    
93        origin <- "Not yet implemented"
94    
95        # The <BODY></BODY> tag is unfortunately NOT obligatory!
96        if (!is.null(node[["TEXT"]][["BODY"]]))
97            corpus <- xmlValue(node[["TEXT"]][["BODY"]])
98        else
99            corpus <- ""
100    
101        if (stripWhiteSpace)
102            corpus <- gsub("[[:space:]]+", " ", corpus)
103        if (toLower)
104            corpus <- tolower(corpus)
105    
106        # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
107        if (!is.null(node[["TEXT"]][["TITLE"]]))
108            heading <- xmlValue(node[["TEXT"]][["TITLE"]])
109        else
110            heading <- ""
111    
112        new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
113            description = description, id = id, origin = origin, heading = heading)
114  }  }

Legend:
Removed from v.19  
changed lines
  Added in v.32

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge