SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory | Revision Log | View Patch

revision 23, Sat Nov 19 18:25:41 2005 UTC | revision 32, Thu Dec 15 13:13:54 2005 UTC
# Line 3  Line 3 
3  # S4 class definition  # S4 class definition
4  # Text document collection  # Text document collection
5  setClass("textdoccol",  setClass("textdoccol",
          representation(tdm = "termdocmatrix"),  
6           contains = c("list"))           contains = c("list"))
7    
 # Accessor functions as described in "S4 Classes in 15 pages, more or less"  
   
 if (!isGeneric("tdm")) {  
     if (is.function("tdm"))  
         fun <- tdm  
     else  
         fun <- function(object) standardGeneric("tdm")  
     setGeneric("tdm", fun)  
 }  
 setMethod("tdm", "textdoccol", function(object) object@tdm)  
   
8  # Constructors  # Constructors
9    
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
11  setMethod("textdoccol",  setMethod("textdoccol",
12            c("character", "character", "logical", "logical",  "character",            c("character", "character", "logical", "logical"),
13              "logical", "character", "integer", "integer", "logical"),            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {
           function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",  
                    stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {  
14    
15                # Add a new type for each unique input source format                # Add a new type for each unique input source format
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
17                switch(type,                switch(type,
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
19                         # For the moment the first argument is still a single file
20                         # This will be changed to a directory as soon as we have the full RCV1 data set
21                       "RCV1" = {                       "RCV1" = {
                          require(XML)  
   
22                           tree <- xmlTreeParse(object)                           tree <- xmlTreeParse(object)
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
24                       },                       },
25                       # Text in CSV format (as e.g. exported from an Excel sheet)                       # Text in a special CSV format (as e.g. exported from an Excel sheet)
26                         # For details on the file format see data/Umfrage.csv
27                         # The first argument has to be a single file
28                       "CSV" = {                       "CSV" = {
29                           m <- as.matrix(read.csv(object))                           m <- as.matrix(read.csv(object))
30                           l <- vector("list", dim(m)[1])                           l <- vector("list", dim(m)[1])
# Line 62  Line 50 
50                       # Typically the first argument will be a directory where we can                       # Typically the first argument will be a directory where we can
51                       # find the files reut2-000.xml ... reut2-021.xml                       # find the files reut2-000.xml ... reut2-021.xml
52                       "REUT21578" = {                       "REUT21578" = {
53                           require(XML)                           tdl <- sapply(dir(object,
54                                               pattern = ".xml",
55                           # TODO: Read in a whole directory of XML files                                             full.names = TRUE),
56                           # lapply(dir(object, full.names = TRUE), function)                                         function(file) {
57                           # object is for the moment still a single XML file                                             tree <- xmlTreeParse(file)
58                           tree <- xmlTreeParse(object)                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
59                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower))                                         })
                      }  
                      )  
   
               tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)  
60    
61                             tdcl <- new("textdoccol", .Data = tdl)
62                         })
63                tdcl                tdcl
64            })            })
65    

Legend:
Removed from v.23  
changed lines
  Added in v.32

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge