SCM

SCM Repository

[tm] Diff of /pkg/R/textdoccol.R
ViewVC logotype

Diff of /pkg/R/textdoccol.R

Parent Directory | Revision Log | View Patch

revision 23, Sat Nov 19 18:25:41 2005 UTC | revision 24, Sun Nov 20 15:31:34 2005 UTC
# Line 30  Line 30 
30                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
31                switch(type,                switch(type,
32                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
33                         # For the moment the first argument is still a single file
34                         # This will be changed to a directory as soon as we have the full RCV1 data set
35                       "RCV1" = {                       "RCV1" = {
36                           require(XML)                           require(XML)
37    
38                           tree <- xmlTreeParse(object)                           tree <- xmlTreeParse(object)
39                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
40                       },                       },
41                       # Text in CSV format (as e.g. exported from an Excel sheet)                       # Text in a special CSV format (as e.g. exported from an Excel sheet)
42                         # For details on the file format see data/Umfrage.csv
43                         # The first argument has to be a single file
44                       "CSV" = {                       "CSV" = {
45                           m <- as.matrix(read.csv(object))                           m <- as.matrix(read.csv(object))
46                           l <- vector("list", dim(m)[1])                           l <- vector("list", dim(m)[1])
# Line 64  Line 68 
68                       "REUT21578" = {                       "REUT21578" = {
69                           require(XML)                           require(XML)
70    
71                           # TODO: Read in a whole directory of XML files                           tdl <- sapply(dir(object,
72                           # lapply(dir(object, full.names = TRUE), function)                                             pattern = ".xml",
73                           # object is for the moment still a single XML file                                             full.names = TRUE),
74                           tree <- xmlTreeParse(object)                                         function(file) {
75                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower))                                             tree <- xmlTreeParse(file)
76                       }                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
77                       )                                         })
78    
79                             tdcl <- new("textdoccol", .Data = tdl)
80                         })
81    
82                tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)                tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)
83    

Legend:
Removed from v.23  
changed lines
  Added in v.24

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge