SCM

SCM Repository

[tm] Diff of /pkg/R/textdoccol.R
ViewVC logotype

Diff of /pkg/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 22, Sat Nov 19 16:58:34 2005 UTC revision 23, Sat Nov 19 18:25:41 2005 UTC
# Line 27  Line 27 
27                     stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {                     stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {
28    
29                # Add a new type for each unique input source format                # Add a new type for each unique input source format
30                type <- match.arg(inputType,c("RCV1","CSV"))                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
31                switch(type,                switch(type,
32                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
33                       "RCV1" = {                       "RCV1" = {
# Line 57  Line 57 
57                                   description = description, id = id, origin = origin, heading = heading)                                   description = description, id = id, origin = origin, heading = heading)
58                           }                           }
59                           tdcl <- new("textdoccol", .Data = l)                           tdcl <- new("textdoccol", .Data = l)
60                         },
61                         # Read in text documents in Reuters-21578 XML (not SGML) format
62                         # Typically the first argument will be a directory where we can
63                         # find the files reut2-000.xml ... reut2-021.xml
64                         "REUT21578" = {
65                             require(XML)
66    
67                             # TODO: Read in a whole directory of XML files
68                             # lapply(dir(object, full.names = TRUE), function)
69                             # object is for the moment still a single XML file
70                             tree <- xmlTreeParse(object)
71                             tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower))
72                       }                       }
73                       )                       )
74    
# Line 65  Line 77 
77                tdcl                tdcl
78            })            })
79    
80  # Parse a <newsitem></newsitem> element from a valid RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
81  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
82      author <- "Not yet implemented"      author <- "Not yet implemented"
83      timestamp <- xmlAttrs(node)[["date"]]      timestamp <- xmlAttrs(node)[["date"]]
# Line 83  Line 95 
95    
96      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
97          description = description, id = id, origin = origin, heading = heading)          description = description, id = id, origin = origin, heading = heading)
98    }
99    
100    # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
101    parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
102        author <- "Not yet implemented"
103        timestamp <- xmlValue(node[["DATE"]])
104        description <- "Not yet implemented"
105        id <- as.integer(xmlAttrs(node)[["NEWID"]])
106    
107        origin <- "Not yet implemented"
108    
109        # The <BODY></BODY> tag is unfortunately NOT obligatory!
110        if (!is.null(node[["TEXT"]][["BODY"]]))
111            corpus <- xmlValue(node[["TEXT"]][["BODY"]])
112        else
113            corpus <- ""
114    
115        if (stripWhiteSpace)
116            corpus <- gsub("[[:space:]]+", " ", corpus)
117        if (toLower)
118            corpus <- tolower(corpus)
119    
120        # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
121        if (!is.null(node[["TEXT"]][["TITLE"]]))
122            heading <- xmlValue(node[["TEXT"]][["TITLE"]])
123        else
124            heading <- ""
125    
126        new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
127            description = description, id = id, origin = origin, heading = heading)
128  }  }

Legend:
Removed from v.22  
changed lines
  Added in v.23

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge