SCM

SCM Repository

[tm] Diff of /pkg/R/textdoccol.R
ViewVC logotype

Diff of /pkg/R/textdoccol.R

Parent Directory | Revision Log | View Patch

revision 41, Sun Mar 12 17:14:15 2006 UTC | revision 42, Sat Jul 1 08:42:26 2006 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  setGeneric("textdoccol", function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) standardGeneric("textdoccol"))
4  setMethod("textdoccol",  setMethod("textdoccol",
5            c("character", "character", "logical", "logical"),            c("character"),
6            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {
7                # Add a new type for each unique input source format                # Add a new type for each unique input source format
8                type <- match.arg(inputType,c("CSV", "RCV1", "REUT21578", "RIS"))                type <- match.arg(inputType,c("CSV", "RCV1", "REUT21578", "RIS"))
# Line 18  Line 18 
18                                             l <- vector("list", dim(m)[1])                                             l <- vector("list", dim(m)[1])
19                                             for (i in 1:dim(m)[1]) {                                             for (i in 1:dim(m)[1]) {
20                                                 author <- ""                                                 author <- ""
21                                                 timestamp <- date()                                                 datetimestamp <- date()
22                                                 description <- ""                                                 description <- ""
23                                                 id <- as.integer(m[i,1])                                                 id <- as.integer(m[i,1])
24                                                 corpus <- as.character(m[i,2:dim(m)[2]])                                                 corpus <- as.character(m[i,2:dim(m)[2]])
# Line 29  Line 29 
29                                                 origin <- "CSV"                                                 origin <- "CSV"
30                                                 heading <- ""                                                 heading <- ""
31    
32                                                 l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,                                                 l[[i]] <- new("textdocument", .Data = corpus, author = author, datetimestamp = datetimestamp,
33                                                               description = description, id = id, origin = origin, heading = heading)                                                               description = description, id = id, origin = origin, heading = heading)
34                                             }                                             }
35                                             l                                             l
# Line 86  Line 86 
86                tdcl                tdcl
87            })            })
88    
89  # Parse an HTML document  # Parse an Austrian RIS HTML document
90  parseHTML <- function(file, stripWhiteSpace = FALSE, toLower = FALSE) {  parseHTML <- function(file, stripWhiteSpace = FALSE, toLower = FALSE) {
91      author <- ""      author <- ""
92      timestamp <- date()      datetimestamp <- date()
93      description <- ""      description <- ""
94    
95      tree <- htmlTreeParse(file)      tree <- htmlTreeParse(file)
# Line 120  Line 120 
120    
121      heading <- ""      heading <- ""
122    
123      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      new("textdocument", .Data = corpus, author = author, datetimestamp = datetimestamp,
124          description = description, id = id, origin = origin, heading = heading)          description = description, id = id, origin = origin, heading = heading)
125  }  }
126    
# Line 128  Line 128 
128  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
129  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
130      author <- "Not yet implemented"      author <- "Not yet implemented"
131      timestamp <- xmlAttrs(node)[["date"]]      datetimestamp <- xmlAttrs(node)[["date"]]
132      description <- "Not yet implemented"      description <- "Not yet implemented"
133      id <- as.integer(xmlAttrs(node)[["itemid"]])      id <- as.integer(xmlAttrs(node)[["itemid"]])
134      origin <- "Reuters Corpus Volume 1 XML"      origin <- "Reuters Corpus Volume 1 XML"
# Line 141  Line 141 
141    
142      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
143    
144      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      new("textdocument", .Data = corpus, author = author, datetimestamp = datetimestamp,
145          description = description, id = id, origin = origin, heading = heading)          description = description, id = id, origin = origin, heading = heading)
146  }  }
147    
# Line 153  Line 153 
153      else      else
154          author <- ""          author <- ""
155    
156      timestamp <- xmlValue(node[["DATE"]])      datetimestamp <- xmlValue(node[["DATE"]])
157      description <- ""      description <- ""
158      id <- as.integer(xmlAttrs(node)[["NEWID"]])      id <- as.integer(xmlAttrs(node)[["NEWID"]])
159    
# Line 176  Line 176 
176      else      else
177          heading <- ""          heading <- ""
178    
179      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      new("textdocument", .Data = corpus, author = author, datetimestamp = datetimestamp,
180          description = description, id = id, origin = origin, heading = heading)          description = description, id = id, origin = origin, heading = heading)
181  }  }

Legend:
Removed from v.41  
changed lines
  Added in v.42

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge