SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 39, Sat Jan 21 09:37:39 2006 UTC revision 40, Tue Feb 14 15:02:45 2006 UTC
# Line 5  Line 5 
5            c("character", "character", "logical", "logical"),            c("character", "character", "logical", "logical"),
6            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {
7                # Add a new type for each unique input source format                # Add a new type for each unique input source format
8                type <- match.arg(inputType,c("CSV","RCV1","REUT21578"))                type <- match.arg(inputType,c("CSV", "RCV1", "REUT21578", "RIS"))
9                switch(type,                switch(type,
10                       # Text in a special CSV format                       # Text in a special CSV format
11                       # For details on the file format see the R documentation file                       # For details on the file format see the R documentation file
# Line 67  Line 67 
67                               tdcl <- new("textdoccol", .Data = unlist(tdl, recursive = FALSE))                               tdcl <- new("textdoccol", .Data = unlist(tdl, recursive = FALSE))
68                           else                           else
69                               tdcl <- new("textdoccol", .Data = tdl)                               tdcl <- new("textdoccol", .Data = tdl)
70                         },
71                         # Read in HTML documents as used by http://ris.bka.gv.at/vwgh
72                         # The file name must be named according to the following schema:
73                         # Geschäftszahl.html, e.g. 2002130005.html
74                         "RIS" = {
75                             filelist <- dir(object, pattern = ".html", full.names = TRUE)
76                             tdl <- sapply(filelist,
77                                           function(file) {
78                                               l <- list()
79                                               l[[length(l) + 1]] <- parseHTML(file, stripWhiteSpace, toLower)
80                                               l
81                                           })
82                             tdcl <- new("textdoccol", .Data = tdl)
83                       })                       })
84                tdcl                tdcl
85            })            })
86    
87    # Parse an HTML document
88    parseHTML <- function(file, stripWhiteSpace = FALSE, toLower = FALSE) {
89        author <- ""
90        timestamp <- date()
91        description <- ""
92        id <- as.integer(gsub(".html", "", basename(file)))
93    
94        tree <- htmlTreeParse(file)
95        htmlElem <- unlist(tree$children$html$children)
96        textElem <- htmlElem[which(regexpr("text.value", names(htmlElem)) > 0)]
97        names(textElem) <- NULL
98    
99        corpus <- paste(textElem, collapse = " ")
100        origin <- ""
101    
102        if (stripWhiteSpace)
103            corpus <- gsub("[[:space:]]+", " ", corpus)
104        if (toLower)
105            corpus <- tolower(corpus)
106    
107        heading <- ""
108    
109        new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
110            description = description, id = id, origin = origin, heading = heading)
111    }
112    
113  # TODO: Implement lacking fields as soon I have access to the original RCV1  # TODO: Implement lacking fields as soon I have access to the original RCV1
114  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
115  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {

Legend:
Removed from v.39  
changed lines
  Added in v.40

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge