SCM

SCM Repository

[tm] Diff of /trunk/tm/R/textdoccol.R
ViewVC logotype

Diff of /trunk/tm/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 721, Wed Mar 21 13:54:43 2007 UTC revision 722, Sun Apr 1 15:53:58 2007 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # The "..." are additional arguments for the FunctionGenerator parser  # The "..." are additional arguments for the FunctionGenerator reader
4  setGeneric("TextDocCol", function(object,  setGeneric("TextDocCol", function(object,
5                                    parserControl = list(parser = readPlain, language = "en_US", load = FALSE),                                    readerControl = list(reader = readPlain, language = "en_US", load = FALSE),
6                                    dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),                                    dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),
7                                    ...) standardGeneric("TextDocCol"))                                    ...) standardGeneric("TextDocCol"))
8  setMethod("TextDocCol",  setMethod("TextDocCol",
9            signature(object = "Source"),            signature(object = "Source"),
10            function(object,            function(object,
11                     parserControl = list(parser = readPlain, language = "en_US", load = FALSE),                     readerControl = list(reader = readPlain, language = "en_US", load = FALSE),
12                     dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),                     dbControl = list(useDb = FALSE, dbName = "", dbType = "DB1"),
13                     ...) {                     ...) {
14                if (inherits(parserControl$parser, "FunctionGenerator"))                if (attr(readerControl$reader, "FunctionGenerator"))
15                    parserControl$parser <- parserControl$parser(...)                    readerControl$reader <- readerControl$reader(...)
16    
17                if (dbControl$useDb) {                if (dbControl$useDb) {
18                    if (!dbCreate(dbControl$dbName, dbControl$dbType))                    if (!dbCreate(dbControl$dbName, dbControl$dbType))
# Line 28  Line 28 
28                    # If there is no Load on Demand support                    # If there is no Load on Demand support
29                    # we need to load the corpus into memory at startup                    # we need to load the corpus into memory at startup
30                    if (!object@LoDSupport)                    if (!object@LoDSupport)
31                        parserControl$load <- TRUE                        readerControl$load <- TRUE
32                    doc <- parserControl$parser(elem, parserControl$load, parserControl$language, as.character(counter))                    doc <- readerControl$reader(elem, readerControl$load, readerControl$language, as.character(counter))
33                    if (dbControl$useDb) {                    if (dbControl$useDb) {
34                        dbInsert(db, ID(doc), doc)                        dbInsert(db, ID(doc), doc)
35                        tdl <- c(tdl, ID(doc))                        tdl <- c(tdl, ID(doc))
# Line 108  Line 108 
108    
109  setGeneric("tmUpdate", function(object,  setGeneric("tmUpdate", function(object,
110                                  origin,                                  origin,
111                                  parserControl = list(parser = readPlain, language = "en_US", load = FALSE),                                  readerControl = list(reader = readPlain, language = "en_US", load = FALSE),
112                                  ...) standardGeneric("tmUpdate"))                                  ...) standardGeneric("tmUpdate"))
113  # Update is only supported for directories  # Update is only supported for directories
114  # At the moment no other LoD devices are available anyway  # At the moment no other LoD devices are available anyway
115  setMethod("tmUpdate",  setMethod("tmUpdate",
116            signature(object = "TextDocCol", origin = "DirSource"),            signature(object = "TextDocCol", origin = "DirSource"),
117            function(object, origin,            function(object, origin,
118                     parserControl = list(parser = readPlain, language = "en_US", load = FALSE),                     readerControl = list(reader = readPlain, language = "en_US", load = FALSE),
119                     ...) {                     ...) {
120                if (inherits(parserControl$parser, "FunctionGenerator"))                if (inherits(readerControl$reader, "FunctionGenerator"))
121                    parserControl$parser <- parserControl$parser(...)                    readerControl$reader <- readerControl$reader(...)
122    
123                object.filelist <- unlist(lapply(object, function(x) {as.character(URI(x))[2]}))                object.filelist <- unlist(lapply(object, function(x) {as.character(URI(x))[2]}))
124                new.files <- setdiff(origin@FileList, object.filelist)                new.files <- setdiff(origin@FileList, object.filelist)
# Line 126  Line 126 
126                for (filename in new.files) {                for (filename in new.files) {
127                    elem <- list(content = readLines(filename),                    elem <- list(content = readLines(filename),
128                                 uri = substitute(file(filename)))                                 uri = substitute(file(filename)))
129                    object <- appendElem(object, parserControl$parser(elem, parserControl$load, parserControl$language, filename))                    object <- appendElem(object, readerControl$reader(elem, readerControl$load, readerControl$language, filename))
130                }                }
131    
132                return(object)                return(object)
# Line 200  Line 200 
200                return(object)                return(object)
201            })            })
202    
203    setGeneric("removePunctuation", function(object, ...) standardGeneric("removePunctuation"))
204    setMethod("removePunctuation",
205              signature(object = "PlainTextDocument"),
206              function(object, ...) {
207                  Corpus(object) <- gsub("[[:punct:]]+", "", Corpus(object))
208                  return(object)
209              })
210    
211  setGeneric("removeWords", function(object, stopwords, ...) standardGeneric("removeWords"))  setGeneric("removeWords", function(object, stopwords, ...) standardGeneric("removeWords"))
212  setMethod("removeWords",  setMethod("removeWords",
213            signature(object = "PlainTextDocument", stopwords = "character"),            signature(object = "PlainTextDocument", stopwords = "character"),
# Line 530  Line 538 
538                                                "\nThe metadata consists of %d tag-value pairs and a data frame\n"),                                                "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
539                                         length(CMetaData(object)@MetaData)))                                         length(CMetaData(object)@MetaData)))
540                    cat("Available tags are:\n")                    cat("Available tags are:\n")
541                    cat(names(CMetaData(object)@MetaData), "\n")                    cat(strwrap(paste(names(CMetaData(object)@MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
542                    cat("Available variables in the data frame are:\n")                    cat("Available variables in the data frame are:\n")
543                    cat(names(DMetaData(object)), "\n")                    cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
544                }                }
545      })      })
546    

Legend:
Removed from v.721  
changed lines
  Added in v.722

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge