[tm] Diff of /pkg/R/corpus.R

trunk/R/textdoccol.R, revision 22 (Sat Nov 19 16:58:34 2005 UTC)
pkg/R/corpus.R, revision 1108 (Fri Oct 22 18:32:47 2010 UTC)

Between these revisions the file was essentially rewritten: the S4 "textdoccol" class and its constructors were removed, and the S3-based PCorpus/VCorpus implementation was added.

Removed from v.22 (trunk/R/textdoccol.R):

# Author: Ingo Feinerer

# S4 class definition
# Text document collection
setClass("textdoccol",
         representation(tdm = "termdocmatrix"),
         contains = c("list"))

# Accessor functions as described in "S4 Classes in 15 pages, more or less"

if (!isGeneric("tdm")) {
    if (is.function("tdm"))
        fun <- tdm
    else
        fun <- function(object) standardGeneric("tdm")
    setGeneric("tdm", fun)
}
setMethod("tdm", "textdoccol", function(object) object@tdm)

# Constructors

setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
setMethod("textdoccol",
          c("character", "character", "logical", "logical", "character",
            "logical", "character", "integer", "integer", "logical"),
          function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",
                   stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {

              # Add a new type for each unique input source format
              type <- match.arg(inputType, c("RCV1", "CSV"))
              switch(type,
                     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
                     "RCV1" = {
                         require(XML)

                         tree <- xmlTreeParse(object)
                         tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
                     },
                     # Text in CSV format (as e.g. exported from an Excel sheet)
                     "CSV" = {
                         m <- as.matrix(read.csv(object))
                         l <- vector("list", dim(m)[1])
                         for (i in 1:dim(m)[1]) {
                             author <- "Not yet implemented"
                             timestamp <- date()
                             description <- "Not yet implemented"
                             id <- i
                             corpus <- as.character(m[i, 2:dim(m)[2]])
                             if (stripWhiteSpace)
                                 corpus <- gsub("[[:space:]]+", " ", corpus)
                             if (toLower)
                                 corpus <- tolower(corpus)
                             origin <- "Not yet implemented"
                             heading <- "Not yet implemented"

                             l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
                                 description = description, id = id, origin = origin, heading = heading)
                         }
                         tdcl <- new("textdoccol", .Data = l)
                     }
                     )

              tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)

              tdcl
          })

# Parse a <newsitem></newsitem> element from a valid RCV1 XML file
parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    author <- "Not yet implemented"
    timestamp <- xmlAttrs(node)[["date"]]
    description <- "Not yet implemented"
    id <- as.integer(xmlAttrs(node)[["itemid"]])
    origin <- "Not yet implemented"
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)

    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    heading <- xmlValue(node[["title"]])

    new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
        description = description, id = id, origin = origin, heading = heading)
}

Added in v.1108 (pkg/R/corpus.R):

# Author: Ingo Feinerer

.PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
    attr(x, "CMetaData") <- cmeta
    attr(x, "DMetaData") <- dmeta
    attr(x, "DBControl") <- dbcontrol
    class(x) <- c("PCorpus", "Corpus", "list")
    x
}
DBControl <- function(x) attr(x, "DBControl")

PCorpus <- function(x,
                    readerControl = list(reader = x$DefaultReader, language = "en"),
                    dbControl = list(dbName = "", dbType = "DB1"),
                    ...) {
    readerControl <- prepareReader(readerControl, x$DefaultReader, ...)

    if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
        stop("error in creating database")
    db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)

    # Allocate memory in advance if length is known
    tdl <- if (x$Length > 0)
        vector("list", as.integer(x$Length))
    else
        list()

    counter <- 1
    while (!eoi(x)) {
        x <- stepNext(x)
        elem <- getElem(x)
        doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
        filehash::dbInsert(db, ID(doc), doc)
        if (x$Length > 0) tdl[[counter]] <- ID(doc)
        else tdl <- c(tdl, ID(doc))
        counter <- counter + 1
    }
    names(tdl) <- x$Names

    df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
    filehash::dbInsert(db, "DMetaData", df)
    dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))

    .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
}
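
An editorial sketch (not part of the revision): building a small database-backed corpus with the PCorpus() constructor above. It assumes the tm package is attached and provides a VectorSource() source constructor; the database file name "pcorpus.db" is made up for the example.

library("tm")
pc <- PCorpus(VectorSource(c("First document.", "Second document.")),
              dbControl = list(dbName = "pcorpus.db", dbType = "DB1"))
print(pc)   # prints: A corpus with 2 text documents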

.VCorpus <- function(x, cmeta, dmeta) {
    attr(x, "CMetaData") <- cmeta
    attr(x, "DMetaData") <- dmeta
    class(x) <- c("VCorpus", "Corpus", "list")
    x
}

# Register S3 corpus classes to be recognized by S4 methods. This is
# mainly a fix to be compatible with packages which were originally
# developed to cooperate with the corresponding S4 tm classes. Necessary
# since tm's class architecture was changed to S3 as of tm version 0.5.
setOldClass(c("VCorpus", "Corpus", "list"))
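
Because of the setOldClass() registration above, S4 generics can dispatch on these S3 corpus classes. A minimal editorial sketch (the generic "describe" is a made-up name, not part of the package):

setGeneric("describe", function(x) standardGeneric("describe"))
setMethod("describe", "VCorpus",
          function(x) sprintf("A volatile corpus holding %d documents", length(x)))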

# The "..." are additional arguments for the FunctionGenerator reader
VCorpus <- Corpus <- function(x,
                              readerControl = list(reader = x$DefaultReader, language = "en"),
                              ...) {
    readerControl <- prepareReader(readerControl, x$DefaultReader, ...)

    # Allocate memory in advance if length is known
    tdl <- if (x$Length > 0)
        vector("list", as.integer(x$Length))
    else
        list()

    if (x$Vectorized)
        tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
                      pGetElem(x),
                      id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
                      SIMPLIFY = FALSE)
    else {
        counter <- 1
        while (!eoi(x)) {
            x <- stepNext(x)
            elem <- getElem(x)
            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
            if (x$Length > 0)
                tdl[[counter]] <- doc
            else
                tdl <- c(tdl, list(doc))
            counter <- counter + 1
        }
    }
    names(tdl) <- x$Names
    df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
    .VCorpus(tdl, .MetaDataNode(), df)
}
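
An editorial sketch of the in-memory constructor (again assuming the package's VectorSource(); not part of the revision):

docs <- Corpus(VectorSource(c("Hello corpus.", "A second text document.")))
length(docs)   # 2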

`[.PCorpus` <- function(x, i) {
    if (missing(i)) return(x)
    index <- attr(x, "DMetaData")[[1, "subset"]]
    attr(x, "DMetaData")[[1, "subset"]] <- if (is.numeric(index)) index[i] else i
    dmeta <- attr(x, "DMetaData")
    .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
}

`[.VCorpus` <- function(x, i) {
    if (missing(i)) return(x)
    .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
}
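
Subsetting with "[" keeps the corpus class and narrows the document-level metadata. A sketch, reusing the assumed VectorSource():

docs <- Corpus(VectorSource(c("First.", "Second.", "Third.")))
sub <- docs[1:2]
class(sub)    # "VCorpus" "Corpus" "list"
print(sub)    # A corpus with 2 text documents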

`[<-.PCorpus` <- function(x, i, value) {
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    counter <- 1
    for (id in unclass(x)[i]) {
        if (identical(length(value), 1L)) db[[id]] <- value
        else db[[id]] <- value[[counter]]
        counter <- counter + 1
    }
    x
}

.map_name_index <- function(x, i) {
    if (is.character(i)) {
        if (is.null(names(x)))
            match(i, meta(x, "ID", type = "local"))
        else
            match(i, names(x))
    }
    i
}

`[[.PCorpus` <- function(x, i) {
    i <- .map_name_index(x, i)
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    filehash::dbFetch(db, NextMethod("[["))
}
`[[.VCorpus` <- function(x, i) {
    i <- .map_name_index(x, i)
    lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap))
        .Call("copyCorpus", x, materialize(x, i))
    NextMethod("[[")
}
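
A sketch of single-document access (continuing the assumptions above): "[[" returns one document; for a PCorpus it is fetched from the filehash database, for a VCorpus any pending lazy mappings are materialized first.

docs <- Corpus(VectorSource(c("First.", "Second.")))
doc <- docs[[2]]
inherits(doc, "TextDocument")   # expected TRUE for the standard readers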

`[[<-.PCorpus` <- function(x, i, value) {
    i <- .map_name_index(x, i)
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    index <- unclass(x)[[i]]
    db[[index]] <- value
    x
}
`[[<-.VCorpus` <- function(x, i, value) {
    i <- .map_name_index(x, i)
    # Mark new objects as not active for lazy mapping
    lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap)) {
        lazyTmMap$index[i] <- FALSE
        meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
    }
    # Set the value
    cl <- class(x)
    y <- NextMethod("[[<-")
    class(y) <- cl
    y
}

# Update NodeIDs of a CMetaData tree
.update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
    # Traversal of (binary) CMetaData tree with setup of NodeIDs
    set_id <- function(x) {
        x$NodeID <- id
        id <<- id + 1
        level <<- level + 1
        if (length(x$Children) > 0) {
            mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
            left <- set_id(x$Children[[1]])
            if (level == 1) {
                left.mapping <<- mapping
                mapping <<- NULL
            }
            mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
            right <- set_id(x$Children[[2]])

            x$Children <- list(left, right)
        }
        level <<- level - 1
        x
    }
    list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
}

# Find indices to be updated for a CMetaData tree
.find_indices <- function(x) {
    indices.mapping <- NULL
    for (m in levels(as.factor(DMetaData(x)$MetaID))) {
        indices <- (DMetaData(x)$MetaID == m)
        indices.mapping <- c(indices.mapping, list(m = indices))
        names(indices.mapping)[length(indices.mapping)] <- m
    }
    indices.mapping
}

c2 <- function(x, y, ...) {
    # Update the CMetaData tree
    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
    update.struct <- .update_id(cmeta)

    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)

    # Find indices to be updated for the left tree
    indices.mapping <- .find_indices(x)

    # Update the DMetaData data frames for the left tree
    for (i in 1:ncol(update.struct$left.mapping)) {
        map <- update.struct$left.mapping[, i]
        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
    }

    # Find indices to be updated for the right tree
    indices.mapping <- .find_indices(y)

    # Update the DMetaData data frames for the right tree
    for (i in 1:ncol(update.struct$right.mapping)) {
        map <- update.struct$right.mapping[, i]
        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
    }

    # Merge the DMetaData data frames
    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
    na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
    na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)

    new
}

c.Corpus <-
function(x, ..., recursive = FALSE)
{
    args <- list(...)

    if (identical(length(args), 0L))
        return(x)

    if (!all(unlist(lapply(args, inherits, class(x)))))
        stop("not all arguments are of the same corpus type")

    if (inherits(x, "PCorpus"))
        stop("concatenation of corpora with underlying databases is not supported")

    l <- base::c(list(x), args)
    if (recursive)
        Reduce(c2, l)
    else {
        l <- do.call("c", lapply(l, unclass))
        .VCorpus(l,
                 cmeta = .MetaDataNode(),
                 dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
    }
}
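
A concatenation sketch under the same assumptions (volatile corpora only, since c.Corpus() above rejects database-backed PCorpus objects):

a <- Corpus(VectorSource(c("One.", "Two.")))
b <- Corpus(VectorSource("Three."))
both <- c(a, b)
length(both)   # 3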

c.TextDocument <- function(x, ..., recursive = FALSE) {
    args <- list(...)

    if (identical(length(args), 0L))
        return(x)

    if (!all(unlist(lapply(args, inherits, class(x)))))
        stop("not all arguments are text documents")

    dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
    .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
}

print.Corpus <- function(x, ...) {
    cat(sprintf(ngettext(length(x),
                         "A corpus with %d text document\n",
                         "A corpus with %d text documents\n"),
                length(x)))
    invisible(x)
}

summary.Corpus <- function(object, ...) {
    print(object)
    if (length(DMetaData(object)) > 0) {
        cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
                             "\nThe metadata consists of %d tag-value pair and a data frame\n",
                             "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
                    length(CMetaData(object)$MetaData)))
        cat("Available tags are:\n")
        cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
        cat("Available variables in the data frame are:\n")
        cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
    }
}
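
Expected console behaviour, sketched (the exact tag names depend on .MetaDataNode(), which is defined elsewhere in the package):

docs <- Corpus(VectorSource(c("First.", "Second.")))
summary(docs)
# A corpus with 2 text documents
# ... followed by the corpus-level tags and the DMetaData variables (MetaID, ...)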

inspect <- function(x) UseMethod("inspect", x)
inspect.PCorpus <- function(x) {
    summary(x)
    cat("\n")
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    show(filehash::dbMultiFetch(db, unlist(x)))
}
inspect.VCorpus <- function(x) {
    summary(x)
    cat("\n")
    print(noquote(lapply(x, identity)))
}

lapply.PCorpus <- function(X, FUN, ...) {
    db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
    lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
}
lapply.VCorpus <- function(X, FUN, ...) {
    lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap))
        .Call("copyCorpus", X, materialize(X))
    base::lapply(X, FUN, ...)
}
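
A sketch of applying a function over the documents. Calling the method directly avoids assuming how the package turns lapply() into a generic elsewhere:

docs <- Corpus(VectorSource(c("First.", "Second document.")))
lapply.VCorpus(docs, nchar)   # character counts per document (documents are character-based)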

writeCorpus <- function(x, path = ".", filenames = NULL) {
    filenames <- file.path(path,
                           if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
                           else filenames)
    i <- 1
    for (o in x) {
        writeLines(as.PlainTextDocument(o), filenames[i])
        i <- i + 1
    }
}
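
A final usage sketch, assuming the package's as.PlainTextDocument() coercion works for the documents in the corpus; each document is written to its own <ID>.txt file:

docs <- Corpus(VectorSource(c("First.", "Second.")))
writeCorpus(docs, path = tempdir())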
