SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 22, Sat Nov 19 16:58:34 2005 UTC pkg/R/corpus.R revision 1114, Fri Nov 26 14:05:54 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           representation(tdm = "termdocmatrix"),      attr(x, "DBControl") <- dbcontrol
7           contains = c("list"))      class(x) <- c("PCorpus", "Corpus", "list")
8        x
9  # Accessor functions as described in "S4 Classes in 15 pages, more or less"  }
10    DBControl <- function(x) attr(x, "DBControl")
11  if (!isGeneric("tdm")) {  
12      if (is.function("tdm"))  PCorpus <- function(x,
13          fun <- tdm                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                        ...) {
16        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (is.function(readerControl$init))
19            readerControl$init()
20    
21        if (is.function(readerControl$exit))
22            on.exit(readerControl$exit())
23    
24        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
25            stop("error in creating database")
26        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
27    
28        # Allocate memory in advance if length is known
29        tdl <- if (x$Length > 0)
30            vector("list", as.integer(x$Length))
31        else
32            list()
33    
34        counter <- 1
35        while (!eoi(x)) {
36            x <- stepNext(x)
37            elem <- getElem(x)
38            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
39            filehash::dbInsert(db, ID(doc), doc)
40            if (x$Length > 0) tdl[[counter]] <- ID(doc)
41            else tdl <- c(tdl, ID(doc))
42            counter <- counter + 1
43        }
44        names(tdl) <- x$Names
45    
46        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
47        filehash::dbInsert(db, "DMetaData", df)
48        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
49    
50        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
51    }
52    
53    .VCorpus <- function(x, cmeta, dmeta) {
54        attr(x, "CMetaData") <- cmeta
55        attr(x, "DMetaData") <- dmeta
56        class(x) <- c("VCorpus", "Corpus", "list")
57        x
58    }
59    
60    # Register S3 corpus classes to be recognized by S4 methods. This is
61    # mainly a fix to be compatible with packages which were originally
62    # developed to cooperate with corresponding S4 tm classes. Necessary
63    # since tm's class architecture was changed to S3 since tm version 0.5.
64    setOldClass(c("VCorpus", "Corpus", "list"))
65    
66    # The "..." are additional arguments for the FunctionGenerator reader
67    VCorpus <- Corpus <- function(x,
68                                  readerControl = list(reader = x$DefaultReader, language = "en"),
69                                  ...) {
70        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
71    
72        if (is.function(readerControl$init))
73            readerControl$init()
74    
75        if (is.function(readerControl$exit))
76            on.exit(readerControl$exit())
77    
78        # Allocate memory in advance if length is known
79        tdl <- if (x$Length > 0)
80            vector("list", as.integer(x$Length))
81        else
82            list()
83    
84        if (x$Vectorized)
85            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
86                          pGetElem(x),
87                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
88                          SIMPLIFY = FALSE)
89        else {
90            counter <- 1
91            while (!eoi(x)) {
92                x <- stepNext(x)
93                elem <- getElem(x)
94                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
95                if (x$Length > 0)
96                    tdl[[counter]] <- doc
97                else
98                    tdl <- c(tdl, list(doc))
99                counter <- counter + 1
100            }
101        }
102        names(tdl) <- x$Names
103        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
104        .VCorpus(tdl, .MetaDataNode(), df)
105    }
106    
107    `[.PCorpus` <- function(x, i) {
108        if (missing(i)) return(x)
109        index <- attr(x, "DMetaData")[[1 , "subset"]]
110        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111        dmeta <- attr(x, "DMetaData")
112        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
113    }
114    
115    `[.VCorpus` <- function(x, i) {
116        if (missing(i)) return(x)
117        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
118    }
119    
120    `[<-.PCorpus` <- function(x, i, value) {
121        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
122        counter <- 1
123        for (id in unclass(x)[i]) {
124            if (identical(length(value), 1L)) db[[id]] <- value
125            else db[[id]] <- value[[counter]]
126            counter <- counter + 1
127        }
128        x
129    }
130    
131    .map_name_index <- function(x, i) {
132        if (is.character(i)) {
133            if (is.null(names(x)))
134                match(i, meta(x, "ID", type = "local"))
135      else      else
136          fun <- function(object) standardGeneric("tdm")              match(i, names(x))
137      setGeneric("tdm", fun)      }
138        i
139  }  }
 setMethod("tdm", "textdoccol", function(object) object@tdm)  
140    
141  # Constructors  `[[.PCorpus` <-  function(x, i) {
142        i <- .map_name_index(x, i)
143        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144        filehash::dbFetch(db, NextMethod("[["))
145    }
146    `[[.VCorpus` <-  function(x, i) {
147        i <- .map_name_index(x, i)
148        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
149        if (!is.null(lazyTmMap))
150            .Call("copyCorpus", x, materialize(x, i))
151        NextMethod("[[")
152    }
153    
154  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  `[[<-.PCorpus` <-  function(x, i, value) {
155  setMethod("textdoccol",      i <- .map_name_index(x, i)
156            c("character", "character", "logical", "logical",  "character",      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
157              "logical", "character", "integer", "integer", "logical"),      index <- unclass(x)[[i]]
158            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf",      db[[index]] <- value
159                     stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {      x
160    }
161                # Add a new type for each unique input source format  `[[<-.VCorpus` <-  function(x, i, value) {
162                type <- match.arg(inputType,c("RCV1","CSV"))      i <- .map_name_index(x, i)
163                switch(type,      # Mark new objects as not active for lazy mapping
164                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
165                       "RCV1" = {      if (!is.null(lazyTmMap)) {
166                           require(XML)          lazyTmMap$index[i] <- FALSE
167            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
168                           tree <- xmlTreeParse(object)      }
169                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      # Set the value
170                       },      cl <- class(x)
171                       # Text in CSV format (as e.g. exported from an Excel sheet)      y <- NextMethod("[[<-")
172                       "CSV" = {      class(y) <- cl
173                           m <- as.matrix(read.csv(object))      y
174                           l <- vector("list", dim(m)[1])  }
                          for (i in 1:dim(m)[1]) {  
                              author <- "Not yet implemented"  
                              timestamp <- date()  
                              description <- "Not yet implemented"  
                              id <- i  
                              corpus <- as.character(m[i,2:dim(m)[2]])  
                              if (stripWhiteSpace)  
                                  corpus <- gsub("[[:space:]]+", " ", corpus)  
                              if (toLower)  
                                  corpus <- tolower(corpus)  
                              origin <- "Not yet implemented"  
                              heading <- "Not yet implemented"  
   
                              l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                  description = description, id = id, origin = origin, heading = heading)  
                          }  
                          tdcl <- new("textdoccol", .Data = l)  
                      }  
                      )  
   
               tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)  
   
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a valid RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
175    
176      heading <- xmlValue(node[["title"]])  # Update NodeIDs of a CMetaData tree
177    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
178        # Traversal of (binary) CMetaData tree with setup of NodeIDs
179        set_id <- function(x) {
180            x$NodeID <- id
181            id <<- id + 1
182            level <<- level + 1
183            if (length(x$Children) > 0) {
184                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
185                left <- set_id(x$Children[[1]])
186                if (level == 1) {
187                    left.mapping <<- mapping
188                    mapping <<- NULL
189                }
190                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
191                right <- set_id(x$Children[[2]])
192    
193      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,              x$Children <- list(left, right)
194          description = description, id = id, origin = origin, heading = heading)          }
195            level <<- level - 1
196            x
197        }
198        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
199    }
200    
201    # Find indices to be updated for a CMetaData tree
202    .find_indices <- function(x) {
203        indices.mapping <- NULL
204        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
205            indices <- (DMetaData(x)$MetaID == m)
206            indices.mapping <- c(indices.mapping, list(m = indices))
207            names(indices.mapping)[length(indices.mapping)] <- m
208        }
209        indices.mapping
210    }
211    
212    c2 <- function(x, y, ...) {
213        # Update the CMetaData tree
214        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
215        update.struct <- .update_id(cmeta)
216    
217        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
218    
219        # Find indices to be updated for the left tree
220        indices.mapping <- .find_indices(x)
221    
222        # Update the DMetaData data frames for the left tree
223        for (i in 1:ncol(update.struct$left.mapping)) {
224            map <- update.struct$left.mapping[,i]
225            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
226        }
227    
228        # Find indices to be updated for the right tree
229        indices.mapping <- .find_indices(y)
230    
231        # Update the DMetaData data frames for the right tree
232        for (i in 1:ncol(update.struct$right.mapping)) {
233            map <- update.struct$right.mapping[,i]
234            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
235        }
236    
237        # Merge the DMetaData data frames
238        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
239        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
240        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
241        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
242        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
243        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
244        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
245    
246        new
247    }
248    
249    c.Corpus <-
250    function(x, ..., recursive = FALSE)
251    {
252        args <- list(...)
253    
254        if (identical(length(args), 0L))
255            return(x)
256    
257        if (!all(unlist(lapply(args, inherits, class(x)))))
258            stop("not all arguments are of the same corpus type")
259    
260        if (inherits(x, "PCorpus"))
261            stop("concatenation of corpora with underlying databases is not supported")
262    
263        l <- base::c(list(x), args)
264        if (recursive)
265            Reduce(c2, l)
266        else {
267            l <- do.call("c", lapply(l, unclass))
268            .VCorpus(l,
269                     cmeta = .MetaDataNode(),
270                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
271        }
272    }
273    
274    c.TextDocument <- function(x, ..., recursive = FALSE) {
275        args <- list(...)
276    
277        if (identical(length(args), 0L))
278            return(x)
279    
280        if (!all(unlist(lapply(args, inherits, class(x)))))
281            stop("not all arguments are text documents")
282    
283        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
284        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
285    }
286    
287    print.Corpus <- function(x, ...) {
288        cat(sprintf(ngettext(length(x),
289                             "A corpus with %d text document\n",
290                             "A corpus with %d text documents\n"),
291                    length(x)))
292        invisible(x)
293    }
294    
295    summary.Corpus <- function(object, ...) {
296        print(object)
297        if (length(DMetaData(object)) > 0) {
298            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
299                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
300                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
301                        length(CMetaData(object)$MetaData)))
302            cat("Available tags are:\n")
303            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
304            cat("Available variables in the data frame are:\n")
305            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
306        }
307    }
308    
309    inspect <- function(x) UseMethod("inspect", x)
310    inspect.PCorpus <- function(x) {
311        summary(x)
312        cat("\n")
313        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
314        show(filehash::dbMultiFetch(db, unlist(x)))
315    }
316    inspect.VCorpus <- function(x) {
317        summary(x)
318        cat("\n")
319        print(noquote(lapply(x, identity)))
320    }
321    
322    lapply.PCorpus <- function(X, FUN, ...) {
323        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
324        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
325    }
326    lapply.VCorpus <- function(X, FUN, ...) {
327        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
328        if (!is.null(lazyTmMap))
329            .Call("copyCorpus", X, materialize(X))
330        base::lapply(X, FUN, ...)
331    }
332    
333    writeCorpus <-  function(x, path = ".", filenames = NULL) {
334        filenames <- file.path(path,
335                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
336                               else filenames)
337        i <- 1
338        for (o in x) {
339            writeLines(as.PlainTextDocument(o), filenames[i])
340            i <- i + 1
341        }
342  }  }

Legend:
Removed from v.22  
changed lines
  Added in v.1114

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge