SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 21, Sat Nov 19 10:23:19 2005 UTC pkg/R/corpus.R revision 1114, Fri Nov 26 14:05:54 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           representation(tdm = "termdocmatrix"),      attr(x, "DBControl") <- dbcontrol
7           contains = c("list"))      class(x) <- c("PCorpus", "Corpus", "list")
8        x
9  # Accessor functions as described in "S4 Classes in 15 pages, more or less"  }
10    DBControl <- function(x) attr(x, "DBControl")
11  if (!isGeneric("tdm")) {  
12      if (is.function("tdm"))  PCorpus <- function(x,
13          fun <- tdm                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                        ...) {
16        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (is.function(readerControl$init))
19            readerControl$init()
20    
21        if (is.function(readerControl$exit))
22            on.exit(readerControl$exit())
23    
24        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
25            stop("error in creating database")
26        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
27    
28        # Allocate memory in advance if length is known
29        tdl <- if (x$Length > 0)
30            vector("list", as.integer(x$Length))
31        else
32            list()
33    
34        counter <- 1
35        while (!eoi(x)) {
36            x <- stepNext(x)
37            elem <- getElem(x)
38            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
39            filehash::dbInsert(db, ID(doc), doc)
40            if (x$Length > 0) tdl[[counter]] <- ID(doc)
41            else tdl <- c(tdl, ID(doc))
42            counter <- counter + 1
43        }
44        names(tdl) <- x$Names
45    
46        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
47        filehash::dbInsert(db, "DMetaData", df)
48        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
49    
50        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
51    }
52    
53    .VCorpus <- function(x, cmeta, dmeta) {
54        attr(x, "CMetaData") <- cmeta
55        attr(x, "DMetaData") <- dmeta
56        class(x) <- c("VCorpus", "Corpus", "list")
57        x
58    }
59    
60    # Register S3 corpus classes to be recognized by S4 methods. This is
61    # mainly a fix to be compatible with packages which were originally
62    # developed to cooperate with corresponding S4 tm classes. Necessary
63    # since tm's class architecture was changed to S3 since tm version 0.5.
64    setOldClass(c("VCorpus", "Corpus", "list"))
65    
66    # The "..." are additional arguments for the FunctionGenerator reader
67    VCorpus <- Corpus <- function(x,
68                                  readerControl = list(reader = x$DefaultReader, language = "en"),
69                                  ...) {
70        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
71    
72        if (is.function(readerControl$init))
73            readerControl$init()
74    
75        if (is.function(readerControl$exit))
76            on.exit(readerControl$exit())
77    
78        # Allocate memory in advance if length is known
79        tdl <- if (x$Length > 0)
80            vector("list", as.integer(x$Length))
81        else
82            list()
83    
84        if (x$Vectorized)
85            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
86                          pGetElem(x),
87                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
88                          SIMPLIFY = FALSE)
89        else {
90            counter <- 1
91            while (!eoi(x)) {
92                x <- stepNext(x)
93                elem <- getElem(x)
94                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
95                if (x$Length > 0)
96                    tdl[[counter]] <- doc
97                else
98                    tdl <- c(tdl, list(doc))
99                counter <- counter + 1
100            }
101        }
102        names(tdl) <- x$Names
103        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
104        .VCorpus(tdl, .MetaDataNode(), df)
105    }
106    
107    `[.PCorpus` <- function(x, i) {
108        if (missing(i)) return(x)
109        index <- attr(x, "DMetaData")[[1 , "subset"]]
110        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111        dmeta <- attr(x, "DMetaData")
112        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
113    }
114    
115    `[.VCorpus` <- function(x, i) {
116        if (missing(i)) return(x)
117        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
118    }
119    
120    `[<-.PCorpus` <- function(x, i, value) {
121        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
122        counter <- 1
123        for (id in unclass(x)[i]) {
124            if (identical(length(value), 1L)) db[[id]] <- value
125            else db[[id]] <- value[[counter]]
126            counter <- counter + 1
127        }
128        x
129    }
130    
131    .map_name_index <- function(x, i) {
132        if (is.character(i)) {
133            if (is.null(names(x)))
134                match(i, meta(x, "ID", type = "local"))
135      else      else
136          fun <- function(object) standardGeneric("tdm")              match(i, names(x))
137      setGeneric("tdm", fun)      }
138        i
139  }  }
 setMethod("tdm", "textdoccol", function(object) object@tdm)  
140    
141  # Constructors  `[[.PCorpus` <-  function(x, i) {
142        i <- .map_name_index(x, i)
143        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144        filehash::dbFetch(db, NextMethod("[["))
145    }
146    `[[.VCorpus` <-  function(x, i) {
147        i <- .map_name_index(x, i)
148        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
149        if (!is.null(lazyTmMap))
150            .Call("copyCorpus", x, materialize(x, i))
151        NextMethod("[[")
152    }
153    
154  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  `[[<-.PCorpus` <-  function(x, i, value) {
155  # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      i <- .map_name_index(x, i)
156  setMethod("textdoccol",      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
157            c("character", "logical", "logical",  "character", "logical", "character", "integer", "integer", "logical"),      index <- unclass(x)[[i]]
158            function(object, stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf", stemming = FALSE,      db[[index]] <- value
159                     language = "german", minWordLength = 3, minDocFreq = 1, stopwords = NULL) {      x
160                require(XML)  }
161    `[[<-.VCorpus` <-  function(x, i, value) {
162                tree <- xmlTreeParse(object)      i <- .map_name_index(x, i)
163                tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      # Mark new objects as not active for lazy mapping
164                tdcl@tdm = termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords)      lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
165        if (!is.null(lazyTmMap)) {
166                tdcl          lazyTmMap$index[i] <- FALSE
167            })          meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
168        }
169  # TODO: Implement lacking fields.      # Set the value
170  # For this we need the full RCV1 XML set to know where to find those things      cl <- class(x)
171  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {      y <- NextMethod("[[<-")
172      author <- "Not yet implemented"      class(y) <- cl
173      timestamp <- xmlAttrs(node)[["date"]]      y
174      description <- "Not yet implemented"  }
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
175    
176      heading <- xmlValue(node[["title"]])  # Update NodeIDs of a CMetaData tree
177    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
178        # Traversal of (binary) CMetaData tree with setup of NodeIDs
179        set_id <- function(x) {
180            x$NodeID <- id
181            id <<- id + 1
182            level <<- level + 1
183            if (length(x$Children) > 0) {
184                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
185                left <- set_id(x$Children[[1]])
186                if (level == 1) {
187                    left.mapping <<- mapping
188                    mapping <<- NULL
189                }
190                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
191                right <- set_id(x$Children[[2]])
192    
193      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,              x$Children <- list(left, right)
194          description = description, id = id, origin = origin, heading = heading)          }
195            level <<- level - 1
196            x
197        }
198        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
199    }
200    
201    # Find indices to be updated for a CMetaData tree
202    .find_indices <- function(x) {
203        indices.mapping <- NULL
204        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
205            indices <- (DMetaData(x)$MetaID == m)
206            indices.mapping <- c(indices.mapping, list(m = indices))
207            names(indices.mapping)[length(indices.mapping)] <- m
208        }
209        indices.mapping
210    }
211    
212    c2 <- function(x, y, ...) {
213        # Update the CMetaData tree
214        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
215        update.struct <- .update_id(cmeta)
216    
217        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
218    
219        # Find indices to be updated for the left tree
220        indices.mapping <- .find_indices(x)
221    
222        # Update the DMetaData data frames for the left tree
223        for (i in 1:ncol(update.struct$left.mapping)) {
224            map <- update.struct$left.mapping[,i]
225            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
226        }
227    
228        # Find indices to be updated for the right tree
229        indices.mapping <- .find_indices(y)
230    
231        # Update the DMetaData data frames for the right tree
232        for (i in 1:ncol(update.struct$right.mapping)) {
233            map <- update.struct$right.mapping[,i]
234            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
235        }
236    
237        # Merge the DMetaData data frames
238        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
239        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
240        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
241        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
242        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
243        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
244        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
245    
246        new
247    }
248    
249    c.Corpus <-
250    function(x, ..., recursive = FALSE)
251    {
252        args <- list(...)
253    
254        if (identical(length(args), 0L))
255            return(x)
256    
257        if (!all(unlist(lapply(args, inherits, class(x)))))
258            stop("not all arguments are of the same corpus type")
259    
260        if (inherits(x, "PCorpus"))
261            stop("concatenation of corpora with underlying databases is not supported")
262    
263        l <- base::c(list(x), args)
264        if (recursive)
265            Reduce(c2, l)
266        else {
267            l <- do.call("c", lapply(l, unclass))
268            .VCorpus(l,
269                     cmeta = .MetaDataNode(),
270                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
271        }
272    }
273    
274    c.TextDocument <- function(x, ..., recursive = FALSE) {
275        args <- list(...)
276    
277        if (identical(length(args), 0L))
278            return(x)
279    
280        if (!all(unlist(lapply(args, inherits, class(x)))))
281            stop("not all arguments are text documents")
282    
283        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
284        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
285    }
286    
287    print.Corpus <- function(x, ...) {
288        cat(sprintf(ngettext(length(x),
289                             "A corpus with %d text document\n",
290                             "A corpus with %d text documents\n"),
291                    length(x)))
292        invisible(x)
293    }
294    
295    summary.Corpus <- function(object, ...) {
296        print(object)
297        if (length(DMetaData(object)) > 0) {
298            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
299                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
300                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
301                        length(CMetaData(object)$MetaData)))
302            cat("Available tags are:\n")
303            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
304            cat("Available variables in the data frame are:\n")
305            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
306        }
307    }
308    
309    inspect <- function(x) UseMethod("inspect", x)
310    inspect.PCorpus <- function(x) {
311        summary(x)
312        cat("\n")
313        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
314        show(filehash::dbMultiFetch(db, unlist(x)))
315    }
316    inspect.VCorpus <- function(x) {
317        summary(x)
318        cat("\n")
319        print(noquote(lapply(x, identity)))
320    }
321    
322    lapply.PCorpus <- function(X, FUN, ...) {
323        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
324        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
325    }
326    lapply.VCorpus <- function(X, FUN, ...) {
327        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
328        if (!is.null(lazyTmMap))
329            .Call("copyCorpus", X, materialize(X))
330        base::lapply(X, FUN, ...)
331    }
332    
333    writeCorpus <-  function(x, path = ".", filenames = NULL) {
334        filenames <- file.path(path,
335                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
336                               else filenames)
337        i <- 1
338        for (o in x) {
339            writeLines(as.PlainTextDocument(o), filenames[i])
340            i <- i + 1
341        }
342  }  }

Legend:
Removed from v.21  
changed lines
  Added in v.1114

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge