SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 1073, Fri May 28 12:32:46 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  # TODO: Define proper S4 term-document matrix      attr(x, "DMetaData") <- dmeta
6  setClass("textdoccol", representation(docs = "list",      attr(x, "DBControl") <- dbcontrol
7                                        tdm = "matrix"))      class(x) <- c("PCorpus", "Corpus", "list")
8        x
9  # Accessor function  }
10  if (!isGeneric("docs")) {  DBControl <- function(x) attr(x, "DBControl")
11      if (is.function("docs"))  
12          fun <- docs  PCorpus <- function(x,
13                        readerControl = list(reader = x$DefaultReader, language = "en"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                        ...) {
16        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19            stop("error in creating database")
20        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22        # Allocate memory in advance if length is known
23        tdl <- if (x$Length > 0)x
24            vector("list", as.integer(x$Length))
25      else      else
26          fun <- function(object) standardGeneric("docs")          list()
27      setGeneric("docs", fun)  
28        counter <- 1
29        while (!eoi(x)) {
30            x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38        names(tdl) <- x$Names
39    
40        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41        filehash::dbInsert(db, "DMetaData", df)
42        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
43    
44        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
45    }
46    
47    .VCorpus <- function(x, cmeta, dmeta) {
48        attr(x, "CMetaData") <- cmeta
49        attr(x, "DMetaData") <- dmeta
50        class(x) <- c("VCorpus", "Corpus", "list")
51        x
52    }
53    
54    # Register S3 corpus classes to be recognized by S4 methods. This is
55    # mainly a fix to be compatible with packages which were originally
56    # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60    # The "..." are additional arguments for the FunctionGenerator reader
61    VCorpus <- Corpus <- function(x,
62                                  readerControl = list(reader = x$DefaultReader, language = "en"),
63                                  ...) {
64        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
66        # Allocate memory in advance if length is known
67        tdl <- if (x$Length > 0)
68            vector("list", as.integer(x$Length))
69        else
70            list()
71    
72        if (x$Vectorized)
73            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74                          pGetElem(x),
75                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                          SIMPLIFY = FALSE)
77        else {
78            counter <- 1
79            while (!eoi(x)) {
80                x <- stepNext(x)
81                elem <- getElem(x)
82                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83                if (x$Length > 0)
84                    tdl[[counter]] <- doc
85                else
86                    tdl <- c(tdl, list(doc))
87                counter <- counter + 1
88            }
89        }
90        names(tdl) <- x$Names
91        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92        .VCorpus(tdl, .MetaDataNode(), df)
93    }
94    
95    `[.PCorpus` <- function(x, i) {
96        if (missing(i)) return(x)
97        index <- attr(x, "DMetaData")[[1 , "subset"]]
98        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99        dmeta <- attr(x, "DMetaData")
100        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
101    }
102    
103    `[.VCorpus` <- function(x, i) {
104        if (missing(i)) return(x)
105        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
106    }
107    
108    `[<-.PCorpus` <- function(x, i, value) {
109        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110        counter <- 1
111        for (id in unclass(x)[i]) {
112            if (identical(length(value), 1L)) db[[id]] <- value
113            else db[[id]] <- value[[counter]]
114            counter <- counter + 1
115        }
116        x
117    }
118    
119    `[[.PCorpus` <-  function(x, i) {
120        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
121        filehash::dbFetch(db, NextMethod("[["))
122    }
123    `[[.VCorpus` <-  function(x, i) {
124        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
125        if (!is.null(lazyTmMap))
126            .Call("copyCorpus", x, materialize(x, i))
127        NextMethod("[[")
128    }
129    
130    `[[<-.PCorpus` <-  function(x, i, value) {
131        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132        index <- unclass(x)[[i]]
133        db[[index]] <- value
134        x
135    }
136    `[[<-.VCorpus` <-  function(x, i, value) {
137        # Mark new objects as not active for lazy mapping
138        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
139        if (!is.null(lazyTmMap)) {
140            lazyTmMap$index[i] <- FALSE
141            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
142        }
143        # Set the value
144        cl <- class(x)
145        y <- NextMethod("[[<-")
146        class(y) <- cl
147        y
148    }
149    
150    # Update NodeIDs of a CMetaData tree
151    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
152        # Traversal of (binary) CMetaData tree with setup of NodeIDs
153        set_id <- function(x) {
154            x$NodeID <- id
155            id <<- id + 1
156            level <<- level + 1
157            if (length(x$Children) > 0) {
158                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
159                left <- set_id(x$Children[[1]])
160                if (level == 1) {
161                    left.mapping <<- mapping
162                    mapping <<- NULL
163                }
164                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
165                right <- set_id(x$Children[[2]])
166    
167                x$Children <- list(left, right)
168            }
169            level <<- level - 1
170            x
171        }
172        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
173    }
174    
175    # Find indices to be updated for a CMetaData tree
176    .find_indices <- function(x) {
177        indices.mapping <- NULL
178        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
179            indices <- (DMetaData(x)$MetaID == m)
180            indices.mapping <- c(indices.mapping, list(m = indices))
181            names(indices.mapping)[length(indices.mapping)] <- m
182        }
183        indices.mapping
184  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
185    
186  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))  c2 <- function(x, y, ...) {
187  # Read in XML text documents      # Update the CMetaData tree
188  # Reuters Corpus Volume 1 (RCV1)      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
189  setMethod("textdoccol", "character", function(docs) {      update.struct <- .update_id(cmeta)
     require(XML)  
   
     tree <- xmlTreeParse(docs)  
     root <- xmlRoot(tree)  
   
     # TODO: At each loop node points to the current newsitem  
     node <- root  
   
     # TODO: Implement lacking fields.  
     # For this we need the full RCV1 XML set to know where to find those things  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
190    
191      heading <- xmlValue(node[["title"]])      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
192    
193      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,      # Find indices to be updated for the left tree
194                 id = id, origin = origin, corpus = corpus, heading = heading)      indices.mapping <- .find_indices(x)
195    
196        # Update the DMetaData data frames for the left tree
197        for (i in 1:ncol(update.struct$left.mapping)) {
198            map <- update.struct$left.mapping[,i]
199            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
200        }
201    
202        # Find indices to be updated for the right tree
203        indices.mapping <- .find_indices(y)
204    
205        # Update the DMetaData data frames for the right tree
206        for (i in 1:ncol(update.struct$right.mapping)) {
207            map <- update.struct$right.mapping[,i]
208            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
209        }
210    
211        # Merge the DMetaData data frames
212        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
213        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
214        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
215        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
216        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
217        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
218        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
219    
220        new
221    }
222    
223    c.Corpus <-
224    function(x, ..., recursive = FALSE)
225    {
226        args <- list(...)
227    
228        if (identical(length(args), 0L))
229            return(x)
230    
231        if (!all(unlist(lapply(args, inherits, class(x)))))
232            stop("not all arguments are of the same corpus type")
233    
234        if (inherits(x, "PCorpus"))
235            stop("concatenation of corpora with underlying databases is not supported")
236    
237        Reduce(c2, base::c(list(x), args))
238    }
239    
240      new("textdoccol", docs = list(doc), tdm = matrix())  c.TextDocument <- function(x, ..., recursive = FALSE) {
241  })      args <- list(...)
242    
243        if (identical(length(args), 0L))
244            return(x)
245    
246        if (!all(unlist(lapply(args, inherits, class(x)))))
247            stop("not all arguments are text documents")
248    
249        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
250        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
251    }
252    
253    print.Corpus <- function(x, ...) {
254        cat(sprintf(ngettext(length(x),
255                             "A corpus with %d text document\n",
256                             "A corpus with %d text documents\n"),
257                    length(x)))
258        invisible(x)
259    }
260    
261    summary.Corpus <- function(object, ...) {
262        print(object)
263        if (length(DMetaData(object)) > 0) {
264            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
265                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
266                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
267                        length(CMetaData(object)$MetaData)))
268            cat("Available tags are:\n")
269            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
270            cat("Available variables in the data frame are:\n")
271            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
272        }
273    }
274    
275    inspect <- function(x) UseMethod("inspect", x)
276    inspect.PCorpus <- function(x) {
277        summary(x)
278        cat("\n")
279        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
280        show(filehash::dbMultiFetch(db, unlist(x)))
281    }
282    inspect.VCorpus <- function(x) {
283        summary(x)
284        cat("\n")
285        print(noquote(lapply(x, identity)))
286    }
287    
288    lapply.PCorpus <- function(X, FUN, ...) {
289        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
290        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
291    }
292    lapply.VCorpus <- function(X, FUN, ...) {
293        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
294        if (!is.null(lazyTmMap))
295            .Call("copyCorpus", X, materialize(X))
296        base::lapply(X, FUN, ...)
297    }
298    
299    writeCorpus <-  function(x, path = ".", filenames = NULL) {
300        filenames <- file.path(path,
301                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
302                               else filenames)
303        i <- 1
304        for (o in x) {
305            writeLines(as.PlainTextDocument(o), filenames[i])
306            i <- i + 1
307        }
308    }

Legend:
Removed from v.18  
changed lines
  Added in v.1073

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge