SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 986, Tue Sep 1 15:33:30 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  prepareReader <- function(readerControl, defaultReader = NULL, ...) {
4  # Text document collection      if (is.null(readerControl$reader))
5  # TODO: Define proper S4 term-document matrix          readerControl$reader <- defaultReader
6  setClass("textdoccol", representation(docs = "list",      if (inherits(readerControl$reader, "FunctionGenerator"))
7                                        tdm = "matrix"))          readerControl$reader <- readerControl$reader(...)
8        if (is.null(readerControl$language))
9  # Accessor function          readerControl$language <- "eng"
10  if (!isGeneric("docs")) {      readerControl
11      if (is.function("docs"))  }
12          fun <- docs  
13    # Node ID, actual meta data, and possibly other nodes as children
14    .MetaDataNode <- function(nodeid = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {
15        structure(list(NodeID = nodeid, MetaData = meta, Children = children),
16                  class = "MetaDataNode")
17    }
18    
19    print.MetaDataNode <- function(x, ...)
20        print(x$MetaData)
21    
22    .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
23        attr(x, "CMetaData") <- cmeta
24        attr(x, "DMetaData") <- dmeta
25        attr(x, "DBControl") <- dbcontrol
26        class(x) <- c("PCorpus", "Corpus", "list")
27        x
28    }
29    
30    PCorpus <- function(x,
31                        readerControl = list(reader = x$DefaultReader, language = "eng"),
32                        dbControl = list(dbName = "", dbType = "DB1"),
33                        ...) {
34        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
35    
36        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
37            stop("error in creating database")
38        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
39    
40        # Allocate memory in advance if length is known
41        tdl <- if (x$Length > 0)
42            vector("list", as.integer(x$Length))
43      else      else
44          fun <- function(object) standardGeneric("docs")          list()
45      setGeneric("docs", fun)  
46        counter <- 1
47        while (!eoi(x)) {
48            x <- stepNext(x)
49            elem <- getElem(x)
50            doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
51            filehash::dbInsert(db, ID(doc), doc)
52            if (x$Length > 0) tdl[[counter]] <- ID(doc)
53            else tdl <- c(tdl, ID(doc))
54            counter <- counter + 1
55  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
56    
57  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
58  # Read in XML text documents      filehash::dbInsert(db, "DMetaData", df)
59  # Reuters Corpus Volume 1 (RCV1)      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
 setMethod("textdoccol", "character", function(docs) {  
     require(XML)  
   
     tree <- xmlTreeParse(docs)  
     root <- xmlRoot(tree)  
   
     # TODO: At each loop node points to the current newsitem  
     node <- root  
   
     # TODO: Implement lacking fields.  
     # For this we need the full RCV1 XML set to know where to find those things  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
60    
61      heading <- xmlValue(node[["title"]])      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
62    }
63    
64      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,  .VCorpus <- function(x, cmeta, dmeta) {
65                 id = id, origin = origin, corpus = corpus, heading = heading)      attr(x, "CMetaData") <- cmeta
66        attr(x, "DMetaData") <- dmeta
67        class(x) <- c("VCorpus", "Corpus", "list")
68        x
69    }
70    
71    # The "..." are additional arguments for the FunctionGenerator reader
72    VCorpus <- Corpus <- function(x,
73                        readerControl = list(reader = x$DefaultReader, language = "eng"),
74                        ...) {
75        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
76    
77        # Allocate memory in advance if length is known
78        tdl <- if (x$Length > 0)
79            vector("list", as.integer(x$Length))
80        else
81            list()
82    
83        if (x$Vectorized)
84            mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
85                   pGetElem(x),
86                   id = as.character(seq_len(x$Length)),
87                   SIMPLIFY = FALSE)
88        else {
89            counter <- 1
90            while (!eoi(x)) {
91                x <- stepNext(x)
92                elem <- getElem(x)
93                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
94                if (x$Length > 0)
95                    tdl[[counter]] <- doc
96                else
97                    tdl <- c(tdl, list(doc))
98                counter <- counter + 1
99            }
100        }
101    
102        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
103        .VCorpus(tdl, .MetaDataNode(), df)
104    }
105    
106      new("textdoccol", docs = list(doc), tdm = matrix())  `[.PCorpus` <- function(x, i) {
107  })      if (missing(i)) return(x)
108        cmeta <- CMetaData(x)
109        index <- attr(x, "DMetaData")[[1 , "subset"]]
110        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111        dmeta <- attr(x, "DMetaData")
112        dbcontrol <- DBControl(x)
113        class(x) <- "list"
114        .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)
115    }
116    
117    `[.VCorpus` <- function(x, i) {
118        if (missing(i)) return(x)
119        cmeta <- CMetaData(x)
120        dmeta <- DMetaData(x)[i, , drop = FALSE]
121        class(x) <- "list"
122        .VCorpus(x[i, drop = FALSE], cmeta, dmeta)
123    }
124    
125    `[<-.PCorpus` <- function(x, i, value) {
126        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
127        counter <- 1
128        for (id in unclass(x)[i]) {
129            if (identical(length(value), 1)) db[[id]] <- value
130            else db[[id]] <- value[[counter]]
131            counter <- counter + 1
132        }
133        x
134    }
135    
136    `[[.PCorpus` <-  function(x, i) {
137        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
138        class(x) <- "list"
139        filehash::dbFetch(db, x[[i]])
140    }
141    `[[.VCorpus` <-  function(x, i) {
142        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
143        if (!is.null(lazyTmMap))
144            .Call("copyCorpus", x, materialize(x, i))
145        class(x) <- "list"
146        x[[i]]
147    }
148    
149    `[[<-.PCorpus` <-  function(x, i, value) {
150        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
151        index <- unclass(x)[[i]]
152        db[[index]] <- value
153        x
154    }
155    `[[<-.VCorpus` <-  function(x, i, value) {
156        # Mark new objects as not active for lazy mapping
157        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
158        if (!is.null(lazyTmMap)) {
159            lazyTmMap$index[i] <- FALSE
160            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
161        }
162        # Set the value
163        cl <- class(x)
164        class(x) <- "list"
165        x[[i]] <- value
166        class(x) <- cl
167        x
168    }
169    
170    # Update \code{NodeID}s of a CMetaData tree
171    update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
172        # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s
173        set_id <- function(x) {
174            attrs <- attributes(x)
175            x <- id
176            attributes(x) <- attrs
177            id <<- id + 1
178            level <<- level + 1
179            if (length(attr(x, "Children")) > 0) {
180                mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))
181                left <- set_id(attr(x, "Children")[[1]])
182                if (level == 1) {
183                    left.mapping <<- mapping
184                    mapping <<- NULL
185                }
186                mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))
187                right <- set_id(attr(x, "Children")[[2]])
188    
189                attr(x, "Children") <- list(left, right)
190            }
191            level <<- level - 1
192            x
193        }
194    
195        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
196    }
197    
198    c2 <- function(x, y, ...) {
199        # Update the CMetaData tree
200        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
201        update.struct <- update_id(cmeta)
202    
203        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
204    
205        # Find indices to be updated for the left tree
206        indices.mapping <- NULL
207        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
208            indices <- (DMetaData(x)$MetaID == m)
209            indices.mapping <- c(indices.mapping, list(m = indices))
210            names(indices.mapping)[length(indices.mapping)] <- m
211        }
212    
213        # Update the DMetaData data frames for the left tree
214        for (i in 1:ncol(update.struct$left.mapping)) {
215            map <- update.struct$left.mapping[,i]
216            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
217        }
218    
219        # Find indices to be updated for the right tree
220        indices.mapping <- NULL
221        for (m in levels(as.factor(DMetaData(y)$MetaID))) {
222            indices <- (DMetaData(y)$MetaID == m)
223            indices.mapping <- c(indices.mapping, list(m = indices))
224            names(indices.mapping)[length(indices.mapping)] <- m
225        }
226    
227        # Update the DMetaData data frames for the right tree
228        for (i in 1:ncol(update.struct$right.mapping)) {
229            map <- update.struct$right.mapping[,i]
230            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
231        }
232    
233        # Merge the DMetaData data frames
234        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
235        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
236        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
237        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
238        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
239        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
240        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
241    
242        new
243    }
244    
245    c.Corpus <-
246    function(x, ..., recursive = FALSE)
247    {
248        args <- list(...)
249    
250        if (identical(length(args), 0))
251            return(x)
252    
253        if (!all(unlist(lapply(args, inherits, class(x)))))
254            stop("not all arguments are of the same corpus type")
255    
256        if (inherits(x, "PCorpus"))
257            stop("concatenation of corpora with underlying databases is not supported")
258    
259        Reduce(c2, base::c(list(x), args))
260    }
261    
262    c.TextDocument <- function(x, ..., recursive = FALSE) {
263        args <- list(...)
264    
265        if (identical(length(args), 0))
266            return(x)
267    
268        if (!all(unlist(lapply(args, inherits, class(x)))))
269            stop("not all arguments are text documents")
270    
271        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
272        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
273    }
274    
275    print.Corpus <- function(x, ...) {
276        cat(sprintf(ngettext(length(x),
277                             "A corpus with %d text document\n",
278                             "A corpus with %d text documents\n"),
279                    length(x)))
280        invisible(x)
281    }
282    
283    summary.Corpus <- function(x, ...) {
284        print(x)
285        if (length(DMetaData(x)) > 0) {
286            cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),
287                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
288                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
289                        length(attr(CMetaData(x), "MetaData"))))
290            cat("Available tags are:\n")
291            cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")
292            cat("Available variables in the data frame are:\n")
293            cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")
294        }
295    }
296    
297    inspect <- function(x) UseMethod("inspect", x)
298    inspect.PCorpus <- function(x) {
299        summary(x)
300        cat("\n")
301        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
302        show(filehash::dbMultiFetch(db, unlist(x)))
303    }
304    inspect.VCorpus <- function(x) {
305        summary(x)
306        cat("\n")
307        print(noquote(lapply(x, identity)))
308    }
309    
310    # No metadata is checked
311    `%IN%` <- function(x, y) UseMethod("%IN%", y)
312    `%IN%.PCorpus` <- function(x, y) {
313        db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])
314        any(unlist(lapply(y, function(x, z) {x %in% Content(z)}, x)))
315    }
316    `%IN%.VCorpus` <- function(x, y) x %in% y
317    
318    lapply.PCorpus <- function(X, FUN, ...) {
319        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
320        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
321    }
322    lapply.VCorpus <- function(X, FUN, ...) {
323        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
324        if (!is.null(lazyTmMap))
325            .Call("copyCorpus", X, materialize(X))
326        base::lapply(X, FUN, ...)
327    }
328    
329    writeCorpus <-  function(x, path = ".", filenames = NULL) {
330        filenames <- file.path(path,
331                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
332                               else filenames)
333        i <- 1
334        for (o in x) {
335            writeLines(as.PlainTextDocument(o), filenames[i])
336            i <- i + 1
337        }
338    }

Legend:
Removed from v.18  
changed lines
  Added in v.986

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge