SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 985, Thu Aug 27 18:09:05 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  prepareReader <- function(readerControl, defaultReader = NULL, ...) {
4  # Text document collection      if (is.null(readerControl$reader))
5  # TODO: Define proper S4 term-document matrix          readerControl$reader <- defaultReader
6  setClass("textdoccol", representation(docs = "list",      if (inherits(readerControl$reader, "FunctionGenerator"))
7                                        tdm = "matrix"))          readerControl$reader <- readerControl$reader(...)
8        if (is.null(readerControl$language))
9  # Accessor function          readerControl$language <- "eng"
10  if (!isGeneric("docs")) {      readerControl
11      if (is.function("docs"))  }
12          fun <- docs  
13    # Node ID, actual meta data, and possibly other nodes as children
14    .MetaDataNode <- function(node = 0, meta = list(create_date = as.POSIXlt(Sys.time(), tz = "GMT"), creator = Sys.getenv("LOGNAME")), children = NULL) {
15        attr(node, "MetaData") <- meta
16        attr(node, "Children") <- children
17        class(node) <- c("MetaDataNode", "numeric")
18        node
19    }
20    
21    print.MetaDataNode <- function(x, ...)
22        print(attr(x, "MetaData"))
23    
24    .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
25        attr(x, "CMetaData") <- cmeta
26        attr(x, "DMetaData") <- dmeta
27        attr(x, "DBControl") <- dbcontrol
28        class(x) <- c("PCorpus", "Corpus", "list")
29        x
30    }
31    
32    PCorpus <- function(x,
33                        readerControl = list(reader = x$DefaultReader, language = "eng"),
34                        dbControl = list(dbName = "", dbType = "DB1"),
35                        ...) {
36        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
37    
38        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
39            stop("error in creating database")
40        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
41    
42        # Allocate memory in advance if length is known
43        tdl <- if (x$Length > 0)
44            vector("list", as.integer(x$Length))
45      else      else
46          fun <- function(object) standardGeneric("docs")          list()
47      setGeneric("docs", fun)  
48        counter <- 1
49        while (!eoi(x)) {
50            x <- stepNext(x)
51            elem <- getElem(x)
52            doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
53            filehash::dbInsert(db, ID(doc), doc)
54            if (x$Length > 0) tdl[[counter]] <- ID(doc)
55            else tdl <- c(tdl, ID(doc))
56            counter <- counter + 1
57  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
58    
59  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
60  # Read in XML text documents      filehash::dbInsert(db, "DMetaData", df)
61  # Reuters Corpus Volume 1 (RCV1)      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
 setMethod("textdoccol", "character", function(docs) {  
     require(XML)  
   
     tree <- xmlTreeParse(docs)  
     root <- xmlRoot(tree)  
   
     # TODO: At each loop node points to the current newsitem  
     node <- root  
   
     # TODO: Implement lacking fields.  
     # For this we need the full RCV1 XML set to know where to find those things  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
62    
63      heading <- xmlValue(node[["title"]])      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
64    }
65    
66      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,  .VCorpus <- function(x, cmeta, dmeta) {
67                 id = id, origin = origin, corpus = corpus, heading = heading)      attr(x, "CMetaData") <- cmeta
68        attr(x, "DMetaData") <- dmeta
69        class(x) <- c("VCorpus", "Corpus", "list")
70        x
71    }
72    
73    # The "..." are additional arguments for the FunctionGenerator reader
74    VCorpus <- Corpus <- function(x,
75                        readerControl = list(reader = x$DefaultReader, language = "eng"),
76                        ...) {
77        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
78    
79        # Allocate memory in advance if length is known
80        tdl <- if (x$Length > 0)
81            vector("list", as.integer(x$Length))
82        else
83            list()
84    
85        if (x$Vectorized)
86            tdl <- lapply(mapply(c, pGetElem(x), id = seq_len(x$Length), SIMPLIFY = FALSE),
87                          function(x) readerControl$reader(x[c("content", "uri")],
88                                                           readerControl$language,
89                                                           as.character(x$id)))
90        else {
91            counter <- 1
92            while (!eoi(x)) {
93                x <- stepNext(x)
94                elem <- getElem(x)
95                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
96                if (x$Length > 0)
97                    tdl[[counter]] <- doc
98                else
99                    tdl <- c(tdl, list(doc))
100                counter <- counter + 1
101            }
102        }
103    
104        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
105        .VCorpus(tdl, .MetaDataNode(), df)
106    }
107    
108      new("textdoccol", docs = list(doc), tdm = matrix())  `[.PCorpus` <- function(x, i) {
109  })      if (missing(i)) return(x)
110        cmeta <- CMetaData(x)
111        index <- attr(x, "DMetaData")[[1 , "subset"]]
112        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
113        dmeta <- attr(x, "DMetaData")
114        dbcontrol <- DBControl(x)
115        class(x) <- "list"
116        .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)
117    }
118    
119    `[.VCorpus` <- function(x, i) {
120        if (missing(i)) return(x)
121        cmeta <- CMetaData(x)
122        dmeta <- DMetaData(x)[i, , drop = FALSE]
123        class(x) <- "list"
124        .VCorpus(x[i, drop = FALSE], cmeta, dmeta)
125    }
126    
127    `[<-.PCorpus` <- function(x, i, value) {
128        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
129        counter <- 1
130        for (id in unclass(x)[i]) {
131            if (identical(length(value), 1)) db[[id]] <- value
132            else db[[id]] <- value[[counter]]
133            counter <- counter + 1
134        }
135        x
136    }
137    
138    `[[.PCorpus` <-  function(x, i) {
139        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
140        class(x) <- "list"
141        filehash::dbFetch(db, x[[i]])
142    }
143    `[[.VCorpus` <-  function(x, i) {
144        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
145        if (!is.null(lazyTmMap))
146            .Call("copyCorpus", x, materialize(x, i))
147        class(x) <- "list"
148        x[[i]]
149    }
150    
151    `[[<-.PCorpus` <-  function(x, i, value) {
152        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
153        index <- unclass(x)[[i]]
154        db[[index]] <- value
155        x
156    }
157    `[[<-.VCorpus` <-  function(x, i, value) {
158        # Mark new objects as not active for lazy mapping
159        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
160        if (!is.null(lazyTmMap)) {
161            lazyTmMap$index[i] <- FALSE
162            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
163        }
164        # Set the value
165        cl <- class(x)
166        class(x) <- "list"
167        x[[i]] <- value
168        class(x) <- cl
169        x
170    }
171    
172    # Update \code{NodeID}s of a CMetaData tree
173    update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
174        # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s
175        set_id <- function(x) {
176            attrs <- attributes(x)
177            x <- id
178            attributes(x) <- attrs
179            id <<- id + 1
180            level <<- level + 1
181            if (length(attr(x, "Children")) > 0) {
182                mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[1]]), id))
183                left <- set_id(attr(x, "Children")[[1]])
184                if (level == 1) {
185                    left.mapping <<- mapping
186                    mapping <<- NULL
187                }
188                mapping <<- cbind(mapping, c(as.numeric(attr(x, "Children")[[2]]), id))
189                right <- set_id(attr(x, "Children")[[2]])
190    
191                attr(x, "Children") <- list(left, right)
192            }
193            level <<- level - 1
194            x
195        }
196    
197        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
198    }
199    
200    c2 <- function(x, y, ...) {
201        # Update the CMetaData tree
202        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
203        update.struct <- update_id(cmeta)
204    
205        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
206    
207        # Find indices to be updated for the left tree
208        indices.mapping <- NULL
209        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
210            indices <- (DMetaData(x)$MetaID == m)
211            indices.mapping <- c(indices.mapping, list(m = indices))
212            names(indices.mapping)[length(indices.mapping)] <- m
213        }
214    
215        # Update the DMetaData data frames for the left tree
216        for (i in 1:ncol(update.struct$left.mapping)) {
217            map <- update.struct$left.mapping[,i]
218            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
219        }
220    
221        # Find indices to be updated for the right tree
222        indices.mapping <- NULL
223        for (m in levels(as.factor(DMetaData(y)$MetaID))) {
224            indices <- (DMetaData(y)$MetaID == m)
225            indices.mapping <- c(indices.mapping, list(m = indices))
226            names(indices.mapping)[length(indices.mapping)] <- m
227        }
228    
229        # Update the DMetaData data frames for the right tree
230        for (i in 1:ncol(update.struct$right.mapping)) {
231            map <- update.struct$right.mapping[,i]
232            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
233        }
234    
235        # Merge the DMetaData data frames
236        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
237        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
238        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
239        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
240        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
241        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
242        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
243    
244        new
245    }
246    
247    c.Corpus <-
248    function(x, ..., recursive = FALSE)
249    {
250        args <- list(...)
251    
252        if (identical(length(args), 0))
253            return(x)
254    
255        if (!all(unlist(lapply(args, inherits, class(x)))))
256            stop("not all arguments are of the same corpus type")
257    
258        if (inherits(x, "PCorpus"))
259            stop("concatenation of corpora with underlying databases is not supported")
260    
261        Reduce(c2, base::c(list(x), args))
262    }
263    
264    c.TextDocument <- function(x, ..., recursive = FALSE) {
265        args <- list(...)
266    
267        if (identical(length(args), 0))
268            return(x)
269    
270        if (!all(unlist(lapply(args, inherits, class(x)))))
271            stop("not all arguments are text documents")
272    
273        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
274        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
275    }
276    
277    print.Corpus <- function(x, ...) {
278        cat(sprintf(ngettext(length(x),
279                             "A corpus with %d text document\n",
280                             "A corpus with %d text documents\n"),
281                    length(x)))
282        invisible(x)
283    }
284    
285    summary.Corpus <- function(x, ...) {
286        print(x)
287        if (length(DMetaData(x)) > 0) {
288            cat(sprintf(ngettext(length(attr(CMetaData(x), "MetaData")),
289                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
290                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
291                        length(attr(CMetaData(x), "MetaData"))))
292            cat("Available tags are:\n")
293            cat(strwrap(paste(names(attr(CMetaData(x), "MetaData")), collapse = " "), indent = 2, exdent = 2), "\n")
294            cat("Available variables in the data frame are:\n")
295            cat(strwrap(paste(names(DMetaData(x)), collapse = " "), indent = 2, exdent = 2), "\n")
296        }
297    }
298    
299    inspect <- function(x) UseMethod("inspect", x)
300    inspect.PCorpus <- function(x) {
301        summary(x)
302        cat("\n")
303        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
304        show(filehash::dbMultiFetch(db, unlist(x)))
305    }
306    inspect.VCorpus <- function(x) {
307        summary(x)
308        cat("\n")
309        print(noquote(lapply(x, identity)))
310    }
311    
312    # No metadata is checked
313    `%IN%` <- function(x, y) UseMethod("%IN%", y)
314    `%IN%.PCorpus` <- function(x, y) {
315        db <- filehash::dbInit(DBControl(y)[["dbName"]], DBControl(y)[["dbType"]])
316        any(unlist(lapply(y, function(x, z) {x %in% Content(z)}, x)))
317    }
318    `%IN%.VCorpus` <- function(x, y) x %in% y
319    
320    lapply.PCorpus <- function(X, FUN, ...) {
321        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
322        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
323    }
324    lapply.VCorpus <- function(X, FUN, ...) {
325        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
326        if (!is.null(lazyTmMap))
327            .Call("copyCorpus", X, materialize(X))
328        base::lapply(X, FUN, ...)
329    }
330    
331    writeCorpus <-  function(x, path = ".", filenames = NULL) {
332        filenames <- file.path(path,
333                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
334                               else filenames)
335        i <- 1
336        for (o in x) {
337            writeLines(as.PlainTextDocument(o), filenames[i])
338            i <- i + 1
339        }
340    }

Legend:
Removed from v.18  
changed lines
  Added in v.985

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge