SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 1063, Fri Apr 9 10:36:39 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  # TODO: Define proper S4 term-document matrix      attr(x, "DMetaData") <- dmeta
6  setClass("textdoccol", representation(docs = "list",      attr(x, "DBControl") <- dbcontrol
7                                        tdm = "matrix"))      class(x) <- c("PCorpus", "Corpus", "list")
8        x
9  # Accessor function  }
10  if (!isGeneric("docs")) {  DBControl <- function(x) attr(x, "DBControl")
11      if (is.function("docs"))  
12          fun <- docs  PCorpus <- function(x,
13                        readerControl = list(reader = x$DefaultReader, language = "eng"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                        ...) {
16        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19            stop("error in creating database")
20        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22        # Allocate memory in advance if length is known
23        tdl <- if (x$Length > 0)
24            vector("list", as.integer(x$Length))
25      else      else
26          fun <- function(object) standardGeneric("docs")          list()
27      setGeneric("docs", fun)  
28        counter <- 1
29        while (!eoi(x)) {
30            x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38    
39        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
40        filehash::dbInsert(db, "DMetaData", df)
41        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
42    
43        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
44    }
45    
46    .VCorpus <- function(x, cmeta, dmeta) {
47        attr(x, "CMetaData") <- cmeta
48        attr(x, "DMetaData") <- dmeta
49        class(x) <- c("VCorpus", "Corpus", "list")
50        x
51    }
52    
53    # Register S3 corpus classes to be recognized by S4 methods. This is
54    # mainly a fix to be compatible with packages which were originally
55    # developed to cooperate with corresponding S4 tm classes. Necessary
56    # since tm's class architecture was changed to S3 since tm version 0.5.
57    setOldClass(c("VCorpus", "Corpus", "list"))
58    
59    # The "..." are additional arguments for the FunctionGenerator reader
60    VCorpus <- Corpus <- function(x,
61                                  readerControl = list(reader = x$DefaultReader, language = "eng"),
62                                  ...) {
63        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
64    
65        # Allocate memory in advance if length is known
66        tdl <- if (x$Length > 0)
67            vector("list", as.integer(x$Length))
68        else
69            list()
70    
71        if (x$Vectorized)
72            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
73                          pGetElem(x),
74                          id = as.character(seq_len(x$Length)),
75                          SIMPLIFY = FALSE)
76        else {
77            counter <- 1
78            while (!eoi(x)) {
79                x <- stepNext(x)
80                elem <- getElem(x)
81                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
82                if (x$Length > 0)
83                    tdl[[counter]] <- doc
84                else
85                    tdl <- c(tdl, list(doc))
86                counter <- counter + 1
87            }
88        }
89        names(tdl) <- x$Names
90        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
91        .VCorpus(tdl, .MetaDataNode(), df)
92    }
93    
94    `[.PCorpus` <- function(x, i) {
95        if (missing(i)) return(x)
96        index <- attr(x, "DMetaData")[[1 , "subset"]]
97        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
98        dmeta <- attr(x, "DMetaData")
99        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
100    }
101    
102    `[.VCorpus` <- function(x, i) {
103        if (missing(i)) return(x)
104        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
105    }
106    
107    `[<-.PCorpus` <- function(x, i, value) {
108        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
109        counter <- 1
110        for (id in unclass(x)[i]) {
111            if (identical(length(value), 1L)) db[[id]] <- value
112            else db[[id]] <- value[[counter]]
113            counter <- counter + 1
114        }
115        x
116    }
117    
118    `[[.PCorpus` <-  function(x, i) {
119        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
120        filehash::dbFetch(db, NextMethod("[["))
121    }
122    `[[.VCorpus` <-  function(x, i) {
123        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
124        if (!is.null(lazyTmMap))
125            .Call("copyCorpus", x, materialize(x, i))
126        NextMethod("[[")
127    }
128    
129    `[[<-.PCorpus` <-  function(x, i, value) {
130        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
131        index <- unclass(x)[[i]]
132        db[[index]] <- value
133        x
134    }
135    `[[<-.VCorpus` <-  function(x, i, value) {
136        # Mark new objects as not active for lazy mapping
137        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
138        if (!is.null(lazyTmMap)) {
139            lazyTmMap$index[i] <- FALSE
140            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
141        }
142        # Set the value
143        cl <- class(x)
144        y <- NextMethod("[[<-")
145        class(y) <- cl
146        y
147    }
148    
149    # Update NodeIDs of a CMetaData tree
150    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
151        # Traversal of (binary) CMetaData tree with setup of NodeIDs
152        set_id <- function(x) {
153            x$NodeID <- id
154            id <<- id + 1
155            level <<- level + 1
156            if (length(x$Children) > 0) {
157                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
158                left <- set_id(x$Children[[1]])
159                if (level == 1) {
160                    left.mapping <<- mapping
161                    mapping <<- NULL
162                }
163                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
164                right <- set_id(x$Children[[2]])
165    
166                x$Children <- list(left, right)
167            }
168            level <<- level - 1
169            x
170        }
171        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
172    }
173    
174    # Find indices to be updated for a CMetaData tree
175    .find_indices <- function(x) {
176        indices.mapping <- NULL
177        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
178            indices <- (DMetaData(x)$MetaID == m)
179            indices.mapping <- c(indices.mapping, list(m = indices))
180            names(indices.mapping)[length(indices.mapping)] <- m
181        }
182        indices.mapping
183  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
184    
185  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))  c2 <- function(x, y, ...) {
186  # Read in XML text documents      # Update the CMetaData tree
187  # Reuters Corpus Volume 1 (RCV1)      cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
188  setMethod("textdoccol", "character", function(docs) {      update.struct <- .update_id(cmeta)
     require(XML)  
   
     tree <- xmlTreeParse(docs)  
     root <- xmlRoot(tree)  
   
     # TODO: At each loop node points to the current newsitem  
     node <- root  
   
     # TODO: Implement lacking fields.  
     # For this we need the full RCV1 XML set to know where to find those things  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
189    
190      heading <- xmlValue(node[["title"]])      new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
191    
192      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,      # Find indices to be updated for the left tree
193                 id = id, origin = origin, corpus = corpus, heading = heading)      indices.mapping <- .find_indices(x)
194    
195        # Update the DMetaData data frames for the left tree
196        for (i in 1:ncol(update.struct$left.mapping)) {
197            map <- update.struct$left.mapping[,i]
198            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
199        }
200    
201        # Find indices to be updated for the right tree
202        indices.mapping <- .find_indices(y)
203    
204        # Update the DMetaData data frames for the right tree
205        for (i in 1:ncol(update.struct$right.mapping)) {
206            map <- update.struct$right.mapping[,i]
207            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
208        }
209    
210        # Merge the DMetaData data frames
211        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
212        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
213        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
214        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
215        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
216        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
217        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
218    
219        new
220    }
221    
222    c.Corpus <-
223    function(x, ..., recursive = FALSE)
224    {
225        args <- list(...)
226    
227        if (identical(length(args), 0L))
228            return(x)
229    
230        if (!all(unlist(lapply(args, inherits, class(x)))))
231            stop("not all arguments are of the same corpus type")
232    
233        if (inherits(x, "PCorpus"))
234            stop("concatenation of corpora with underlying databases is not supported")
235    
236        Reduce(c2, base::c(list(x), args))
237    }
238    
239      new("textdoccol", docs = list(doc), tdm = matrix())  c.TextDocument <- function(x, ..., recursive = FALSE) {
240  })      args <- list(...)
241    
242        if (identical(length(args), 0L))
243            return(x)
244    
245        if (!all(unlist(lapply(args, inherits, class(x)))))
246            stop("not all arguments are text documents")
247    
248        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
249        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
250    }
251    
252    print.Corpus <- function(x, ...) {
253        cat(sprintf(ngettext(length(x),
254                             "A corpus with %d text document\n",
255                             "A corpus with %d text documents\n"),
256                    length(x)))
257        invisible(x)
258    }
259    
260    summary.Corpus <- function(object, ...) {
261        print(object)
262        if (length(DMetaData(object)) > 0) {
263            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
264                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
265                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
266                        length(CMetaData(object)$MetaData)))
267            cat("Available tags are:\n")
268            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
269            cat("Available variables in the data frame are:\n")
270            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
271        }
272    }
273    
274    inspect <- function(x) UseMethod("inspect", x)
275    inspect.PCorpus <- function(x) {
276        summary(x)
277        cat("\n")
278        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
279        show(filehash::dbMultiFetch(db, unlist(x)))
280    }
281    inspect.VCorpus <- function(x) {
282        summary(x)
283        cat("\n")
284        print(noquote(lapply(x, identity)))
285    }
286    
287    lapply.PCorpus <- function(X, FUN, ...) {
288        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
289        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
290    }
291    lapply.VCorpus <- function(X, FUN, ...) {
292        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
293        if (!is.null(lazyTmMap))
294            .Call("copyCorpus", X, materialize(X))
295        base::lapply(X, FUN, ...)
296    }
297    
298    writeCorpus <-  function(x, path = ".", filenames = NULL) {
299        filenames <- file.path(path,
300                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
301                               else filenames)
302        i <- 1
303        for (o in x) {
304            writeLines(as.PlainTextDocument(o), filenames[i])
305            i <- i + 1
306        }
307    }

Legend:
Removed from v.18  
changed lines
  Added in v.1063

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge