SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 1025, Fri Dec 11 08:56:22 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  # TODO: Define proper S4 term-document matrix      attr(x, "DMetaData") <- dmeta
6  setClass("textdoccol", representation(docs = "list",      attr(x, "DBControl") <- dbcontrol
7                                        tdm = "matrix"))      class(x) <- c("PCorpus", "Corpus", "list")
8        x
9  # Accessor function  }
10  if (!isGeneric("docs")) {  DBControl <- function(x) attr(x, "DBControl")
11      if (is.function("docs"))  
12          fun <- docs  PCorpus <- function(x,
13                        readerControl = list(reader = x$DefaultReader, language = "eng"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                        ...) {
16        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17    
18        if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19            stop("error in creating database")
20        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21    
22        # Allocate memory in advance if length is known
23        tdl <- if (x$Length > 0)
24            vector("list", as.integer(x$Length))
25      else      else
26          fun <- function(object) standardGeneric("docs")          list()
27      setGeneric("docs", fun)  
28        counter <- 1
29        while (!eoi(x)) {
30            x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38    
39        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
40        filehash::dbInsert(db, "DMetaData", df)
41        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
42    
43        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
44    }
45    
46    .VCorpus <- function(x, cmeta, dmeta) {
47        attr(x, "CMetaData") <- cmeta
48        attr(x, "DMetaData") <- dmeta
49        class(x) <- c("VCorpus", "Corpus", "list")
50        x
51    }
52    
53    # Register S3 corpus classes to be recognized by S4 methods. This is
54    # mainly a fix to be compatible with packages which were originally
55    # developed to cooperate with corresponding S4 tm classes. Necessary
56    # since tm's class architecture was changed to S3 since tm version 0.5.
57    setOldClass(c("VCorpus", "Corpus", "list"))
58    
59    # The "..." are additional arguments for the FunctionGenerator reader
60    VCorpus <- Corpus <- function(x,
61                        readerControl = list(reader = x$DefaultReader, language = "eng"),
62                        ...) {
63        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
64    
65        # Allocate memory in advance if length is known
66        tdl <- if (x$Length > 0)
67            vector("list", as.integer(x$Length))
68        else
69            list()
70    
71        if (x$Vectorized)
72            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
73                          pGetElem(x),
74                          id = as.character(seq_len(x$Length)),
75                          SIMPLIFY = FALSE)
76        else {
77            counter <- 1
78            while (!eoi(x)) {
79                x <- stepNext(x)
80                elem <- getElem(x)
81                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
82                if (x$Length > 0)
83                    tdl[[counter]] <- doc
84                else
85                    tdl <- c(tdl, list(doc))
86                counter <- counter + 1
87            }
88        }
89    
90        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
91        .VCorpus(tdl, .MetaDataNode(), df)
92  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
93    
94  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))  `[.PCorpus` <- function(x, i) {
95  # Read in XML text documents      if (missing(i)) return(x)
96  # Reuters Corpus Volume 1 (RCV1)      index <- attr(x, "DMetaData")[[1 , "subset"]]
97  setMethod("textdoccol", "character", function(docs) {      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
98      require(XML)      dmeta <- attr(x, "DMetaData")
99        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
100      tree <- xmlTreeParse(docs)  }
     root <- xmlRoot(tree)  
   
     # TODO: At each loop node points to the current newsitem  
     node <- root  
   
     # TODO: Implement lacking fields.  
     # For this we need the full RCV1 XML set to know where to find those things  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
101    
102      heading <- xmlValue(node[["title"]])  `[.VCorpus` <- function(x, i) {
103        if (missing(i)) return(x)
104        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
105    }
106    
107    `[<-.PCorpus` <- function(x, i, value) {
108        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
109        counter <- 1
110        for (id in unclass(x)[i]) {
111            if (identical(length(value), 1L)) db[[id]] <- value
112            else db[[id]] <- value[[counter]]
113            counter <- counter + 1
114        }
115        x
116    }
117    
118      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,  `[[.PCorpus` <-  function(x, i) {
119                 id = id, origin = origin, corpus = corpus, heading = heading)      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
120        filehash::dbFetch(db, NextMethod("[["))
121    }
122    `[[.VCorpus` <-  function(x, i) {
123        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
124        if (!is.null(lazyTmMap))
125            .Call("copyCorpus", x, materialize(x, i))
126        NextMethod("[[")
127    }
128    
129      new("textdoccol", docs = list(doc), tdm = matrix())  `[[<-.PCorpus` <-  function(x, i, value) {
130  })      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
131        index <- unclass(x)[[i]]
132        db[[index]] <- value
133        x
134    }
135    `[[<-.VCorpus` <-  function(x, i, value) {
136        # Mark new objects as not active for lazy mapping
137        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
138        if (!is.null(lazyTmMap)) {
139            lazyTmMap$index[i] <- FALSE
140            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
141        }
142        # Set the value
143        cl <- class(x)
144        y <- NextMethod("[[<-")
145        class(y) <- cl
146        y
147    }
148    
149    # Update NodeIDs of a CMetaData tree
150    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
151        # Traversal of (binary) CMetaData tree with setup of NodeIDs
152        set_id <- function(x) {
153            x$NodeID <- id
154            id <<- id + 1
155            level <<- level + 1
156            if (length(x$Children) > 0) {
157                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
158                left <- set_id(x$Children[[1]])
159                if (level == 1) {
160                    left.mapping <<- mapping
161                    mapping <<- NULL
162                }
163                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
164                right <- set_id(x$Children[[2]])
165    
166                x$Children <- list(left, right)
167            }
168            level <<- level - 1
169            x
170        }
171        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
172    }
173    
174    # Find indices to be updated for a CMetaData tree
175    .find_indices <- function(x) {
176        indices.mapping <- NULL
177        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
178            indices <- (DMetaData(x)$MetaID == m)
179            indices.mapping <- c(indices.mapping, list(m = indices))
180            names(indices.mapping)[length(indices.mapping)] <- m
181        }
182        indices.mapping
183    }
184    
185    c2 <- function(x, y, ...) {
186        # Update the CMetaData tree
187        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
188        update.struct <- .update_id(cmeta)
189    
190        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
191    
192        # Find indices to be updated for the left tree
193        indices.mapping <- .find_indices(x)
194    
195        # Update the DMetaData data frames for the left tree
196        for (i in 1:ncol(update.struct$left.mapping)) {
197            map <- update.struct$left.mapping[,i]
198            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
199        }
200    
201        # Find indices to be updated for the right tree
202        indices.mapping <- .find_indices(y)
203    
204        # Update the DMetaData data frames for the right tree
205        for (i in 1:ncol(update.struct$right.mapping)) {
206            map <- update.struct$right.mapping[,i]
207            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
208        }
209    
210        # Merge the DMetaData data frames
211        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
212        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
213        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
214        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
215        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
216        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
217        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
218    
219        new
220    }
221    
222    c.Corpus <-
223    function(x, ..., recursive = FALSE)
224    {
225        args <- list(...)
226    
227        if (identical(length(args), 0L))
228            return(x)
229    
230        if (!all(unlist(lapply(args, inherits, class(x)))))
231            stop("not all arguments are of the same corpus type")
232    
233        if (inherits(x, "PCorpus"))
234            stop("concatenation of corpora with underlying databases is not supported")
235    
236        Reduce(c2, base::c(list(x), args))
237    }
238    
239    c.TextDocument <- function(x, ..., recursive = FALSE) {
240        args <- list(...)
241    
242        if (identical(length(args), 0L))
243            return(x)
244    
245        if (!all(unlist(lapply(args, inherits, class(x)))))
246            stop("not all arguments are text documents")
247    
248        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
249        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
250    }
251    
252    print.Corpus <- function(x, ...) {
253        cat(sprintf(ngettext(length(x),
254                             "A corpus with %d text document\n",
255                             "A corpus with %d text documents\n"),
256                    length(x)))
257        invisible(x)
258    }
259    
260    summary.Corpus <- function(object, ...) {
261        print(object)
262        if (length(DMetaData(object)) > 0) {
263            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
264                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
265                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
266                        length(CMetaData(object)$MetaData)))
267            cat("Available tags are:\n")
268            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
269            cat("Available variables in the data frame are:\n")
270            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
271        }
272    }
273    
274    inspect <- function(x) UseMethod("inspect", x)
275    inspect.PCorpus <- function(x) {
276        summary(x)
277        cat("\n")
278        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
279        show(filehash::dbMultiFetch(db, unlist(x)))
280    }
281    inspect.VCorpus <- function(x) {
282        summary(x)
283        cat("\n")
284        print(noquote(lapply(x, identity)))
285    }
286    
287    lapply.PCorpus <- function(X, FUN, ...) {
288        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
289        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
290    }
291    lapply.VCorpus <- function(X, FUN, ...) {
292        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
293        if (!is.null(lazyTmMap))
294            .Call("copyCorpus", X, materialize(X))
295        base::lapply(X, FUN, ...)
296    }
297    
298    writeCorpus <-  function(x, path = ".", filenames = NULL) {
299        filenames <- file.path(path,
300                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
301                               else filenames)
302        i <- 1
303        for (o in x) {
304            writeLines(as.PlainTextDocument(o), filenames[i])
305            i <- i + 1
306        }
307    }

Legend:
Removed from v.18  
changed lines
  Added in v.1025

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge