SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1073, Fri May 28 12:32:46 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           contains = c("list"))      attr(x, "DBControl") <- dbcontrol
7        class(x) <- c("PCorpus", "Corpus", "list")
8  # Constructors      x
9    }
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  DBControl <- function(x) attr(x, "DBControl")
11  setMethod("textdoccol",  
12            c("character", "character", "logical", "logical"),  PCorpus <- function(x,
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                # Add a new type for each unique input source format                      ...) {
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                switch(type,  
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19                       # For the moment the first argument is still a single file          stop("error in creating database")
20                       # This will be changed to a directory as soon as we have the full RCV1 data set      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21                       "RCV1" = {  
22                           tree <- xmlTreeParse(object)      # Allocate memory in advance if length is known
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      tdl <- if (x$Length > 0)x
24                       },          vector("list", as.integer(x$Length))
                      # Text in a special CSV format (as e.g. exported from an Excel sheet)  
                      # For details on the file format see data/Umfrage.csv  
                      # The first argument has to be a single file  
                      "CSV" = {  
                          m <- as.matrix(read.csv(object))  
                          l <- vector("list", dim(m)[1])  
                          for (i in 1:dim(m)[1]) {  
                              author <- "Not yet implemented"  
                              timestamp <- date()  
                              description <- "Not yet implemented"  
                              id <- i  
                              corpus <- as.character(m[i,2:dim(m)[2]])  
                              if (stripWhiteSpace)  
                                  corpus <- gsub("[[:space:]]+", " ", corpus)  
                              if (toLower)  
                                  corpus <- tolower(corpus)  
                              origin <- "Not yet implemented"  
                              heading <- "Not yet implemented"  
   
                              l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                  description = description, id = id, origin = origin, heading = heading)  
                          }  
                          tdcl <- new("textdoccol", .Data = l)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
   
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
25      else      else
26          corpus <- ""          list()
27    
28        counter <- 1
29        while (!eoi(x)) {
30            x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38        names(tdl) <- x$Names
39    
40        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41        filehash::dbInsert(db, "DMetaData", df)
42        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
43    
44        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
45    }
46    
47    .VCorpus <- function(x, cmeta, dmeta) {
48        attr(x, "CMetaData") <- cmeta
49        attr(x, "DMetaData") <- dmeta
50        class(x) <- c("VCorpus", "Corpus", "list")
51        x
52    }
53    
54      if (stripWhiteSpace)  # Register S3 corpus classes to be recognized by S4 methods. This is
55          corpus <- gsub("[[:space:]]+", " ", corpus)  # mainly a fix to be compatible with packages which were originally
56      if (toLower)  # developed to cooperate with corresponding S4 tm classes. Necessary
57          corpus <- tolower(corpus)  # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!  
60      if (!is.null(node[["TEXT"]][["TITLE"]]))  # The "..." are additional arguments for the FunctionGenerator reader
61          heading <- xmlValue(node[["TEXT"]][["TITLE"]])  VCorpus <- Corpus <- function(x,
62                                  readerControl = list(reader = x$DefaultReader, language = "en"),
63                                  ...) {
64        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
66        # Allocate memory in advance if length is known
67        tdl <- if (x$Length > 0)
68            vector("list", as.integer(x$Length))
69      else      else
70          heading <- ""          list()
71    
72        if (x$Vectorized)
73            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74                          pGetElem(x),
75                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                          SIMPLIFY = FALSE)
77        else {
78            counter <- 1
79            while (!eoi(x)) {
80                x <- stepNext(x)
81                elem <- getElem(x)
82                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83                if (x$Length > 0)
84                    tdl[[counter]] <- doc
85                else
86                    tdl <- c(tdl, list(doc))
87                counter <- counter + 1
88            }
89        }
90        names(tdl) <- x$Names
91        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92        .VCorpus(tdl, .MetaDataNode(), df)
93    }
94    
95    `[.PCorpus` <- function(x, i) {
96        if (missing(i)) return(x)
97        index <- attr(x, "DMetaData")[[1 , "subset"]]
98        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99        dmeta <- attr(x, "DMetaData")
100        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
101    }
102    
103    `[.VCorpus` <- function(x, i) {
104        if (missing(i)) return(x)
105        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
106    }
107    
108    `[<-.PCorpus` <- function(x, i, value) {
109        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110        counter <- 1
111        for (id in unclass(x)[i]) {
112            if (identical(length(value), 1L)) db[[id]] <- value
113            else db[[id]] <- value[[counter]]
114            counter <- counter + 1
115        }
116        x
117    }
118    
119    `[[.PCorpus` <-  function(x, i) {
120        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
121        filehash::dbFetch(db, NextMethod("[["))
122    }
123    `[[.VCorpus` <-  function(x, i) {
124        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
125        if (!is.null(lazyTmMap))
126            .Call("copyCorpus", x, materialize(x, i))
127        NextMethod("[[")
128    }
129    
130    `[[<-.PCorpus` <-  function(x, i, value) {
131        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132        index <- unclass(x)[[i]]
133        db[[index]] <- value
134        x
135    }
136    `[[<-.VCorpus` <-  function(x, i, value) {
137        # Mark new objects as not active for lazy mapping
138        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
139        if (!is.null(lazyTmMap)) {
140            lazyTmMap$index[i] <- FALSE
141            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
142        }
143        # Set the value
144        cl <- class(x)
145        y <- NextMethod("[[<-")
146        class(y) <- cl
147        y
148    }
149    
150    # Update NodeIDs of a CMetaData tree
151    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
152        # Traversal of (binary) CMetaData tree with setup of NodeIDs
153        set_id <- function(x) {
154            x$NodeID <- id
155            id <<- id + 1
156            level <<- level + 1
157            if (length(x$Children) > 0) {
158                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
159                left <- set_id(x$Children[[1]])
160                if (level == 1) {
161                    left.mapping <<- mapping
162                    mapping <<- NULL
163                }
164                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
165                right <- set_id(x$Children[[2]])
166    
167                x$Children <- list(left, right)
168            }
169            level <<- level - 1
170            x
171        }
172        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
173    }
174    
175      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  # Find indices to be updated for a CMetaData tree
176          description = description, id = id, origin = origin, heading = heading)  .find_indices <- function(x) {
177        indices.mapping <- NULL
178        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
179            indices <- (DMetaData(x)$MetaID == m)
180            indices.mapping <- c(indices.mapping, list(m = indices))
181            names(indices.mapping)[length(indices.mapping)] <- m
182        }
183        indices.mapping
184    }
185    
186    c2 <- function(x, y, ...) {
187        # Update the CMetaData tree
188        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
189        update.struct <- .update_id(cmeta)
190    
191        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
192    
193        # Find indices to be updated for the left tree
194        indices.mapping <- .find_indices(x)
195    
196        # Update the DMetaData data frames for the left tree
197        for (i in 1:ncol(update.struct$left.mapping)) {
198            map <- update.struct$left.mapping[,i]
199            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
200        }
201    
202        # Find indices to be updated for the right tree
203        indices.mapping <- .find_indices(y)
204    
205        # Update the DMetaData data frames for the right tree
206        for (i in 1:ncol(update.struct$right.mapping)) {
207            map <- update.struct$right.mapping[,i]
208            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
209        }
210    
211        # Merge the DMetaData data frames
212        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
213        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
214        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
215        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
216        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
217        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
218        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
219    
220        new
221    }
222    
223    c.Corpus <-
224    function(x, ..., recursive = FALSE)
225    {
226        args <- list(...)
227    
228        if (identical(length(args), 0L))
229            return(x)
230    
231        if (!all(unlist(lapply(args, inherits, class(x)))))
232            stop("not all arguments are of the same corpus type")
233    
234        if (inherits(x, "PCorpus"))
235            stop("concatenation of corpora with underlying databases is not supported")
236    
237        Reduce(c2, base::c(list(x), args))
238    }
239    
240    c.TextDocument <- function(x, ..., recursive = FALSE) {
241        args <- list(...)
242    
243        if (identical(length(args), 0L))
244            return(x)
245    
246        if (!all(unlist(lapply(args, inherits, class(x)))))
247            stop("not all arguments are text documents")
248    
249        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
250        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
251    }
252    
253    print.Corpus <- function(x, ...) {
254        cat(sprintf(ngettext(length(x),
255                             "A corpus with %d text document\n",
256                             "A corpus with %d text documents\n"),
257                    length(x)))
258        invisible(x)
259    }
260    
261    summary.Corpus <- function(object, ...) {
262        print(object)
263        if (length(DMetaData(object)) > 0) {
264            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
265                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
266                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
267                        length(CMetaData(object)$MetaData)))
268            cat("Available tags are:\n")
269            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
270            cat("Available variables in the data frame are:\n")
271            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
272        }
273    }
274    
275    inspect <- function(x) UseMethod("inspect", x)
276    inspect.PCorpus <- function(x) {
277        summary(x)
278        cat("\n")
279        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
280        show(filehash::dbMultiFetch(db, unlist(x)))
281    }
282    inspect.VCorpus <- function(x) {
283        summary(x)
284        cat("\n")
285        print(noquote(lapply(x, identity)))
286    }
287    
288    lapply.PCorpus <- function(X, FUN, ...) {
289        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
290        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
291    }
292    lapply.VCorpus <- function(X, FUN, ...) {
293        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
294        if (!is.null(lazyTmMap))
295            .Call("copyCorpus", X, materialize(X))
296        base::lapply(X, FUN, ...)
297    }
298    
299    writeCorpus <-  function(x, path = ".", filenames = NULL) {
300        filenames <- file.path(path,
301                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
302                               else filenames)
303        i <- 1
304        for (o in x) {
305            writeLines(as.PlainTextDocument(o), filenames[i])
306            i <- i + 1
307        }
308  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1073

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge