[tm] Diff of /pkg/R/corpus.R

Old revision: trunk/R/textdoccol.R, revision 32 (Thu Dec 15 13:13:54 2005 UTC)
New revision: pkg/R/corpus.R, revision 988 (Fri Sep 4 12:27:12 2009 UTC)
Removed from v.32 (trunk/R/textdoccol.R):

# Author: Ingo Feinerer

# S4 class definition
# Text document collection
setClass("textdoccol",
         contains = c("list"))

# Constructors

setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
setMethod("textdoccol",
          c("character", "character", "logical", "logical"),
          function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {

              # Add a new type for each unique input source format
              type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
              switch(type,
                     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
                     # For the moment the first argument is still a single file
                     # This will be changed to a directory as soon as we have the full RCV1 data set
                     "RCV1" = {
                         tree <- xmlTreeParse(object)
                         tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
                     },
                     # Text in a special CSV format (as e.g. exported from an Excel sheet)
                     # For details on the file format see data/Umfrage.csv
                     # The first argument has to be a single file
                     "CSV" = {
                         m <- as.matrix(read.csv(object))
                         l <- vector("list", dim(m)[1])
                         for (i in 1:dim(m)[1]) {
                             author <- "Not yet implemented"
                             timestamp <- date()
                             description <- "Not yet implemented"
                             id <- i
                             corpus <- as.character(m[i,2:dim(m)[2]])
                             if (stripWhiteSpace)
                                 corpus <- gsub("[[:space:]]+", " ", corpus)
                             if (toLower)
                                 corpus <- tolower(corpus)
                             origin <- "Not yet implemented"
                             heading <- "Not yet implemented"

                             l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
                                 description = description, id = id, origin = origin, heading = heading)
                         }
                         tdcl <- new("textdoccol", .Data = l)
                     },
                     # Read in text documents in Reuters-21578 XML (not SGML) format
                     # Typically the first argument will be a directory where we can
                     # find the files reut2-000.xml ... reut2-021.xml
                     "REUT21578" = {
                         tdl <- sapply(dir(object,
                                           pattern = ".xml",
                                           full.names = TRUE),
                                       function(file) {
                                           tree <- xmlTreeParse(file)
                                           xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
                                       })

                         tdcl <- new("textdoccol", .Data = tdl)
                     })
              tdcl
          })

# Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    author <- "Not yet implemented"
    timestamp <- xmlAttrs(node)[["date"]]
    description <- "Not yet implemented"
    id <- as.integer(xmlAttrs(node)[["itemid"]])
    origin <- "Not yet implemented"
    corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)

    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    heading <- xmlValue(node[["title"]])

    new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
        description = description, id = id, origin = origin, heading = heading)
}

# Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
    author <- "Not yet implemented"
    timestamp <- xmlValue(node[["DATE"]])
    description <- "Not yet implemented"
    id <- as.integer(xmlAttrs(node)[["NEWID"]])

    origin <- "Not yet implemented"

    # The <BODY></BODY> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["BODY"]]))
        corpus <- xmlValue(node[["TEXT"]][["BODY"]])
    else
        corpus <- ""

    if (stripWhiteSpace)
        corpus <- gsub("[[:space:]]+", " ", corpus)
    if (toLower)
        corpus <- tolower(corpus)

    # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
    if (!is.null(node[["TEXT"]][["TITLE"]]))
        heading <- xmlValue(node[["TEXT"]][["TITLE"]])
    else
        heading <- ""

    new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
        description = description, id = id, origin = origin, heading = heading)
}

Added in v.988 (pkg/R/corpus.R):

# Author: Ingo Feinerer

.PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
    attr(x, "CMetaData") <- cmeta
    attr(x, "DMetaData") <- dmeta
    attr(x, "DBControl") <- dbcontrol
    class(x) <- c("PCorpus", "Corpus", "list")
    x
}
DBControl <- function(x) attr(x, "DBControl")

PCorpus <- function(x,
                    readerControl = list(reader = x$DefaultReader, language = "eng"),
                    dbControl = list(dbName = "", dbType = "DB1"),
                    ...) {
    readerControl <- prepareReader(readerControl, x$DefaultReader, ...)

    if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
        stop("error in creating database")
    db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)

    # Allocate memory in advance if length is known
    tdl <- if (x$Length > 0)
        vector("list", as.integer(x$Length))
    else
        list()

    counter <- 1
    while (!eoi(x)) {
        x <- stepNext(x)
        elem <- getElem(x)
        doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
        filehash::dbInsert(db, ID(doc), doc)
        if (x$Length > 0) tdl[[counter]] <- ID(doc)
        else tdl <- c(tdl, ID(doc))
        counter <- counter + 1
    }

    df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
    filehash::dbInsert(db, "DMetaData", df)
    dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))

    .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
}

.VCorpus <- function(x, cmeta, dmeta) {
    attr(x, "CMetaData") <- cmeta
    attr(x, "DMetaData") <- dmeta
    class(x) <- c("VCorpus", "Corpus", "list")
    x
}

# The "..." are additional arguments for the FunctionGenerator reader
VCorpus <- Corpus <- function(x,
                    readerControl = list(reader = x$DefaultReader, language = "eng"),
                    ...) {
    readerControl <- prepareReader(readerControl, x$DefaultReader, ...)

    # Allocate memory in advance if length is known
    tdl <- if (x$Length > 0)
        vector("list", as.integer(x$Length))
    else
        list()

    if (x$Vectorized)
        tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
                      pGetElem(x),
                      id = as.character(seq_len(x$Length)),
                      SIMPLIFY = FALSE)
    else {
        counter <- 1
        while (!eoi(x)) {
            x <- stepNext(x)
            elem <- getElem(x)
            doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
            if (x$Length > 0)
                tdl[[counter]] <- doc
            else
                tdl <- c(tdl, list(doc))
            counter <- counter + 1
        }
    }

    df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
    .VCorpus(tdl, .MetaDataNode(), df)
}

`[.PCorpus` <- function(x, i) {
    if (missing(i)) return(x)
    cmeta <- CMetaData(x)
    index <- attr(x, "DMetaData")[[1 , "subset"]]
    attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
    dmeta <- attr(x, "DMetaData")
    dbcontrol <- DBControl(x)
    class(x) <- "list"
    .PCorpus(x[i, drop = FALSE], cmeta, dmeta, dbcontrol)
}

`[.VCorpus` <- function(x, i) {
    if (missing(i)) return(x)
    cmeta <- CMetaData(x)
    dmeta <- DMetaData(x)[i, , drop = FALSE]
    class(x) <- "list"
    .VCorpus(x[i, drop = FALSE], cmeta, dmeta)
}

`[<-.PCorpus` <- function(x, i, value) {
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    counter <- 1
    for (id in unclass(x)[i]) {
        if (identical(length(value), 1)) db[[id]] <- value
        else db[[id]] <- value[[counter]]
        counter <- counter + 1
    }
    x
}

`[[.PCorpus` <-  function(x, i) {
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    class(x) <- "list"
    filehash::dbFetch(db, x[[i]])
}
`[[.VCorpus` <-  function(x, i) {
    lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap))
        .Call("copyCorpus", x, materialize(x, i))
    class(x) <- "list"
    x[[i]]
}

`[[<-.PCorpus` <-  function(x, i, value) {
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    index <- unclass(x)[[i]]
    db[[index]] <- value
    x
}
`[[<-.VCorpus` <-  function(x, i, value) {
    # Mark new objects as not active for lazy mapping
    lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap)) {
        lazyTmMap$index[i] <- FALSE
        meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
    }
    # Set the value
    cl <- class(x)
    class(x) <- "list"
    x[[i]] <- value
    class(x) <- cl
    x
}

# Update \code{NodeID}s of a CMetaData tree
update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
    # Traversal of (binary) CMetaData tree with setup of \code{NodeID}s
    set_id <- function(x) {
        x$NodeID <- id
        id <<- id + 1
        level <<- level + 1
        if (length(x$Children) > 0) {
            mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
            left <- set_id(x$Children[[1]])
            if (level == 1) {
                left.mapping <<- mapping
                mapping <<- NULL
            }
            mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
            right <- set_id(x$Children[[2]])

            x$Children <- list(left, right)
        }
        level <<- level - 1
        x
    }
    list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
}

c2 <- function(x, y, ...) {
    # Update the CMetaData tree
    cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
    update.struct <- update_id(cmeta)

    new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)

    # Find indices to be updated for the left tree
    indices.mapping <- NULL
    for (m in levels(as.factor(DMetaData(x)$MetaID))) {
        indices <- (DMetaData(x)$MetaID == m)
        indices.mapping <- c(indices.mapping, list(m = indices))
        names(indices.mapping)[length(indices.mapping)] <- m
    }

    # Update the DMetaData data frames for the left tree
    for (i in 1:ncol(update.struct$left.mapping)) {
        map <- update.struct$left.mapping[,i]
        DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
    }

    # Find indices to be updated for the right tree
    indices.mapping <- NULL
    for (m in levels(as.factor(DMetaData(y)$MetaID))) {
        indices <- (DMetaData(y)$MetaID == m)
        indices.mapping <- c(indices.mapping, list(m = indices))
        names(indices.mapping)[length(indices.mapping)] <- m
    }

    # Update the DMetaData data frames for the right tree
    for (i in 1:ncol(update.struct$right.mapping)) {
        map <- update.struct$right.mapping[,i]
        DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
    }

    # Merge the DMetaData data frames
    labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
    na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
    x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
    labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
    na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
    y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
    DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)

    new
}

c.Corpus <-
function(x, ..., recursive = FALSE)
{
    args <- list(...)

    if (identical(length(args), 0))
        return(x)

    if (!all(unlist(lapply(args, inherits, class(x)))))
        stop("not all arguments are of the same corpus type")

    if (inherits(x, "PCorpus"))
        stop("concatenation of corpora with underlying databases is not supported")

    Reduce(c2, base::c(list(x), args))
}

c.TextDocument <- function(x, ..., recursive = FALSE) {
    args <- list(...)

    if (identical(length(args), 0))
        return(x)

    if (!all(unlist(lapply(args, inherits, class(x)))))
        stop("not all arguments are text documents")

    dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
    .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
}

print.Corpus <- function(x, ...) {
    cat(sprintf(ngettext(length(x),
                         "A corpus with %d text document\n",
                         "A corpus with %d text documents\n"),
                length(x)))
    invisible(x)
}

summary.Corpus <- function(object, ...) {
    print(object)
    if (length(DMetaData(object)) > 0) {
        cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
                             "\nThe metadata consists of %d tag-value pair and a data frame\n",
                             "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
                    length(CMetaData(object)$MetaData)))
        cat("Available tags are:\n")
        cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
        cat("Available variables in the data frame are:\n")
        cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
    }
}

inspect <- function(x) UseMethod("inspect", x)
inspect.PCorpus <- function(x) {
    summary(x)
    cat("\n")
    db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
    show(filehash::dbMultiFetch(db, unlist(x)))
}
inspect.VCorpus <- function(x) {
    summary(x)
    cat("\n")
    print(noquote(lapply(x, identity)))
}

lapply.PCorpus <- function(X, FUN, ...) {
    db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
    lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
}
lapply.VCorpus <- function(X, FUN, ...) {
    lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
    if (!is.null(lazyTmMap))
        .Call("copyCorpus", X, materialize(X))
    base::lapply(X, FUN, ...)
}

writeCorpus <-  function(x, path = ".", filenames = NULL) {
    filenames <- file.path(path,
                           if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
                           else filenames)
    i <- 1
    for (o in x) {
        writeLines(as.PlainTextDocument(o), filenames[i])
        i <- i + 1
    }
}
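For orientation only, and not part of the diff: a minimal usage sketch of the constructors and methods added in v.988. It assumes a directory "texts/" of plain-text files and the DirSource() and readPlain() helpers from the same package version; the database name passed to PCorpus is made up for illustration.

library(tm)

# Volatile (in-memory) corpus via the Corpus/VCorpus constructor
vc <- Corpus(DirSource("texts/"),
             readerControl = list(reader = readPlain, language = "eng"))
print(vc)         # print.Corpus: "A corpus with N text documents"
summary(vc)       # summary.Corpus: metadata tags and data frame variables
inspect(vc[1:2])  # subset via `[.VCorpus`, then inspect.VCorpus
doc <- vc[[1]]    # `[[.VCorpus` returns a single text document

# Database-backed corpus; requires the filehash package.
# "texts.db" is an illustrative database name; "DB1" is the default dbType.
pc <- PCorpus(DirSource("texts/"),
              readerControl = list(reader = readPlain, language = "eng"),
              dbControl = list(dbName = "texts.db", dbType = "DB1"))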

