SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1025, Fri Dec 11 08:56:22 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           contains = c("list"))      attr(x, "DBControl") <- dbcontrol
7        class(x) <- c("PCorpus", "Corpus", "list")
8  # Constructors      x
9    }
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  DBControl <- function(x) attr(x, "DBControl")
11  setMethod("textdoccol",  
12            c("character", "character", "logical", "logical"),  PCorpus <- function(x,
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {                      readerControl = list(reader = x$DefaultReader, language = "eng"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                # Add a new type for each unique input source format                      ...) {
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                switch(type,  
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19                       # For the moment the first argument is still a single file          stop("error in creating database")
20                       # This will be changed to a directory as soon as we have the full RCV1 data set      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21                       "RCV1" = {  
22                           tree <- xmlTreeParse(object)      # Allocate memory in advance if length is known
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      tdl <- if (x$Length > 0)
24                       },          vector("list", as.integer(x$Length))
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)      else
26                       # For details on the file format see data/Umfrage.csv          list()
27                       # The first argument has to be a single file  
28                       "CSV" = {      counter <- 1
29                           m <- as.matrix(read.csv(object))      while (!eoi(x)) {
30                           l <- vector("list", dim(m)[1])          x <- stepNext(x)
31                           for (i in 1:dim(m)[1]) {          elem <- getElem(x)
32                               author <- "Not yet implemented"          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
33                               timestamp <- date()          filehash::dbInsert(db, ID(doc), doc)
34                               description <- "Not yet implemented"          if (x$Length > 0) tdl[[counter]] <- ID(doc)
35                               id <- i          else tdl <- c(tdl, ID(doc))
36                               corpus <- as.character(m[i,2:dim(m)[2]])          counter <- counter + 1
37                               if (stripWhiteSpace)      }
38                                   corpus <- gsub("[[:space:]]+", " ", corpus)  
39                               if (toLower)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
40                                   corpus <- tolower(corpus)      filehash::dbInsert(db, "DMetaData", df)
41                               origin <- "Not yet implemented"      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
42                               heading <- "Not yet implemented"  
43        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
44                               l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  }
45                                   description = description, id = id, origin = origin, heading = heading)  
46                           }  .VCorpus <- function(x, cmeta, dmeta) {
47                           tdcl <- new("textdoccol", .Data = l)      attr(x, "CMetaData") <- cmeta
48                       },      attr(x, "DMetaData") <- dmeta
49                       # Read in text documents in Reuters-21578 XML (not SGML) format      class(x) <- c("VCorpus", "Corpus", "list")
50                       # Typically the first argument will be a directory where we can      x
51                       # find the files reut2-000.xml ... reut2-021.xml  }
52                       "REUT21578" = {  
53                           tdl <- sapply(dir(object,  # Register S3 corpus classes to be recognized by S4 methods. This is
54                                             pattern = ".xml",  # mainly a fix to be compatible with packages which were originally
55                                             full.names = TRUE),  # developed to cooperate with corresponding S4 tm classes. Necessary
56                                         function(file) {  # since tm's class architecture was changed to S3 since tm version 0.5.
57                                             tree <- xmlTreeParse(file)  setOldClass(c("VCorpus", "Corpus", "list"))
58                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
59                                         })  # The "..." are additional arguments for the FunctionGenerator reader
60    VCorpus <- Corpus <- function(x,
61                           tdcl <- new("textdoccol", .Data = tdl)                      readerControl = list(reader = x$DefaultReader, language = "eng"),
62                       })                      ...) {
63                tdcl      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
64            })  
65        # Allocate memory in advance if length is known
66  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file      tdl <- if (x$Length > 0)
67  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {          vector("list", as.integer(x$Length))
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
68      else      else
69          corpus <- ""          list()
70    
71      if (stripWhiteSpace)      if (x$Vectorized)
72          corpus <- gsub("[[:space:]]+", " ", corpus)          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
73      if (toLower)                        pGetElem(x),
74          corpus <- tolower(corpus)                        id = as.character(seq_len(x$Length)),
75                          SIMPLIFY = FALSE)
76      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      else {
77      if (!is.null(node[["TEXT"]][["TITLE"]]))          counter <- 1
78          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          while (!eoi(x)) {
79                x <- stepNext(x)
80                elem <- getElem(x)
81                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
82                if (x$Length > 0)
83                    tdl[[counter]] <- doc
84      else      else
85          heading <- ""                  tdl <- c(tdl, list(doc))
86                counter <- counter + 1
87            }
88        }
89    
90        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
91        .VCorpus(tdl, .MetaDataNode(), df)
92    }
93    
94    `[.PCorpus` <- function(x, i) {
95        if (missing(i)) return(x)
96        index <- attr(x, "DMetaData")[[1 , "subset"]]
97        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
98        dmeta <- attr(x, "DMetaData")
99        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
100    }
101    
102    `[.VCorpus` <- function(x, i) {
103        if (missing(i)) return(x)
104        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
105    }
106    
107    `[<-.PCorpus` <- function(x, i, value) {
108        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
109        counter <- 1
110        for (id in unclass(x)[i]) {
111            if (identical(length(value), 1L)) db[[id]] <- value
112            else db[[id]] <- value[[counter]]
113            counter <- counter + 1
114        }
115        x
116    }
117    
118    `[[.PCorpus` <-  function(x, i) {
119        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
120        filehash::dbFetch(db, NextMethod("[["))
121    }
122    `[[.VCorpus` <-  function(x, i) {
123        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
124        if (!is.null(lazyTmMap))
125            .Call("copyCorpus", x, materialize(x, i))
126        NextMethod("[[")
127    }
128    
129    `[[<-.PCorpus` <-  function(x, i, value) {
130        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
131        index <- unclass(x)[[i]]
132        db[[index]] <- value
133        x
134    }
135    `[[<-.VCorpus` <-  function(x, i, value) {
136        # Mark new objects as not active for lazy mapping
137        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
138        if (!is.null(lazyTmMap)) {
139            lazyTmMap$index[i] <- FALSE
140            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
141        }
142        # Set the value
143        cl <- class(x)
144        y <- NextMethod("[[<-")
145        class(y) <- cl
146        y
147    }
148    
149    # Update NodeIDs of a CMetaData tree
150    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
151        # Traversal of (binary) CMetaData tree with setup of NodeIDs
152        set_id <- function(x) {
153            x$NodeID <- id
154            id <<- id + 1
155            level <<- level + 1
156            if (length(x$Children) > 0) {
157                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
158                left <- set_id(x$Children[[1]])
159                if (level == 1) {
160                    left.mapping <<- mapping
161                    mapping <<- NULL
162                }
163                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
164                right <- set_id(x$Children[[2]])
165    
166                x$Children <- list(left, right)
167            }
168            level <<- level - 1
169            x
170        }
171        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
172    }
173    
174      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  # Find indices to be updated for a CMetaData tree
175          description = description, id = id, origin = origin, heading = heading)  .find_indices <- function(x) {
176        indices.mapping <- NULL
177        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
178            indices <- (DMetaData(x)$MetaID == m)
179            indices.mapping <- c(indices.mapping, list(m = indices))
180            names(indices.mapping)[length(indices.mapping)] <- m
181        }
182        indices.mapping
183    }
184    
185    c2 <- function(x, y, ...) {
186        # Update the CMetaData tree
187        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
188        update.struct <- .update_id(cmeta)
189    
190        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
191    
192        # Find indices to be updated for the left tree
193        indices.mapping <- .find_indices(x)
194    
195        # Update the DMetaData data frames for the left tree
196        for (i in 1:ncol(update.struct$left.mapping)) {
197            map <- update.struct$left.mapping[,i]
198            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
199        }
200    
201        # Find indices to be updated for the right tree
202        indices.mapping <- .find_indices(y)
203    
204        # Update the DMetaData data frames for the right tree
205        for (i in 1:ncol(update.struct$right.mapping)) {
206            map <- update.struct$right.mapping[,i]
207            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
208        }
209    
210        # Merge the DMetaData data frames
211        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
212        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
213        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
214        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
215        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
216        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
217        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
218    
219        new
220    }
221    
222    c.Corpus <-
223    function(x, ..., recursive = FALSE)
224    {
225        args <- list(...)
226    
227        if (identical(length(args), 0L))
228            return(x)
229    
230        if (!all(unlist(lapply(args, inherits, class(x)))))
231            stop("not all arguments are of the same corpus type")
232    
233        if (inherits(x, "PCorpus"))
234            stop("concatenation of corpora with underlying databases is not supported")
235    
236        Reduce(c2, base::c(list(x), args))
237    }
238    
239    c.TextDocument <- function(x, ..., recursive = FALSE) {
240        args <- list(...)
241    
242        if (identical(length(args), 0L))
243            return(x)
244    
245        if (!all(unlist(lapply(args, inherits, class(x)))))
246            stop("not all arguments are text documents")
247    
248        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
249        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
250    }
251    
252    print.Corpus <- function(x, ...) {
253        cat(sprintf(ngettext(length(x),
254                             "A corpus with %d text document\n",
255                             "A corpus with %d text documents\n"),
256                    length(x)))
257        invisible(x)
258    }
259    
260    summary.Corpus <- function(object, ...) {
261        print(object)
262        if (length(DMetaData(object)) > 0) {
263            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
264                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
265                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
266                        length(CMetaData(object)$MetaData)))
267            cat("Available tags are:\n")
268            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
269            cat("Available variables in the data frame are:\n")
270            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
271        }
272    }
273    
274    inspect <- function(x) UseMethod("inspect", x)
275    inspect.PCorpus <- function(x) {
276        summary(x)
277        cat("\n")
278        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
279        show(filehash::dbMultiFetch(db, unlist(x)))
280    }
281    inspect.VCorpus <- function(x) {
282        summary(x)
283        cat("\n")
284        print(noquote(lapply(x, identity)))
285    }
286    
287    lapply.PCorpus <- function(X, FUN, ...) {
288        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
289        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
290    }
291    lapply.VCorpus <- function(X, FUN, ...) {
292        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
293        if (!is.null(lazyTmMap))
294            .Call("copyCorpus", X, materialize(X))
295        base::lapply(X, FUN, ...)
296    }
297    
298    writeCorpus <-  function(x, path = ".", filenames = NULL) {
299        filenames <- file.path(path,
300                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
301                               else filenames)
302        i <- 1
303        for (o in x) {
304            writeLines(as.PlainTextDocument(o), filenames[i])
305            i <- i + 1
306        }
307  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1025

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge