SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1004, Tue Sep 8 10:28:28 2009 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           contains = c("list"))      attr(x, "DBControl") <- dbcontrol
7        class(x) <- c("PCorpus", "Corpus", "list")
8  # Constructors      x
9    }
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  DBControl <- function(x) attr(x, "DBControl")
11  setMethod("textdoccol",  
12            c("character", "character", "logical", "logical"),  PCorpus <- function(x,
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {                      readerControl = list(reader = x$DefaultReader, language = "eng"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                # Add a new type for each unique input source format                      ...) {
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                switch(type,  
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19                       # For the moment the first argument is still a single file          stop("error in creating database")
20                       # This will be changed to a directory as soon as we have the full RCV1 data set      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21                       "RCV1" = {  
22                           tree <- xmlTreeParse(object)      # Allocate memory in advance if length is known
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      tdl <- if (x$Length > 0)
24                       },          vector("list", as.integer(x$Length))
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)      else
26                       # For details on the file format see data/Umfrage.csv          list()
27                       # The first argument has to be a single file  
28                       "CSV" = {      counter <- 1
29                           m <- as.matrix(read.csv(object))      while (!eoi(x)) {
30                           l <- vector("list", dim(m)[1])          x <- stepNext(x)
31                           for (i in 1:dim(m)[1]) {          elem <- getElem(x)
32                               author <- "Not yet implemented"          doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
33                               timestamp <- date()          filehash::dbInsert(db, ID(doc), doc)
34                               description <- "Not yet implemented"          if (x$Length > 0) tdl[[counter]] <- ID(doc)
35                               id <- i          else tdl <- c(tdl, ID(doc))
36                               corpus <- as.character(m[i,2:dim(m)[2]])          counter <- counter + 1
37                               if (stripWhiteSpace)      }
38                                   corpus <- gsub("[[:space:]]+", " ", corpus)  
39                               if (toLower)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
40                                   corpus <- tolower(corpus)      filehash::dbInsert(db, "DMetaData", df)
41                               origin <- "Not yet implemented"      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
42                               heading <- "Not yet implemented"  
43        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
44                               l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  }
45                                   description = description, id = id, origin = origin, heading = heading)  
46                           }  .VCorpus <- function(x, cmeta, dmeta) {
47                           tdcl <- new("textdoccol", .Data = l)      attr(x, "CMetaData") <- cmeta
48                       },      attr(x, "DMetaData") <- dmeta
49                       # Read in text documents in Reuters-21578 XML (not SGML) format      class(x) <- c("VCorpus", "Corpus", "list")
50                       # Typically the first argument will be a directory where we can      x
51                       # find the files reut2-000.xml ... reut2-021.xml  }
52                       "REUT21578" = {  
53                           tdl <- sapply(dir(object,  # The "..." are additional arguments for the FunctionGenerator reader
54                                             pattern = ".xml",  VCorpus <- Corpus <- function(x,
55                                             full.names = TRUE),                      readerControl = list(reader = x$DefaultReader, language = "eng"),
56                                         function(file) {                      ...) {
57                                             tree <- xmlTreeParse(file)      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
58                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
59                                         })      # Allocate memory in advance if length is known
60        tdl <- if (x$Length > 0)
61                           tdcl <- new("textdoccol", .Data = tdl)          vector("list", as.integer(x$Length))
                      })  
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
62      else      else
63          corpus <- ""          list()
64    
65      if (stripWhiteSpace)      if (x$Vectorized)
66          corpus <- gsub("[[:space:]]+", " ", corpus)          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
67      if (toLower)                        pGetElem(x),
68          corpus <- tolower(corpus)                        id = as.character(seq_len(x$Length)),
69                          SIMPLIFY = FALSE)
70      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      else {
71      if (!is.null(node[["TEXT"]][["TITLE"]]))          counter <- 1
72          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          while (!eoi(x)) {
73                x <- stepNext(x)
74                elem <- getElem(x)
75                doc <- readerControl$reader(elem, readerControl$language, as.character(counter))
76                if (x$Length > 0)
77                    tdl[[counter]] <- doc
78      else      else
79          heading <- ""                  tdl <- c(tdl, list(doc))
80                counter <- counter + 1
81            }
82        }
83    
84        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
85        .VCorpus(tdl, .MetaDataNode(), df)
86    }
87    
88    `[.PCorpus` <- function(x, i) {
89        if (missing(i)) return(x)
90        index <- attr(x, "DMetaData")[[1 , "subset"]]
91        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
92        dmeta <- attr(x, "DMetaData")
93        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
94    }
95    
96    `[.VCorpus` <- function(x, i) {
97        if (missing(i)) return(x)
98        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
99    }
100    
101    `[<-.PCorpus` <- function(x, i, value) {
102        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
103        counter <- 1
104        for (id in unclass(x)[i]) {
105            if (identical(length(value), 1)) db[[id]] <- value
106            else db[[id]] <- value[[counter]]
107            counter <- counter + 1
108        }
109        x
110    }
111    
112    `[[.PCorpus` <-  function(x, i) {
113        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
114        filehash::dbFetch(db, NextMethod("[["))
115    }
116    `[[.VCorpus` <-  function(x, i) {
117        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
118        if (!is.null(lazyTmMap))
119            .Call("copyCorpus", x, materialize(x, i))
120        NextMethod("[[")
121    }
122    
123    `[[<-.PCorpus` <-  function(x, i, value) {
124        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
125        index <- unclass(x)[[i]]
126        db[[index]] <- value
127        x
128    }
129    `[[<-.VCorpus` <-  function(x, i, value) {
130        # Mark new objects as not active for lazy mapping
131        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
132        if (!is.null(lazyTmMap)) {
133            lazyTmMap$index[i] <- FALSE
134            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
135        }
136        # Set the value
137        cl <- class(x)
138        y <- NextMethod("[[<-")
139        class(y) <- cl
140        y
141    }
142    
143    # Update NodeIDs of a CMetaData tree
144    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
145        # Traversal of (binary) CMetaData tree with setup of NodeIDs
146        set_id <- function(x) {
147            x$NodeID <- id
148            id <<- id + 1
149            level <<- level + 1
150            if (length(x$Children) > 0) {
151                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
152                left <- set_id(x$Children[[1]])
153                if (level == 1) {
154                    left.mapping <<- mapping
155                    mapping <<- NULL
156                }
157                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
158                right <- set_id(x$Children[[2]])
159    
160                x$Children <- list(left, right)
161            }
162            level <<- level - 1
163            x
164        }
165        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
166    }
167    
168      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  # Find indices to be updated for a CMetaData tree
169          description = description, id = id, origin = origin, heading = heading)  .find_indices <- function(x) {
170        indices.mapping <- NULL
171        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
172            indices <- (DMetaData(x)$MetaID == m)
173            indices.mapping <- c(indices.mapping, list(m = indices))
174            names(indices.mapping)[length(indices.mapping)] <- m
175        }
176        indices.mapping
177    }
178    
179    c2 <- function(x, y, ...) {
180        # Update the CMetaData tree
181        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
182        update.struct <- .update_id(cmeta)
183    
184        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
185    
186        # Find indices to be updated for the left tree
187        indices.mapping <- .find_indices(x)
188    
189        # Update the DMetaData data frames for the left tree
190        for (i in 1:ncol(update.struct$left.mapping)) {
191            map <- update.struct$left.mapping[,i]
192            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
193        }
194    
195        # Find indices to be updated for the right tree
196        indices.mapping <- .find_indices(y)
197    
198        # Update the DMetaData data frames for the right tree
199        for (i in 1:ncol(update.struct$right.mapping)) {
200            map <- update.struct$right.mapping[,i]
201            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
202        }
203    
204        # Merge the DMetaData data frames
205        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
206        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
207        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
208        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
209        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
210        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
211        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
212    
213        new
214    }
215    
216    c.Corpus <-
217    function(x, ..., recursive = FALSE)
218    {
219        args <- list(...)
220    
221        if (identical(length(args), 0))
222            return(x)
223    
224        if (!all(unlist(lapply(args, inherits, class(x)))))
225            stop("not all arguments are of the same corpus type")
226    
227        if (inherits(x, "PCorpus"))
228            stop("concatenation of corpora with underlying databases is not supported")
229    
230        Reduce(c2, base::c(list(x), args))
231    }
232    
233    c.TextDocument <- function(x, ..., recursive = FALSE) {
234        args <- list(...)
235    
236        if (identical(length(args), 0))
237            return(x)
238    
239        if (!all(unlist(lapply(args, inherits, class(x)))))
240            stop("not all arguments are text documents")
241    
242        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
243        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
244    }
245    
246    print.Corpus <- function(x, ...) {
247        cat(sprintf(ngettext(length(x),
248                             "A corpus with %d text document\n",
249                             "A corpus with %d text documents\n"),
250                    length(x)))
251        invisible(x)
252    }
253    
254    summary.Corpus <- function(object, ...) {
255        print(object)
256        if (length(DMetaData(object)) > 0) {
257            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
258                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
259                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
260                        length(CMetaData(object)$MetaData)))
261            cat("Available tags are:\n")
262            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
263            cat("Available variables in the data frame are:\n")
264            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
265        }
266    }
267    
268    inspect <- function(x) UseMethod("inspect", x)
269    inspect.PCorpus <- function(x) {
270        summary(x)
271        cat("\n")
272        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
273        show(filehash::dbMultiFetch(db, unlist(x)))
274    }
275    inspect.VCorpus <- function(x) {
276        summary(x)
277        cat("\n")
278        print(noquote(lapply(x, identity)))
279    }
280    
281    lapply.PCorpus <- function(X, FUN, ...) {
282        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
283        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
284    }
285    lapply.VCorpus <- function(X, FUN, ...) {
286        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
287        if (!is.null(lazyTmMap))
288            .Call("copyCorpus", X, materialize(X))
289        base::lapply(X, FUN, ...)
290    }
291    
292    writeCorpus <-  function(x, path = ".", filenames = NULL) {
293        filenames <- file.path(path,
294                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
295                               else filenames)
296        i <- 1
297        for (o in x) {
298            writeLines(as.PlainTextDocument(o), filenames[i])
299            i <- i + 1
300        }
301  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1004

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge