SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1203, Fri Jan 11 19:43:37 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <-
4  # Text document collection  function(x, cmeta, dmeta, dbcontrol)
5  setClass("textdoccol",  {
6           contains = c("list"))      attr(x, "CMetaData") <- cmeta
7        attr(x, "DMetaData") <- dmeta
8  # Constructors      attr(x, "DBControl") <- dbcontrol
9        class(x) <- c("PCorpus", "Corpus", "list")
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))      x
11  setMethod("textdoccol",  }
12            c("character", "character", "logical", "logical"),  
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {  DBControl <-
14    function(x)
15                # Add a new type for each unique input source format      attr(x, "DBControl")
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))  
17                switch(type,  PCorpus <-
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  function(x,
19                       # For the moment the first argument is still a single file           readerControl = list(reader = x$DefaultReader, language = "en"),
20                       # This will be changed to a directory as soon as we have the full RCV1 data set           dbControl = list(dbName = "", dbType = "DB1"),
21                       "RCV1" = {           ...)
22                           tree <- xmlTreeParse(object)  {
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24                       },  
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)      if (is.function(readerControl$init))
26                       # For details on the file format see data/Umfrage.csv          readerControl$init()
27                       # The first argument has to be a single file  
28                       "CSV" = {      if (is.function(readerControl$exit))
29                           m <- as.matrix(read.csv(object))          on.exit(readerControl$exit())
30                           l <- vector("list", dim(m)[1])  
31                           for (i in 1:dim(m)[1]) {      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
32                               author <- "Not yet implemented"          stop("error in creating database")
33                               timestamp <- date()      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
34                               description <- "Not yet implemented"  
35                               id <- i      # Allocate memory in advance if length is known
36                               corpus <- as.character(m[i,2:dim(m)[2]])      tdl <- if (x$Length > 0)
37                               if (stripWhiteSpace)          vector("list", as.integer(x$Length))
                                  corpus <- gsub("[[:space:]]+", " ", corpus)  
                              if (toLower)  
                                  corpus <- tolower(corpus)  
                              origin <- "Not yet implemented"  
                              heading <- "Not yet implemented"  
   
                              l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                  description = description, id = id, origin = origin, heading = heading)  
                          }  
                          tdcl <- new("textdoccol", .Data = l)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
   
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
38      else      else
39          corpus <- ""          list()
40    
41        counter <- 1
42        while (!eoi(x)) {
43            x <- stepNext(x)
44            elem <- getElem(x)
45            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
46            filehash::dbInsert(db, ID(doc), doc)
47            if (x$Length > 0) tdl[[counter]] <- ID(doc)
48            else tdl <- c(tdl, ID(doc))
49            counter <- counter + 1
50        }
51        names(tdl) <- x$Names
52    
53        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
54        filehash::dbInsert(db, "DMetaData", df)
55        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
56    
57        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58    }
59    
60    .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63        attr(x, "CMetaData") <- cmeta
64        attr(x, "DMetaData") <- dmeta
65        class(x) <- c("VCorpus", "Corpus", "list")
66        x
67    }
68    
69      if (stripWhiteSpace)  # Register S3 corpus classes to be recognized by S4 methods. This is
70          corpus <- gsub("[[:space:]]+", " ", corpus)  # mainly a fix to be compatible with packages which were originally
71      if (toLower)  # developed to cooperate with corresponding S4 tm classes. Necessary
72          corpus <- tolower(corpus)  # since tm's class architecture was changed to S3 since tm version 0.5.
73    setOldClass(c("VCorpus", "Corpus", "list"))
74      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!  
75      if (!is.null(node[["TEXT"]][["TITLE"]]))  # The "..." are additional arguments for the FunctionGenerator reader
76          heading <- xmlValue(node[["TEXT"]][["TITLE"]])  VCorpus <-
77    Corpus <-
78    function(x,
79             readerControl = list(reader = x$DefaultReader, language = "en"),
80             ...)
81    {
82        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
83    
84        if (is.function(readerControl$init))
85            readerControl$init()
86    
87        if (is.function(readerControl$exit))
88            on.exit(readerControl$exit())
89    
90        # Allocate memory in advance if length is known
91        tdl <- if (x$Length > 0)
92            vector("list", as.integer(x$Length))
93      else      else
94          heading <- ""          list()
95    
96        if (x$Vectorized)
97            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
98                          pGetElem(x),
99                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
100                          SIMPLIFY = FALSE)
101        else {
102            counter <- 1
103            while (!eoi(x)) {
104                x <- stepNext(x)
105                elem <- getElem(x)
106                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
107                if (x$Length > 0)
108                    tdl[[counter]] <- doc
109                else
110                    tdl <- c(tdl, list(doc))
111                counter <- counter + 1
112            }
113        }
114        names(tdl) <- x$Names
115        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
116        .VCorpus(tdl, .MetaDataNode(), df)
117    }
118    
119    `[.PCorpus` <-
120    function(x, i)
121    {
122        if (missing(i)) return(x)
123        index <- attr(x, "DMetaData")[[1 , "subset"]]
124        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
125        dmeta <- attr(x, "DMetaData")
126        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
127    }
128    
129    `[.VCorpus` <-
130    function(x, i)
131    {
132        if (missing(i)) return(x)
133        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
134    }
135    
136    `[<-.PCorpus` <-
137    function(x, i, value)
138    {
139        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
140        counter <- 1
141        for (id in unclass(x)[i]) {
142            if (identical(length(value), 1L)) db[[id]] <- value
143            else db[[id]] <- value[[counter]]
144            counter <- counter + 1
145        }
146        x
147    }
148    
149    .map_name_index <-
150    function(x, i)
151    {
152        if (is.character(i)) {
153            if (is.null(names(x)))
154                match(i, meta(x, "ID", type = "local"))
155            else
156                match(i, names(x))
157        }
158        i
159    }
160    
161    `[[.PCorpus` <-
162    function(x, i)
163    {
164        i <- .map_name_index(x, i)
165        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
166        filehash::dbFetch(db, NextMethod("[["))
167    }
168    `[[.VCorpus` <-
169    function(x, i)
170    {
171        i <- .map_name_index(x, i)
172        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
173        if (!is.null(lazyTmMap))
174            .Call("copyCorpus", x, materialize(x, i))
175        NextMethod("[[")
176    }
177    
178    `[[<-.PCorpus` <-
179    function(x, i, value)
180    {
181        i <- .map_name_index(x, i)
182        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
183        index <- unclass(x)[[i]]
184        db[[index]] <- value
185        x
186    }
187    `[[<-.VCorpus` <-
188    function(x, i, value)
189    {
190        i <- .map_name_index(x, i)
191        # Mark new objects as not active for lazy mapping
192        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
193        if (!is.null(lazyTmMap)) {
194            lazyTmMap$index[i] <- FALSE
195            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
196        }
197        # Set the value
198        cl <- class(x)
199        y <- NextMethod("[[<-")
200        class(y) <- cl
201        y
202    }
203    
204    # Update NodeIDs of a CMetaData tree
205    .update_id <-
206    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
207    {
208        # Traversal of (binary) CMetaData tree with setup of NodeIDs
209        set_id <- function(x) {
210            x$NodeID <- id
211            id <<- id + 1
212            level <<- level + 1
213            if (length(x$Children) > 0) {
214                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
215                left <- set_id(x$Children[[1]])
216                if (level == 1) {
217                    left.mapping <<- mapping
218                    mapping <<- NULL
219                }
220                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
221                right <- set_id(x$Children[[2]])
222    
223                x$Children <- list(left, right)
224            }
225            level <<- level - 1
226            x
227        }
228        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
229    }
230    
231      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  # Find indices to be updated for a CMetaData tree
232          description = description, id = id, origin = origin, heading = heading)  .find_indices <-
233    function(x)
234    {
235        indices.mapping <- NULL
236        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
237            indices <- (DMetaData(x)$MetaID == m)
238            indices.mapping <- c(indices.mapping, list(m = indices))
239            names(indices.mapping)[length(indices.mapping)] <- m
240        }
241        indices.mapping
242    }
243    
244    c2 <-
245    function(x, y, ...)
246    {
247        # Update the CMetaData tree
248        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
249        update.struct <- .update_id(cmeta)
250    
251        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
252    
253        # Find indices to be updated for the left tree
254        indices.mapping <- .find_indices(x)
255    
256        # Update the DMetaData data frames for the left tree
257        for (i in 1:ncol(update.struct$left.mapping)) {
258            map <- update.struct$left.mapping[,i]
259            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
260        }
261    
262        # Find indices to be updated for the right tree
263        indices.mapping <- .find_indices(y)
264    
265        # Update the DMetaData data frames for the right tree
266        for (i in 1:ncol(update.struct$right.mapping)) {
267            map <- update.struct$right.mapping[,i]
268            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
269        }
270    
271        # Merge the DMetaData data frames
272        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
273        na.matrix <- matrix(NA,
274                            nrow = nrow(DMetaData(x)),
275                            ncol = length(labels),
276                            dimnames = list(row.names(DMetaData(x)), labels))
277        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
278        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
279        na.matrix <- matrix(NA,
280                            nrow = nrow(DMetaData(y)),
281                            ncol = length(labels),
282                            dimnames = list(row.names(DMetaData(y)), labels))
283        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
284        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
285    
286        new
287    }
288    
289    c.Corpus <-
290    function(..., recursive = FALSE)
291    {
292        args <- list(...)
293        x <- args[[1L]]
294    
295        if(length(args) == 1L)
296            return(x)
297    
298        if (!all(unlist(lapply(args, inherits, class(x)))))
299            stop("not all arguments are of the same corpus type")
300    
301        if (inherits(x, "PCorpus"))
302            stop("concatenation of corpora with underlying databases is not supported")
303    
304        if (recursive)
305            Reduce(c2, args)
306        else {
307            args <- do.call("c", lapply(args, unclass))
308            .VCorpus(args,
309                     cmeta = .MetaDataNode(),
310                     dmeta = data.frame(MetaID = rep(0, length(args)),
311                                        stringsAsFactors = FALSE))
312        }
313    }
314    
315    c.TextDocument <-
316    function(..., recursive = FALSE)
317    {
318        args <- list(...)
319        x <- args[[1L]]
320    
321        if(length(args) == 1L)
322            return(x)
323    
324        if (!all(unlist(lapply(args, inherits, class(x)))))
325            stop("not all arguments are text documents")
326    
327        dmeta <- data.frame(MetaID = rep(0, length(args)),
328                            stringsAsFactors = FALSE)
329        .VCorpus(args, .MetaDataNode(), dmeta)
330    }
331    
332    print.Corpus <-
333    function(x, ...)
334    {
335        cat(sprintf(ngettext(length(x),
336                             "A corpus with %d text document\n",
337                             "A corpus with %d text documents\n"),
338                    length(x)))
339        invisible(x)
340    }
341    
342    summary.Corpus <-
343    function(object, ...)
344    {
345        print(object)
346        if (length(DMetaData(object)) > 0) {
347            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
348                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
349                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
350                        length(CMetaData(object)$MetaData)))
351            cat("Available tags are:\n")
352            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
353            cat("Available variables in the data frame are:\n")
354            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
355        }
356    }
357    
358    inspect <-
359    function(x)
360        UseMethod("inspect", x)
361    inspect.PCorpus <-
362    function(x)
363    {
364        summary(x)
365        cat("\n")
366        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
367        show(filehash::dbMultiFetch(db, unlist(x)))
368    }
369    inspect.VCorpus <-
370    function(x)
371    {
372        summary(x)
373        cat("\n")
374        print(noquote(lapply(x, identity)))
375    }
376    
377    lapply.PCorpus <-
378    function(X, FUN, ...)
379    {
380        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
381        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
382    }
383    lapply.VCorpus <-
384    function(X, FUN, ...)
385    {
386        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
387        if (!is.null(lazyTmMap))
388            .Call("copyCorpus", X, materialize(X))
389        base::lapply(X, FUN, ...)
390    }
391    
392    writeCorpus <-
393    function(x, path = ".", filenames = NULL)
394    {
395        filenames <- file.path(path,
396                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
397                               else filenames)
398        i <- 1
399        for (o in x) {
400            writeLines(as.PlainTextDocument(o), filenames[i])
401            i <- i + 1
402        }
403  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1203

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge