SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1261, Fri Sep 27 09:37:35 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <-
4  # Text document collection  function(x, cmeta, dmeta, dbcontrol)
5  setClass("textdoccol",  {
6           contains = c("list"))      attr(x, "CMetaData") <- cmeta
7        attr(x, "DMetaData") <- dmeta
8  # Constructors      attr(x, "DBControl") <- dbcontrol
9        class(x) <- c("PCorpus", "Corpus", "list")
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))      x
11  setMethod("textdoccol",  }
12            c("character", "character", "logical", "logical"),  
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {  DBControl <-
14    function(x)
15                # Add a new type for each unique input source format      attr(x, "DBControl")
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))  
17                switch(type,  PCorpus <-
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  function(x,
19                       # For the moment the first argument is still a single file           readerControl = list(reader = x$DefaultReader, language = "en"),
20                       # This will be changed to a directory as soon as we have the full RCV1 data set           dbControl = list(dbName = "", dbType = "DB1"))
21                       "RCV1" = {  {
22                           tree <- xmlTreeParse(object)      readerControl <- prepareReader(readerControl, x$DefaultReader)
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))  
24                       },      if (is.function(readerControl$init))
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)          readerControl$init()
26                       # For details on the file format see data/Umfrage.csv  
27                       # The first argument has to be a single file      if (is.function(readerControl$exit))
28                       "CSV" = {          on.exit(readerControl$exit())
29                           m <- as.matrix(read.csv(object))  
30                           l <- vector("list", dim(m)[1])      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
31                           for (i in 1:dim(m)[1]) {          stop("error in creating database")
32                               author <- "Not yet implemented"      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
33                               timestamp <- date()  
34                               description <- "Not yet implemented"      # Allocate memory in advance if length is known
35                               id <- i      tdl <- if (x$Length > 0)
36                               corpus <- as.character(m[i,2:dim(m)[2]])          vector("list", as.integer(x$Length))
37                               if (stripWhiteSpace)      else
38                                   corpus <- gsub("[[:space:]]+", " ", corpus)          list()
39                               if (toLower)  
40                                   corpus <- tolower(corpus)      counter <- 1
41                               origin <- "Not yet implemented"      while (!eoi(x)) {
42                               heading <- "Not yet implemented"          x <- stepNext(x)
43            elem <- getElem(x)
44                               l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,          id <- if (is.null(x$Names) || is.na(x$Names))
45                                   description = description, id = id, origin = origin, heading = heading)                  as.character(counter)
46                           }              else
47                           tdcl <- new("textdoccol", .Data = l)                  x$Names[counter]
48                       },          doc <- readerControl$reader(elem, readerControl$language, id)
49                       # Read in text documents in Reuters-21578 XML (not SGML) format          filehash::dbInsert(db, ID(doc), doc)
50                       # Typically the first argument will be a directory where we can          if (x$Length > 0) tdl[[counter]] <- ID(doc)
51                       # find the files reut2-000.xml ... reut2-021.xml          else tdl <- c(tdl, ID(doc))
52                       "REUT21578" = {          counter <- counter + 1
53                           tdl <- sapply(dir(object,      }
54                                             pattern = ".xml",      if (!is.null(x$Names) && !is.na(x$Names))
55                                             full.names = TRUE),          names(tdl) <- x$Names
56                                         function(file) {  
57                                             tree <- xmlTreeParse(file)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
58                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)      filehash::dbInsert(db, "DMetaData", df)
59                                         })      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
60    
61                           tdcl <- new("textdoccol", .Data = tdl)      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
62                       })  }
63                tdcl  
64            })  .VCorpus <-
65    function(x, cmeta, dmeta)
66  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  {
67  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {      attr(x, "CMetaData") <- cmeta
68      author <- "Not yet implemented"      attr(x, "DMetaData") <- dmeta
69      timestamp <- xmlAttrs(node)[["date"]]      class(x) <- c("VCorpus", "Corpus", "list")
70      description <- "Not yet implemented"      x
71      id <- as.integer(xmlAttrs(node)[["itemid"]])  }
72      origin <- "Not yet implemented"  
73      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  VCorpus <-
74    Corpus <-
75      if (stripWhiteSpace)  function(x, readerControl = list(reader = x$DefaultReader, language = "en"))
76          corpus <- gsub("[[:space:]]+", " ", corpus)  {
77      if (toLower)      readerControl <- prepareReader(readerControl, x$DefaultReader)
78          corpus <- tolower(corpus)  
79        if (is.function(readerControl$init))
80      heading <- xmlValue(node[["title"]])          readerControl$init()
81    
82      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      if (is.function(readerControl$exit))
83          description = description, id = id, origin = origin, heading = heading)          on.exit(readerControl$exit())
84  }  
85        # Allocate memory in advance if length is known
86  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file      tdl <- if (x$Length > 0)
87  parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {          vector("list", as.integer(x$Length))
88      author <- "Not yet implemented"      else
89      timestamp <- xmlValue(node[["DATE"]])          list()
90      description <- "Not yet implemented"  
91      id <- as.integer(xmlAttrs(node)[["NEWID"]])      if (x$Vectorized)
92            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
93      origin <- "Not yet implemented"                        pGetElem(x),
94                          id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,
95      # The <BODY></BODY> tag is unfortunately NOT obligatory!                        SIMPLIFY = FALSE)
96      if (!is.null(node[["TEXT"]][["BODY"]]))      else {
97          corpus <- xmlValue(node[["TEXT"]][["BODY"]])          counter <- 1
98      else          while (!eoi(x)) {
99          corpus <- ""              x <- stepNext(x)
100                elem <- getElem(x)
101      if (stripWhiteSpace)              id <- if (is.null(x$Names) || is.na(x$Names))
102          corpus <- gsub("[[:space:]]+", " ", corpus)                  as.character(counter)
     if (toLower)  
         corpus <- tolower(corpus)  
   
     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["TITLE"]]))  
         heading <- xmlValue(node[["TEXT"]][["TITLE"]])  
103      else      else
104          heading <- ""                  x$Names[counter]
105                doc <- readerControl$reader(elem, readerControl$language, id)
106                if (x$Length > 0)
107                    tdl[[counter]] <- doc
108                else
109                    tdl <- c(tdl, list(doc))
110                counter <- counter + 1
111            }
112        }
113        if (!is.null(x$Names) && !is.na(x$Names))
114            names(tdl) <- x$Names
115        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
116        .VCorpus(tdl, .MetaDataNode(), df)
117    }
118    
119    `[.PCorpus` <-
120    function(x, i)
121    {
122        if (missing(i)) return(x)
123        index <- attr(x, "DMetaData")[[1 , "subset"]]
124        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
125        dmeta <- attr(x, "DMetaData")
126        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
127    }
128    
129    `[.VCorpus` <-
130    function(x, i)
131    {
132        if (missing(i)) return(x)
133        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
134    }
135    
136    `[<-.PCorpus` <-
137    function(x, i, value)
138    {
139        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
140        counter <- 1
141        for (id in unclass(x)[i]) {
142            if (identical(length(value), 1L)) db[[id]] <- value
143            else db[[id]] <- value[[counter]]
144            counter <- counter + 1
145        }
146        x
147    }
148    
149    .map_name_index <-
150    function(x, i)
151    {
152        if (is.character(i)) {
153            if (is.null(names(x)))
154                match(i, meta(x, "ID", type = "local"))
155            else
156                match(i, names(x))
157        }
158        i
159    }
160    
161    `[[.PCorpus` <-
162    function(x, i)
163    {
164        i <- .map_name_index(x, i)
165        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
166        filehash::dbFetch(db, NextMethod("[["))
167    }
168    `[[.VCorpus` <-
169    function(x, i)
170    {
171        i <- .map_name_index(x, i)
172        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
173        if (!is.null(lazyTmMap))
174            .Call("copyCorpus", x, materialize(x, i))
175        NextMethod("[[")
176    }
177    
178    `[[<-.PCorpus` <-
179    function(x, i, value)
180    {
181        i <- .map_name_index(x, i)
182        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
183        index <- unclass(x)[[i]]
184        db[[index]] <- value
185        x
186    }
187    `[[<-.VCorpus` <-
188    function(x, i, value)
189    {
190        i <- .map_name_index(x, i)
191        # Mark new objects as not active for lazy mapping
192        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
193        if (!is.null(lazyTmMap)) {
194            lazyTmMap$index[i] <- FALSE
195            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
196        }
197        # Set the value
198        cl <- class(x)
199        y <- NextMethod("[[<-")
200        class(y) <- cl
201        y
202    }
203    
204    # Update NodeIDs of a CMetaData tree
205    .update_id <-
206    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
207    {
208        # Traversal of (binary) CMetaData tree with setup of NodeIDs
209        set_id <- function(x) {
210            x$NodeID <- id
211            id <<- id + 1
212            level <<- level + 1
213            if (length(x$Children) > 0) {
214                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
215                left <- set_id(x$Children[[1]])
216                if (level == 1) {
217                    left.mapping <<- mapping
218                    mapping <<- NULL
219                }
220                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
221                right <- set_id(x$Children[[2]])
222    
223                x$Children <- list(left, right)
224            }
225            level <<- level - 1
226            x
227        }
228        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
229    }
230    
231    # Find indices to be updated for a CMetaData tree
232    .find_indices <-
233    function(x)
234    {
235        indices.mapping <- NULL
236        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
237            indices <- (DMetaData(x)$MetaID == m)
238            indices.mapping <- c(indices.mapping, list(m = indices))
239            names(indices.mapping)[length(indices.mapping)] <- m
240        }
241        indices.mapping
242    }
243    
244    c2 <-
245    function(x, y, ...)
246    {
247        # Update the CMetaData tree
248        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
249        update.struct <- .update_id(cmeta)
250    
251        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
252    
253        # Find indices to be updated for the left tree
254        indices.mapping <- .find_indices(x)
255    
256        # Update the DMetaData data frames for the left tree
257        for (i in 1:ncol(update.struct$left.mapping)) {
258            map <- update.struct$left.mapping[,i]
259            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
260        }
261    
262        # Find indices to be updated for the right tree
263        indices.mapping <- .find_indices(y)
264    
265        # Update the DMetaData data frames for the right tree
266        for (i in 1:ncol(update.struct$right.mapping)) {
267            map <- update.struct$right.mapping[,i]
268            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
269        }
270    
271        # Merge the DMetaData data frames
272        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
273        na.matrix <- matrix(NA,
274                            nrow = nrow(DMetaData(x)),
275                            ncol = length(labels),
276                            dimnames = list(row.names(DMetaData(x)), labels))
277        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
278        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
279        na.matrix <- matrix(NA,
280                            nrow = nrow(DMetaData(y)),
281                            ncol = length(labels),
282                            dimnames = list(row.names(DMetaData(y)), labels))
283        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
284        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
285    
286        new
287    }
288    
289      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  c.Corpus <-
290          description = description, id = id, origin = origin, heading = heading)  function(..., recursive = FALSE)
291    {
292        args <- list(...)
293        x <- args[[1L]]
294    
295        if(length(args) == 1L)
296            return(x)
297    
298        if (!all(unlist(lapply(args, inherits, class(x)))))
299            stop("not all arguments are of the same corpus type")
300    
301        if (inherits(x, "PCorpus"))
302            stop("concatenation of corpora with underlying databases is not supported")
303    
304        if (recursive)
305            Reduce(c2, args)
306        else {
307            args <- do.call("c", lapply(args, unclass))
308            .VCorpus(args,
309                     cmeta = .MetaDataNode(),
310                     dmeta = data.frame(MetaID = rep(0, length(args)),
311                                        stringsAsFactors = FALSE))
312        }
313    }
314    
315    c.TextDocument <-
316    function(..., recursive = FALSE)
317    {
318        args <- list(...)
319        x <- args[[1L]]
320    
321        if(length(args) == 1L)
322            return(x)
323    
324        if (!all(unlist(lapply(args, inherits, class(x)))))
325            stop("not all arguments are text documents")
326    
327        dmeta <- data.frame(MetaID = rep(0, length(args)),
328                            stringsAsFactors = FALSE)
329        .VCorpus(args, .MetaDataNode(), dmeta)
330    }
331    
332    print.Corpus <-
333    function(x, ...)
334    {
335        cat(sprintf(ngettext(length(x),
336                             "A corpus with %d text document\n",
337                             "A corpus with %d text documents\n"),
338                    length(x)))
339        invisible(x)
340    }
341    
342    summary.Corpus <-
343    function(object, ...)
344    {
345        print(object)
346        if (length(DMetaData(object)) > 0) {
347            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
348                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
349                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
350                        length(CMetaData(object)$MetaData)))
351            cat("Available tags are:\n")
352            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
353            cat("Available variables in the data frame are:\n")
354            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
355        }
356    }
357    
358    inspect <-
359    function(x)
360        UseMethod("inspect", x)
361    inspect.PCorpus <-
362    function(x)
363    {
364        summary(x)
365        cat("\n")
366        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
367        show(filehash::dbMultiFetch(db, unlist(x)))
368    }
369    inspect.VCorpus <-
370    function(x)
371    {
372        summary(x)
373        cat("\n")
374        print(noquote(lapply(x, identity)))
375    }
376    
377    lapply.PCorpus <-
378    function(X, FUN, ...)
379    {
380        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
381        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
382    }
383    lapply.VCorpus <-
384    function(X, FUN, ...)
385    {
386        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
387        if (!is.null(lazyTmMap))
388            .Call("copyCorpus", X, materialize(X))
389        base::lapply(X, FUN, ...)
390    }
391    
392    writeCorpus <-
393    function(x, path = ".", filenames = NULL)
394    {
395        filenames <- file.path(path,
396                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
397                               else filenames)
398        i <- 1
399        for (o in x) {
400            writeLines(as.PlainTextDocument(o), filenames[i])
401            i <- i + 1
402        }
403  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1261

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge