SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1285, Sat Jan 11 09:31:10 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <-
4  # Text document collection  function(x, cmeta, dmeta, dbcontrol)
5  setClass("textdoccol",  {
6           contains = c("list"))      attr(x, "CMetaData") <- cmeta
7        attr(x, "DMetaData") <- dmeta
8  # Constructors      attr(x, "DBControl") <- dbcontrol
9        class(x) <- c("PCorpus", "Corpus", "list")
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))      x
11  setMethod("textdoccol",  }
12            c("character", "character", "logical", "logical"),  
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {  DBControl <-
14    function(x)
15                # Add a new type for each unique input source format      attr(x, "DBControl")
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))  
17                switch(type,  PCorpus <-
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  function(x,
19                       # For the moment the first argument is still a single file           readerControl = list(reader = x$DefaultReader, language = "en"),
20                       # This will be changed to a directory as soon as we have the full RCV1 data set           dbControl = list(dbName = "", dbType = "DB1"))
21                       "RCV1" = {  {
22                           tree <- xmlTreeParse(object)      stopifnot(is.Source(x))
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))  
24                       },      readerControl <- prepareReader(readerControl, x$DefaultReader)
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)  
26                       # For details on the file format see data/Umfrage.csv      if (is.function(readerControl$init))
27                       # The first argument has to be a single file          readerControl$init()
28                       "CSV" = {  
29                           m <- as.matrix(read.csv(object))      if (is.function(readerControl$exit))
30                           l <- vector("list", dim(m)[1])          on.exit(readerControl$exit())
31                           for (i in 1:dim(m)[1]) {  
32                               author <- "Not yet implemented"      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
33                               timestamp <- date()          stop("error in creating database")
34                               description <- "Not yet implemented"      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
35                               id <- i  
36                               corpus <- as.character(m[i,2:dim(m)[2]])      # Allocate memory in advance if length is known
37                               if (stripWhiteSpace)      tdl <- if (x$Length > 0)
38                                   corpus <- gsub("[[:space:]]+", " ", corpus)          vector("list", as.integer(x$Length))
39                               if (toLower)      else
40                                   corpus <- tolower(corpus)          list()
41                               origin <- "Not yet implemented"  
42                               heading <- "Not yet implemented"      counter <- 1
43        while (!eoi(x)) {
44                               l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,          x <- stepNext(x)
45                                   description = description, id = id, origin = origin, heading = heading)          elem <- getElem(x)
46                           }          id <- if (is.null(x$Names) || is.na(x$Names))
47                           tdcl <- new("textdoccol", .Data = l)                  as.character(counter)
48                       },              else
49                       # Read in text documents in Reuters-21578 XML (not SGML) format                  x$Names[counter]
50                       # Typically the first argument will be a directory where we can          doc <- readerControl$reader(elem, readerControl$language, id)
51                       # find the files reut2-000.xml ... reut2-021.xml          filehash::dbInsert(db, ID(doc), doc)
52                       "REUT21578" = {          if (x$Length > 0) tdl[[counter]] <- ID(doc)
53                           tdl <- sapply(dir(object,          else tdl <- c(tdl, ID(doc))
54                                             pattern = ".xml",          counter <- counter + 1
55                                             full.names = TRUE),      }
56                                         function(file) {      if (!is.null(x$Names) && !is.na(x$Names))
57                                             tree <- xmlTreeParse(file)          names(tdl) <- x$Names
58                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
59                                         })      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
60        filehash::dbInsert(db, "DMetaData", df)
61                           tdcl <- new("textdoccol", .Data = tdl)      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
62                       })  
63                tdcl      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
64            })  }
65    
66  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  .VCorpus <-
67  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  function(x, cmeta, dmeta)
68      author <- "Not yet implemented"  {
69      timestamp <- xmlAttrs(node)[["date"]]      attr(x, "CMetaData") <- cmeta
70      description <- "Not yet implemented"      attr(x, "DMetaData") <- dmeta
71      id <- as.integer(xmlAttrs(node)[["itemid"]])      class(x) <- c("VCorpus", "Corpus", "list")
72      origin <- "Not yet implemented"      x
73      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  }
74    
75      if (stripWhiteSpace)  VCorpus <-
76          corpus <- gsub("[[:space:]]+", " ", corpus)  Corpus <-
77      if (toLower)  function(x, readerControl = list(reader = x$DefaultReader, language = "en"))
78          corpus <- tolower(corpus)  {
79        stopifnot(is.Source(x))
80      heading <- xmlValue(node[["title"]])  
81        readerControl <- prepareReader(readerControl, x$DefaultReader)
82      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
83          description = description, id = id, origin = origin, heading = heading)      if (is.function(readerControl$init))
84  }          readerControl$init()
85    
86  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file      if (is.function(readerControl$exit))
87  parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {          on.exit(readerControl$exit())
88      author <- "Not yet implemented"  
89      timestamp <- xmlValue(node[["DATE"]])      # Allocate memory in advance if length is known
90      description <- "Not yet implemented"      tdl <- if (x$Length > 0)
91      id <- as.integer(xmlAttrs(node)[["NEWID"]])          vector("list", as.integer(x$Length))
92        else
93      origin <- "Not yet implemented"          list()
94    
95      # The <BODY></BODY> tag is unfortunately NOT obligatory!      if (x$Vectorized)
96      if (!is.null(node[["TEXT"]][["BODY"]]))          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
97          corpus <- xmlValue(node[["TEXT"]][["BODY"]])                        pGetElem(x),
98      else                        id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,
99          corpus <- ""                        SIMPLIFY = FALSE)
100        else {
101      if (stripWhiteSpace)          counter <- 1
102          corpus <- gsub("[[:space:]]+", " ", corpus)          while (!eoi(x)) {
103      if (toLower)              x <- stepNext(x)
104          corpus <- tolower(corpus)              elem <- getElem(x)
105                id <- if (is.null(x$Names) || is.na(x$Names))
106      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!                  as.character(counter)
     if (!is.null(node[["TEXT"]][["TITLE"]]))  
         heading <- xmlValue(node[["TEXT"]][["TITLE"]])  
107      else      else
108          heading <- ""                  x$Names[counter]
109                doc <- readerControl$reader(elem, readerControl$language, id)
110                if (x$Length > 0)
111                    tdl[[counter]] <- doc
112                else
113                    tdl <- c(tdl, list(doc))
114                counter <- counter + 1
115            }
116        }
117        if (!is.null(x$Names) && !is.na(x$Names))
118            names(tdl) <- x$Names
119        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
120        .VCorpus(tdl, .MetaDataNode(), df)
121    }
122    
123    `[.PCorpus` <-
124    function(x, i)
125    {
126        if (missing(i)) return(x)
127        index <- attr(x, "DMetaData")[[1 , "subset"]]
128        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
129        dmeta <- attr(x, "DMetaData")
130        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
131    }
132    
133    `[.VCorpus` <-
134    function(x, i)
135    {
136        if (missing(i)) return(x)
137        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
138    }
139    
140    `[<-.PCorpus` <-
141    function(x, i, value)
142    {
143        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144        counter <- 1
145        for (id in unclass(x)[i]) {
146            if (identical(length(value), 1L)) db[[id]] <- value
147            else db[[id]] <- value[[counter]]
148            counter <- counter + 1
149        }
150        x
151    }
152    
153    .map_name_index <-
154    function(x, i)
155    {
156        if (is.character(i)) {
157            if (is.null(names(x)))
158                match(i, meta(x, "ID", type = "local"))
159            else
160                match(i, names(x))
161        }
162        i
163    }
164    
165    `[[.PCorpus` <-
166    function(x, i)
167    {
168        i <- .map_name_index(x, i)
169        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
170        filehash::dbFetch(db, NextMethod("[["))
171    }
172    `[[.VCorpus` <-
173    function(x, i)
174    {
175        i <- .map_name_index(x, i)
176        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
177        if (!is.null(lazyTmMap))
178            .Call("copyCorpus", x, materialize(x, i))
179        NextMethod("[[")
180    }
181    
182    `[[<-.PCorpus` <-
183    function(x, i, value)
184    {
185        i <- .map_name_index(x, i)
186        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
187        index <- unclass(x)[[i]]
188        db[[index]] <- value
189        x
190    }
191    `[[<-.VCorpus` <-
192    function(x, i, value)
193    {
194        i <- .map_name_index(x, i)
195        # Mark new objects as not active for lazy mapping
196        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
197        if (!is.null(lazyTmMap)) {
198            lazyTmMap$index[i] <- FALSE
199            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
200        }
201        # Set the value
202        cl <- class(x)
203        y <- NextMethod("[[<-")
204        class(y) <- cl
205        y
206    }
207    
208    # Update NodeIDs of a CMetaData tree
209    .update_id <-
210    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
211    {
212        # Traversal of (binary) CMetaData tree with setup of NodeIDs
213        set_id <- function(x) {
214            x$NodeID <- id
215            id <<- id + 1
216            level <<- level + 1
217            if (length(x$Children)) {
218                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
219                left <- set_id(x$Children[[1]])
220                if (level == 1) {
221                    left.mapping <<- mapping
222                    mapping <<- NULL
223                }
224                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
225                right <- set_id(x$Children[[2]])
226    
227                x$Children <- list(left, right)
228            }
229            level <<- level - 1
230            x
231        }
232        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
233    }
234    
235    # Find indices to be updated for a CMetaData tree
236    .find_indices <-
237    function(x)
238    {
239        indices.mapping <- NULL
240        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
241            indices <- (DMetaData(x)$MetaID == m)
242            indices.mapping <- c(indices.mapping, list(m = indices))
243            names(indices.mapping)[length(indices.mapping)] <- m
244        }
245        indices.mapping
246    }
247    
248    c2 <-
249    function(x, y, ...)
250    {
251        # Update the CMetaData tree
252        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
253        update.struct <- .update_id(cmeta)
254    
255        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
256    
257        # Find indices to be updated for the left tree
258        indices.mapping <- .find_indices(x)
259    
260        # Update the DMetaData data frames for the left tree
261        for (i in 1:ncol(update.struct$left.mapping)) {
262            map <- update.struct$left.mapping[,i]
263            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
264        }
265    
266        # Find indices to be updated for the right tree
267        indices.mapping <- .find_indices(y)
268    
269        # Update the DMetaData data frames for the right tree
270        for (i in 1:ncol(update.struct$right.mapping)) {
271            map <- update.struct$right.mapping[,i]
272            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
273        }
274    
275        # Merge the DMetaData data frames
276        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
277        na.matrix <- matrix(NA,
278                            nrow = nrow(DMetaData(x)),
279                            ncol = length(labels),
280                            dimnames = list(row.names(DMetaData(x)), labels))
281        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
282        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
283        na.matrix <- matrix(NA,
284                            nrow = nrow(DMetaData(y)),
285                            ncol = length(labels),
286                            dimnames = list(row.names(DMetaData(y)), labels))
287        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
288        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
289    
290        new
291    }
292    
293      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  c.Corpus <-
294          description = description, id = id, origin = origin, heading = heading)  function(..., recursive = FALSE)
295    {
296        args <- list(...)
297        x <- args[[1L]]
298    
299        if(length(args) == 1L)
300            return(x)
301    
302        if (!all(unlist(lapply(args, inherits, class(x)))))
303            stop("not all arguments are of the same corpus type")
304    
305        if (inherits(x, "PCorpus"))
306            stop("concatenation of corpora with underlying databases is not supported")
307    
308        if (recursive)
309            Reduce(c2, args)
310        else {
311            args <- do.call("c", lapply(args, unclass))
312            .VCorpus(args,
313                     cmeta = .MetaDataNode(),
314                     dmeta = data.frame(MetaID = rep(0, length(args)),
315                                        stringsAsFactors = FALSE))
316        }
317    }
318    
319    c.TextDocument <-
320    function(..., recursive = FALSE)
321    {
322        args <- list(...)
323        x <- args[[1L]]
324    
325        if(length(args) == 1L)
326            return(x)
327    
328        if (!all(unlist(lapply(args, inherits, class(x)))))
329            stop("not all arguments are text documents")
330    
331        dmeta <- data.frame(MetaID = rep(0, length(args)),
332                            stringsAsFactors = FALSE)
333        .VCorpus(args, .MetaDataNode(), dmeta)
334    }
335    
336    print.Corpus <-
337    function(x, ...)
338    {
339        cat(sprintf(ngettext(length(x),
340                             "A corpus with %d text document\n",
341                             "A corpus with %d text documents\n"),
342                    length(x)))
343        invisible(x)
344    }
345    
346    summary.Corpus <-
347    function(object, ...)
348    {
349        print(object)
350        if (length(DMetaData(object))) {
351            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
352                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
353                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
354                        length(CMetaData(object)$MetaData)))
355            cat("Available tags are:\n")
356            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
357            cat("Available variables in the data frame are:\n")
358            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
359        }
360    }
361    
362    inspect <-
363    function(x)
364        UseMethod("inspect", x)
365    inspect.PCorpus <-
366    function(x)
367    {
368        summary(x)
369        cat("\n")
370        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
371        show(filehash::dbMultiFetch(db, unlist(x)))
372    }
373    inspect.VCorpus <-
374    function(x)
375    {
376        summary(x)
377        cat("\n")
378        print(noquote(lapply(x, identity)))
379    }
380    
381    lapply.PCorpus <-
382    function(X, FUN, ...)
383    {
384        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
385        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
386    }
387    lapply.VCorpus <-
388    function(X, FUN, ...)
389    {
390        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
391        if (!is.null(lazyTmMap))
392            .Call("copyCorpus", X, materialize(X))
393        base::lapply(X, FUN, ...)
394    }
395    
396    writeCorpus <-
397    function(x, path = ".", filenames = NULL)
398    {
399        filenames <- file.path(path,
400                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
401                               else filenames)
402        i <- 1
403        for (o in x) {
404            writeLines(as.PlainTextDocument(o), filenames[i])
405            i <- i + 1
406        }
407  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1285

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge