SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1114, Fri Nov 26 14:05:54 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  # Text document collection      attr(x, "CMetaData") <- cmeta
5  setClass("textdoccol",      attr(x, "DMetaData") <- dmeta
6           contains = c("list"))      attr(x, "DBControl") <- dbcontrol
7        class(x) <- c("PCorpus", "Corpus", "list")
8  # Constructors      x
9    }
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  DBControl <- function(x) attr(x, "DBControl")
11  setMethod("textdoccol",  
12            c("character", "character", "logical", "logical"),  PCorpus <- function(x,
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                        dbControl = list(dbName = "", dbType = "DB1"),
15                # Add a new type for each unique input source format                      ...) {
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                switch(type,  
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format      if (is.function(readerControl$init))
19                       # For the moment the first argument is still a single file          readerControl$init()
20                       # This will be changed to a directory as soon as we have the full RCV1 data set  
21                       "RCV1" = {      if (is.function(readerControl$exit))
22                           tree <- xmlTreeParse(object)          on.exit(readerControl$exit())
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))  
24                       },      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)          stop("error in creating database")
26                       # For details on the file format see data/Umfrage.csv      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
27                       # The first argument has to be a single file  
28                       "CSV" = {      # Allocate memory in advance if length is known
29                           m <- as.matrix(read.csv(object))      tdl <- if (x$Length > 0)
30                           l <- vector("list", dim(m)[1])          vector("list", as.integer(x$Length))
31                           for (i in 1:dim(m)[1]) {      else
32                               author <- "Not yet implemented"          list()
33                               timestamp <- date()  
34                               description <- "Not yet implemented"      counter <- 1
35                               id <- i      while (!eoi(x)) {
36                               corpus <- as.character(m[i,2:dim(m)[2]])          x <- stepNext(x)
37                               if (stripWhiteSpace)          elem <- getElem(x)
38                                   corpus <- gsub("[[:space:]]+", " ", corpus)          doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
39                               if (toLower)          filehash::dbInsert(db, ID(doc), doc)
40                                   corpus <- tolower(corpus)          if (x$Length > 0) tdl[[counter]] <- ID(doc)
41                               origin <- "Not yet implemented"          else tdl <- c(tdl, ID(doc))
42                               heading <- "Not yet implemented"          counter <- counter + 1
43        }
44                               l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,      names(tdl) <- x$Names
45                                   description = description, id = id, origin = origin, heading = heading)  
46                           }      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
47                           tdcl <- new("textdoccol", .Data = l)      filehash::dbInsert(db, "DMetaData", df)
48                       },      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
49                       # Read in text documents in Reuters-21578 XML (not SGML) format  
50                       # Typically the first argument will be a directory where we can      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
51                       # find the files reut2-000.xml ... reut2-021.xml  }
52                       "REUT21578" = {  
53                           tdl <- sapply(dir(object,  .VCorpus <- function(x, cmeta, dmeta) {
54                                             pattern = ".xml",      attr(x, "CMetaData") <- cmeta
55                                             full.names = TRUE),      attr(x, "DMetaData") <- dmeta
56                                         function(file) {      class(x) <- c("VCorpus", "Corpus", "list")
57                                             tree <- xmlTreeParse(file)      x
58                                             xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  }
59                                         })  
60    # Register S3 corpus classes to be recognized by S4 methods. This is
61                           tdcl <- new("textdoccol", .Data = tdl)  # mainly a fix to be compatible with packages which were originally
62                       })  # developed to cooperate with corresponding S4 tm classes. Necessary
63                tdcl  # since tm's class architecture was changed to S3 since tm version 0.5.
64            })  setOldClass(c("VCorpus", "Corpus", "list"))
65    
66  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # The "..." are additional arguments for the FunctionGenerator reader
67  parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  VCorpus <- Corpus <- function(x,
68      author <- "Not yet implemented"                                readerControl = list(reader = x$DefaultReader, language = "en"),
69      timestamp <- xmlAttrs(node)[["date"]]                                ...) {
70      description <- "Not yet implemented"      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
71      id <- as.integer(xmlAttrs(node)[["itemid"]])  
72      origin <- "Not yet implemented"      if (is.function(readerControl$init))
73      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)          readerControl$init()
74    
75      if (stripWhiteSpace)      if (is.function(readerControl$exit))
76          corpus <- gsub("[[:space:]]+", " ", corpus)          on.exit(readerControl$exit())
77      if (toLower)  
78          corpus <- tolower(corpus)      # Allocate memory in advance if length is known
79        tdl <- if (x$Length > 0)
80      heading <- xmlValue(node[["title"]])          vector("list", as.integer(x$Length))
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
81      else      else
82          corpus <- ""          list()
83    
84      if (stripWhiteSpace)      if (x$Vectorized)
85          corpus <- gsub("[[:space:]]+", " ", corpus)          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
86      if (toLower)                        pGetElem(x),
87          corpus <- tolower(corpus)                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
88                          SIMPLIFY = FALSE)
89      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      else {
90      if (!is.null(node[["TEXT"]][["TITLE"]]))          counter <- 1
91          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          while (!eoi(x)) {
92                x <- stepNext(x)
93                elem <- getElem(x)
94                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
95                if (x$Length > 0)
96                    tdl[[counter]] <- doc
97      else      else
98          heading <- ""                  tdl <- c(tdl, list(doc))
99                counter <- counter + 1
100            }
101        }
102        names(tdl) <- x$Names
103        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
104        .VCorpus(tdl, .MetaDataNode(), df)
105    }
106    
107    `[.PCorpus` <- function(x, i) {
108        if (missing(i)) return(x)
109        index <- attr(x, "DMetaData")[[1 , "subset"]]
110        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
111        dmeta <- attr(x, "DMetaData")
112        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
113    }
114    
115    `[.VCorpus` <- function(x, i) {
116        if (missing(i)) return(x)
117        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
118    }
119    
120      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  `[<-.PCorpus` <- function(x, i, value) {
121          description = description, id = id, origin = origin, heading = heading)      db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
122        counter <- 1
123        for (id in unclass(x)[i]) {
124            if (identical(length(value), 1L)) db[[id]] <- value
125            else db[[id]] <- value[[counter]]
126            counter <- counter + 1
127        }
128        x
129    }
130    
131    .map_name_index <- function(x, i) {
132        if (is.character(i)) {
133            if (is.null(names(x)))
134                match(i, meta(x, "ID", type = "local"))
135            else
136                match(i, names(x))
137        }
138        i
139    }
140    
141    `[[.PCorpus` <-  function(x, i) {
142        i <- .map_name_index(x, i)
143        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
144        filehash::dbFetch(db, NextMethod("[["))
145    }
146    `[[.VCorpus` <-  function(x, i) {
147        i <- .map_name_index(x, i)
148        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
149        if (!is.null(lazyTmMap))
150            .Call("copyCorpus", x, materialize(x, i))
151        NextMethod("[[")
152    }
153    
154    `[[<-.PCorpus` <-  function(x, i, value) {
155        i <- .map_name_index(x, i)
156        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
157        index <- unclass(x)[[i]]
158        db[[index]] <- value
159        x
160    }
161    `[[<-.VCorpus` <-  function(x, i, value) {
162        i <- .map_name_index(x, i)
163        # Mark new objects as not active for lazy mapping
164        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
165        if (!is.null(lazyTmMap)) {
166            lazyTmMap$index[i] <- FALSE
167            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
168        }
169        # Set the value
170        cl <- class(x)
171        y <- NextMethod("[[<-")
172        class(y) <- cl
173        y
174    }
175    
176    # Update NodeIDs of a CMetaData tree
177    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
178        # Traversal of (binary) CMetaData tree with setup of NodeIDs
179        set_id <- function(x) {
180            x$NodeID <- id
181            id <<- id + 1
182            level <<- level + 1
183            if (length(x$Children) > 0) {
184                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
185                left <- set_id(x$Children[[1]])
186                if (level == 1) {
187                    left.mapping <<- mapping
188                    mapping <<- NULL
189                }
190                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
191                right <- set_id(x$Children[[2]])
192    
193                x$Children <- list(left, right)
194            }
195            level <<- level - 1
196            x
197        }
198        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
199    }
200    
201    # Find indices to be updated for a CMetaData tree
202    .find_indices <- function(x) {
203        indices.mapping <- NULL
204        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
205            indices <- (DMetaData(x)$MetaID == m)
206            indices.mapping <- c(indices.mapping, list(m = indices))
207            names(indices.mapping)[length(indices.mapping)] <- m
208        }
209        indices.mapping
210    }
211    
212    c2 <- function(x, y, ...) {
213        # Update the CMetaData tree
214        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
215        update.struct <- .update_id(cmeta)
216    
217        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
218    
219        # Find indices to be updated for the left tree
220        indices.mapping <- .find_indices(x)
221    
222        # Update the DMetaData data frames for the left tree
223        for (i in 1:ncol(update.struct$left.mapping)) {
224            map <- update.struct$left.mapping[,i]
225            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
226        }
227    
228        # Find indices to be updated for the right tree
229        indices.mapping <- .find_indices(y)
230    
231        # Update the DMetaData data frames for the right tree
232        for (i in 1:ncol(update.struct$right.mapping)) {
233            map <- update.struct$right.mapping[,i]
234            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
235        }
236    
237        # Merge the DMetaData data frames
238        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
239        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
240        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
241        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
242        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
243        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
244        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
245    
246        new
247    }
248    
249    c.Corpus <-
250    function(x, ..., recursive = FALSE)
251    {
252        args <- list(...)
253    
254        if (identical(length(args), 0L))
255            return(x)
256    
257        if (!all(unlist(lapply(args, inherits, class(x)))))
258            stop("not all arguments are of the same corpus type")
259    
260        if (inherits(x, "PCorpus"))
261            stop("concatenation of corpora with underlying databases is not supported")
262    
263        l <- base::c(list(x), args)
264        if (recursive)
265            Reduce(c2, l)
266        else {
267            l <- do.call("c", lapply(l, unclass))
268            .VCorpus(l,
269                     cmeta = .MetaDataNode(),
270                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
271        }
272    }
273    
274    c.TextDocument <- function(x, ..., recursive = FALSE) {
275        args <- list(...)
276    
277        if (identical(length(args), 0L))
278            return(x)
279    
280        if (!all(unlist(lapply(args, inherits, class(x)))))
281            stop("not all arguments are text documents")
282    
283        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
284        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
285    }
286    
287    print.Corpus <- function(x, ...) {
288        cat(sprintf(ngettext(length(x),
289                             "A corpus with %d text document\n",
290                             "A corpus with %d text documents\n"),
291                    length(x)))
292        invisible(x)
293    }
294    
295    summary.Corpus <- function(object, ...) {
296        print(object)
297        if (length(DMetaData(object)) > 0) {
298            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
299                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
300                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
301                        length(CMetaData(object)$MetaData)))
302            cat("Available tags are:\n")
303            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
304            cat("Available variables in the data frame are:\n")
305            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
306        }
307    }
308    
309    inspect <- function(x) UseMethod("inspect", x)
310    inspect.PCorpus <- function(x) {
311        summary(x)
312        cat("\n")
313        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
314        show(filehash::dbMultiFetch(db, unlist(x)))
315    }
316    inspect.VCorpus <- function(x) {
317        summary(x)
318        cat("\n")
319        print(noquote(lapply(x, identity)))
320    }
321    
322    lapply.PCorpus <- function(X, FUN, ...) {
323        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
324        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
325    }
326    lapply.VCorpus <- function(X, FUN, ...) {
327        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
328        if (!is.null(lazyTmMap))
329            .Call("copyCorpus", X, materialize(X))
330        base::lapply(X, FUN, ...)
331    }
332    
333    writeCorpus <-  function(x, path = ".", filenames = NULL) {
334        filenames <- file.path(path,
335                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
336                               else filenames)
337        i <- 1
338        for (o in x) {
339            writeLines(as.PlainTextDocument(o), filenames[i])
340            i <- i + 1
341        }
342  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1114

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge