SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 37, Wed Jan 11 17:49:17 2006 UTC pkg/R/corpus.R revision 1108, Fri Oct 22 18:32:47 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  setMethod("textdoccol",      attr(x, "CMetaData") <- cmeta
5            c("character", "character", "logical", "logical"),      attr(x, "DMetaData") <- dmeta
6            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {      attr(x, "DBControl") <- dbcontrol
7                # Add a new type for each unique input source format      class(x) <- c("PCorpus", "Corpus", "list")
8                type <- match.arg(inputType,c("CSV","RCV1","REUT21578"))      x
9                switch(type,  }
10                       # Text in a special CSV format  DBControl <- function(x) attr(x, "DBControl")
11                       # For details on the file format see the R documentation file  
12                       # The first argument is a directory with .csv files  PCorpus <- function(x,
13                       "CSV" = {                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                           tdl <- sapply(dir(object,                      dbControl = list(dbName = "", dbType = "DB1"),
15                                             pattern = ".csv",                      ...) {
16                                             full.names = TRUE),      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                                         function(file) {  
18                                             m <- as.matrix(read.csv(file, header = FALSE))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19                                             l <- vector("list", dim(m)[1])          stop("error in creating database")
20                                             for (i in 1:dim(m)[1]) {      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21                                                 author <- ""  
22                                                 timestamp <- date()      # Allocate memory in advance if length is known
23                                                 description <- ""      tdl <- if (x$Length > 0)
24                                                 id <- as.integer(m[i,1])          vector("list", as.integer(x$Length))
                                                corpus <- as.character(m[i,2:dim(m)[2]])  
                                                if (stripWhiteSpace)  
                                                    corpus <- gsub("[[:space:]]+", " ", corpus)  
                                                if (toLower)  
                                                    corpus <- tolower(corpus)  
                                                origin <- "CSV"  
                                                heading <- ""  
   
                                                l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                                              description = description, id = id, origin = origin, heading = heading)  
                                            }  
                                            l  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      },  
                      # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  
                      # The first argument is a directory with the RCV1 XML files  
                      "RCV1" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower)  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # TODO: Implement lacking fields as soon I have access to the original RCV1  
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Reuters Corpus Volume 1 XML"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["AUTHOR"]]))  
         author <- xmlValue(node[["TEXT"]][["AUTHOR"]])  
25      else      else
26          author <- ""          list()
27    
28      timestamp <- xmlValue(node[["DATE"]])      counter <- 1
29      description <- ""      while (!eoi(x)) {
30      id <- as.integer(xmlAttrs(node)[["NEWID"]])          x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38        names(tdl) <- x$Names
39    
40      origin <- "Reuters-21578 XML"      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41        filehash::dbInsert(db, "DMetaData", df)
42        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
43    
44        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
45    }
46    
47      # The <BODY></BODY> tag is unfortunately NOT obligatory!  .VCorpus <- function(x, cmeta, dmeta) {
48      if (!is.null(node[["TEXT"]][["BODY"]]))      attr(x, "CMetaData") <- cmeta
49          corpus <- xmlValue(node[["TEXT"]][["BODY"]])      attr(x, "DMetaData") <- dmeta
50        class(x) <- c("VCorpus", "Corpus", "list")
51        x
52    }
53    
54    # Register S3 corpus classes to be recognized by S4 methods. This is
55    # mainly a fix to be compatible with packages which were originally
56    # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60    # The "..." are additional arguments for the FunctionGenerator reader
61    VCorpus <- Corpus <- function(x,
62                                  readerControl = list(reader = x$DefaultReader, language = "en"),
63                                  ...) {
64        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
66        # Allocate memory in advance if length is known
67        tdl <- if (x$Length > 0)
68            vector("list", as.integer(x$Length))
69      else      else
70          corpus <- ""          list()
71    
72      if (stripWhiteSpace)      if (x$Vectorized)
73          corpus <- gsub("[[:space:]]+", " ", corpus)          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74      if (toLower)                        pGetElem(x),
75          corpus <- tolower(corpus)                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                          SIMPLIFY = FALSE)
77      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      else {
78      if (!is.null(node[["TEXT"]][["TITLE"]]))          counter <- 1
79          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          while (!eoi(x)) {
80                x <- stepNext(x)
81                elem <- getElem(x)
82                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83                if (x$Length > 0)
84                    tdl[[counter]] <- doc
85      else      else
86          heading <- ""                  tdl <- c(tdl, list(doc))
87                counter <- counter + 1
88            }
89        }
90        names(tdl) <- x$Names
91        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92        .VCorpus(tdl, .MetaDataNode(), df)
93    }
94    
95    `[.PCorpus` <- function(x, i) {
96        if (missing(i)) return(x)
97        index <- attr(x, "DMetaData")[[1 , "subset"]]
98        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99        dmeta <- attr(x, "DMetaData")
100        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
101    }
102    
103    `[.VCorpus` <- function(x, i) {
104        if (missing(i)) return(x)
105        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
106    }
107    
108    `[<-.PCorpus` <- function(x, i, value) {
109        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110        counter <- 1
111        for (id in unclass(x)[i]) {
112            if (identical(length(value), 1L)) db[[id]] <- value
113            else db[[id]] <- value[[counter]]
114            counter <- counter + 1
115        }
116        x
117    }
118    
119      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  .map_name_index <- function(x, i) {
120          description = description, id = id, origin = origin, heading = heading)      if (is.character(i)) {
121            if (is.null(names(x)))
122                match(i, meta(x, "ID", type = "local"))
123            else
124                match(i, names(x))
125        }
126        i
127    }
128    
129    `[[.PCorpus` <-  function(x, i) {
130        i <- .map_name_index(x, i)
131        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132        filehash::dbFetch(db, NextMethod("[["))
133    }
134    `[[.VCorpus` <-  function(x, i) {
135        i <- .map_name_index(x, i)
136        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
137        if (!is.null(lazyTmMap))
138            .Call("copyCorpus", x, materialize(x, i))
139        NextMethod("[[")
140    }
141    
142    `[[<-.PCorpus` <-  function(x, i, value) {
143        i <- .map_name_index(x, i)
144        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
145        index <- unclass(x)[[i]]
146        db[[index]] <- value
147        x
148    }
149    `[[<-.VCorpus` <-  function(x, i, value) {
150        i <- .map_name_index(x, i)
151        # Mark new objects as not active for lazy mapping
152        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
153        if (!is.null(lazyTmMap)) {
154            lazyTmMap$index[i] <- FALSE
155            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
156        }
157        # Set the value
158        cl <- class(x)
159        y <- NextMethod("[[<-")
160        class(y) <- cl
161        y
162    }
163    
164    # Update NodeIDs of a CMetaData tree
165    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
166        # Traversal of (binary) CMetaData tree with setup of NodeIDs
167        set_id <- function(x) {
168            x$NodeID <- id
169            id <<- id + 1
170            level <<- level + 1
171            if (length(x$Children) > 0) {
172                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
173                left <- set_id(x$Children[[1]])
174                if (level == 1) {
175                    left.mapping <<- mapping
176                    mapping <<- NULL
177                }
178                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
179                right <- set_id(x$Children[[2]])
180    
181                x$Children <- list(left, right)
182            }
183            level <<- level - 1
184            x
185        }
186        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
187    }
188    
189    # Find indices to be updated for a CMetaData tree
190    .find_indices <- function(x) {
191        indices.mapping <- NULL
192        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
193            indices <- (DMetaData(x)$MetaID == m)
194            indices.mapping <- c(indices.mapping, list(m = indices))
195            names(indices.mapping)[length(indices.mapping)] <- m
196        }
197        indices.mapping
198    }
199    
200    c2 <- function(x, y, ...) {
201        # Update the CMetaData tree
202        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
203        update.struct <- .update_id(cmeta)
204    
205        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
206    
207        # Find indices to be updated for the left tree
208        indices.mapping <- .find_indices(x)
209    
210        # Update the DMetaData data frames for the left tree
211        for (i in 1:ncol(update.struct$left.mapping)) {
212            map <- update.struct$left.mapping[,i]
213            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
214        }
215    
216        # Find indices to be updated for the right tree
217        indices.mapping <- .find_indices(y)
218    
219        # Update the DMetaData data frames for the right tree
220        for (i in 1:ncol(update.struct$right.mapping)) {
221            map <- update.struct$right.mapping[,i]
222            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
223        }
224    
225        # Merge the DMetaData data frames
226        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
227        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
228        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
229        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
230        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
231        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
232        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
233    
234        new
235    }
236    
237    c.Corpus <-
238    function(x, ..., recursive = FALSE)
239    {
240        args <- list(...)
241    
242        if (identical(length(args), 0L))
243            return(x)
244    
245        if (!all(unlist(lapply(args, inherits, class(x)))))
246            stop("not all arguments are of the same corpus type")
247    
248        if (inherits(x, "PCorpus"))
249            stop("concatenation of corpora with underlying databases is not supported")
250    
251        l <- base::c(list(x), args)
252        if (recursive)
253            Reduce(c2, l)
254        else {
255            l <- do.call("c", lapply(l, unclass))
256            .VCorpus(l,
257                     cmeta = .MetaDataNode(),
258                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
259        }
260    }
261    
262    c.TextDocument <- function(x, ..., recursive = FALSE) {
263        args <- list(...)
264    
265        if (identical(length(args), 0L))
266            return(x)
267    
268        if (!all(unlist(lapply(args, inherits, class(x)))))
269            stop("not all arguments are text documents")
270    
271        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
272        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
273    }
274    
275    print.Corpus <- function(x, ...) {
276        cat(sprintf(ngettext(length(x),
277                             "A corpus with %d text document\n",
278                             "A corpus with %d text documents\n"),
279                    length(x)))
280        invisible(x)
281    }
282    
283    summary.Corpus <- function(object, ...) {
284        print(object)
285        if (length(DMetaData(object)) > 0) {
286            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
287                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
288                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
289                        length(CMetaData(object)$MetaData)))
290            cat("Available tags are:\n")
291            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
292            cat("Available variables in the data frame are:\n")
293            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
294        }
295    }
296    
297    inspect <- function(x) UseMethod("inspect", x)
298    inspect.PCorpus <- function(x) {
299        summary(x)
300        cat("\n")
301        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
302        show(filehash::dbMultiFetch(db, unlist(x)))
303    }
304    inspect.VCorpus <- function(x) {
305        summary(x)
306        cat("\n")
307        print(noquote(lapply(x, identity)))
308    }
309    
310    lapply.PCorpus <- function(X, FUN, ...) {
311        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
312        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
313    }
314    lapply.VCorpus <- function(X, FUN, ...) {
315        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
316        if (!is.null(lazyTmMap))
317            .Call("copyCorpus", X, materialize(X))
318        base::lapply(X, FUN, ...)
319    }
320    
321    writeCorpus <-  function(x, path = ".", filenames = NULL) {
322        filenames <- file.path(path,
323                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
324                               else filenames)
325        i <- 1
326        for (o in x) {
327            writeLines(as.PlainTextDocument(o), filenames[i])
328            i <- i + 1
329        }
330  }  }

Legend:
Removed from v.37  
changed lines
  Added in v.1108

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge