SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 37, Wed Jan 11 17:49:17 2006 UTC pkg/R/corpus.R revision 1095, Wed Aug 25 19:05:38 2010 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))  .PCorpus <- function(x, cmeta, dmeta, dbcontrol) {
4  setMethod("textdoccol",      attr(x, "CMetaData") <- cmeta
5            c("character", "character", "logical", "logical"),      attr(x, "DMetaData") <- dmeta
6            function(object, inputType = "CSV", stripWhiteSpace = FALSE, toLower = FALSE) {      attr(x, "DBControl") <- dbcontrol
7                # Add a new type for each unique input source format      class(x) <- c("PCorpus", "Corpus", "list")
8                type <- match.arg(inputType,c("CSV","RCV1","REUT21578"))      x
9                switch(type,  }
10                       # Text in a special CSV format  DBControl <- function(x) attr(x, "DBControl")
11                       # For details on the file format see the R documentation file  
12                       # The first argument is a directory with .csv files  PCorpus <- function(x,
13                       "CSV" = {                      readerControl = list(reader = x$DefaultReader, language = "en"),
14                           tdl <- sapply(dir(object,                      dbControl = list(dbName = "", dbType = "DB1"),
15                                             pattern = ".csv",                      ...) {
16                                             full.names = TRUE),      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
17                                         function(file) {  
18                                             m <- as.matrix(read.csv(file, header = FALSE))      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
19                                             l <- vector("list", dim(m)[1])          stop("error in creating database")
20                                             for (i in 1:dim(m)[1]) {      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
21                                                 author <- ""  
22                                                 timestamp <- date()      # Allocate memory in advance if length is known
23                                                 description <- ""      tdl <- if (x$Length > 0)
24                                                 id <- as.integer(m[i,1])          vector("list", as.integer(x$Length))
                                                corpus <- as.character(m[i,2:dim(m)[2]])  
                                                if (stripWhiteSpace)  
                                                    corpus <- gsub("[[:space:]]+", " ", corpus)  
                                                if (toLower)  
                                                    corpus <- tolower(corpus)  
                                                origin <- "CSV"  
                                                heading <- ""  
   
                                                l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                                              description = description, id = id, origin = origin, heading = heading)  
                                            }  
                                            l  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      },  
                      # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  
                      # The first argument is a directory with the RCV1 XML files  
                      "RCV1" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower)  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # TODO: Implement lacking fields as soon I have access to the original RCV1  
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Reuters Corpus Volume 1 XML"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["AUTHOR"]]))  
         author <- xmlValue(node[["TEXT"]][["AUTHOR"]])  
25      else      else
26          author <- ""          list()
27    
28        counter <- 1
29        while (!eoi(x)) {
30            x <- stepNext(x)
31            elem <- getElem(x)
32            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
33            filehash::dbInsert(db, ID(doc), doc)
34            if (x$Length > 0) tdl[[counter]] <- ID(doc)
35            else tdl <- c(tdl, ID(doc))
36            counter <- counter + 1
37        }
38        names(tdl) <- x$Names
39    
40      timestamp <- xmlValue(node[["DATE"]])      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
41      description <- ""      filehash::dbInsert(db, "DMetaData", df)
42      id <- as.integer(xmlAttrs(node)[["NEWID"]])      dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
43    
44      origin <- "Reuters-21578 XML"      .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
45    }
46    
47    .VCorpus <- function(x, cmeta, dmeta) {
48        attr(x, "CMetaData") <- cmeta
49        attr(x, "DMetaData") <- dmeta
50        class(x) <- c("VCorpus", "Corpus", "list")
51        x
52    }
53    
54      # The <BODY></BODY> tag is unfortunately NOT obligatory!  # Register S3 corpus classes to be recognized by S4 methods. This is
55      if (!is.null(node[["TEXT"]][["BODY"]]))  # mainly a fix to be compatible with packages which were originally
56          corpus <- xmlValue(node[["TEXT"]][["BODY"]])  # developed to cooperate with corresponding S4 tm classes. Necessary
57    # since tm's class architecture was changed to S3 since tm version 0.5.
58    setOldClass(c("VCorpus", "Corpus", "list"))
59    
60    # The "..." are additional arguments for the FunctionGenerator reader
61    VCorpus <- Corpus <- function(x,
62                                  readerControl = list(reader = x$DefaultReader, language = "en"),
63                                  ...) {
64        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
65    
66        # Allocate memory in advance if length is known
67        tdl <- if (x$Length > 0)
68            vector("list", as.integer(x$Length))
69      else      else
70          corpus <- ""          list()
71    
72      if (stripWhiteSpace)      if (x$Vectorized)
73          corpus <- gsub("[[:space:]]+", " ", corpus)          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
74      if (toLower)                        pGetElem(x),
75          corpus <- tolower(corpus)                        id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
76                          SIMPLIFY = FALSE)
77      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      else {
78      if (!is.null(node[["TEXT"]][["TITLE"]]))          counter <- 1
79          heading <- xmlValue(node[["TEXT"]][["TITLE"]])          while (!eoi(x)) {
80                x <- stepNext(x)
81                elem <- getElem(x)
82                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
83                if (x$Length > 0)
84                    tdl[[counter]] <- doc
85      else      else
86          heading <- ""                  tdl <- c(tdl, list(doc))
87                counter <- counter + 1
88            }
89        }
90        names(tdl) <- x$Names
91        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
92        .VCorpus(tdl, .MetaDataNode(), df)
93    }
94    
95    `[.PCorpus` <- function(x, i) {
96        if (missing(i)) return(x)
97        index <- attr(x, "DMetaData")[[1 , "subset"]]
98        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
99        dmeta <- attr(x, "DMetaData")
100        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
101    }
102    
103      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  `[.VCorpus` <- function(x, i) {
104          description = description, id = id, origin = origin, heading = heading)      if (missing(i)) return(x)
105        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
106    }
107    
108    `[<-.PCorpus` <- function(x, i, value) {
109        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
110        counter <- 1
111        for (id in unclass(x)[i]) {
112            if (identical(length(value), 1L)) db[[id]] <- value
113            else db[[id]] <- value[[counter]]
114            counter <- counter + 1
115        }
116        x
117    }
118    
119    `[[.PCorpus` <-  function(x, i) {
120        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
121        filehash::dbFetch(db, NextMethod("[["))
122    }
123    `[[.VCorpus` <-  function(x, i) {
124        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
125        if (!is.null(lazyTmMap))
126            .Call("copyCorpus", x, materialize(x, i))
127        NextMethod("[[")
128    }
129    
130    `[[<-.PCorpus` <-  function(x, i, value) {
131        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
132        index <- unclass(x)[[i]]
133        db[[index]] <- value
134        x
135    }
136    `[[<-.VCorpus` <-  function(x, i, value) {
137        # Mark new objects as not active for lazy mapping
138        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
139        if (!is.null(lazyTmMap)) {
140            lazyTmMap$index[i] <- FALSE
141            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
142        }
143        # Set the value
144        cl <- class(x)
145        y <- NextMethod("[[<-")
146        class(y) <- cl
147        y
148    }
149    
150    # Update NodeIDs of a CMetaData tree
151    .update_id <- function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0) {
152        # Traversal of (binary) CMetaData tree with setup of NodeIDs
153        set_id <- function(x) {
154            x$NodeID <- id
155            id <<- id + 1
156            level <<- level + 1
157            if (length(x$Children) > 0) {
158                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
159                left <- set_id(x$Children[[1]])
160                if (level == 1) {
161                    left.mapping <<- mapping
162                    mapping <<- NULL
163                }
164                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
165                right <- set_id(x$Children[[2]])
166    
167                x$Children <- list(left, right)
168            }
169            level <<- level - 1
170            x
171        }
172        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
173    }
174    
175    # Find indices to be updated for a CMetaData tree
176    .find_indices <- function(x) {
177        indices.mapping <- NULL
178        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
179            indices <- (DMetaData(x)$MetaID == m)
180            indices.mapping <- c(indices.mapping, list(m = indices))
181            names(indices.mapping)[length(indices.mapping)] <- m
182        }
183        indices.mapping
184    }
185    
186    c2 <- function(x, y, ...) {
187        # Update the CMetaData tree
188        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
189        update.struct <- .update_id(cmeta)
190    
191        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
192    
193        # Find indices to be updated for the left tree
194        indices.mapping <- .find_indices(x)
195    
196        # Update the DMetaData data frames for the left tree
197        for (i in 1:ncol(update.struct$left.mapping)) {
198            map <- update.struct$left.mapping[,i]
199            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
200        }
201    
202        # Find indices to be updated for the right tree
203        indices.mapping <- .find_indices(y)
204    
205        # Update the DMetaData data frames for the right tree
206        for (i in 1:ncol(update.struct$right.mapping)) {
207            map <- update.struct$right.mapping[,i]
208            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
209        }
210    
211        # Merge the DMetaData data frames
212        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
213        na.matrix <- matrix(NA, nrow = nrow(DMetaData(x)), ncol = length(labels), dimnames = list(row.names(DMetaData(x)), labels))
214        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
215        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
216        na.matrix <- matrix(NA, nrow = nrow(DMetaData(y)), ncol = length(labels), dimnames = list(row.names(DMetaData(y)), labels))
217        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
218        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
219    
220        new
221    }
222    
223    c.Corpus <-
224    function(x, ..., recursive = FALSE)
225    {
226        args <- list(...)
227    
228        if (identical(length(args), 0L))
229            return(x)
230    
231        if (!all(unlist(lapply(args, inherits, class(x)))))
232            stop("not all arguments are of the same corpus type")
233    
234        if (inherits(x, "PCorpus"))
235            stop("concatenation of corpora with underlying databases is not supported")
236    
237        l <- base::c(list(x), args)
238        if (recursive)
239            Reduce(c2, l)
240        else {
241            l <- do.call("c", lapply(l, unclass))
242            .VCorpus(l,
243                     cmeta = .MetaDataNode(),
244                     dmeta = data.frame(MetaID = rep(0, length(l)), stringsAsFactors = FALSE))
245        }
246    }
247    
248    c.TextDocument <- function(x, ..., recursive = FALSE) {
249        args <- list(...)
250    
251        if (identical(length(args), 0L))
252            return(x)
253    
254        if (!all(unlist(lapply(args, inherits, class(x)))))
255            stop("not all arguments are text documents")
256    
257        dmeta <- data.frame(MetaID = rep(0, length(list(x, ...))), stringsAsFactors = FALSE)
258        .VCorpus(list(x, ...), .MetaDataNode(), dmeta)
259    }
260    
261    print.Corpus <- function(x, ...) {
262        cat(sprintf(ngettext(length(x),
263                             "A corpus with %d text document\n",
264                             "A corpus with %d text documents\n"),
265                    length(x)))
266        invisible(x)
267    }
268    
269    summary.Corpus <- function(object, ...) {
270        print(object)
271        if (length(DMetaData(object)) > 0) {
272            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
273                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
274                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
275                        length(CMetaData(object)$MetaData)))
276            cat("Available tags are:\n")
277            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
278            cat("Available variables in the data frame are:\n")
279            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
280        }
281    }
282    
283    inspect <- function(x) UseMethod("inspect", x)
284    inspect.PCorpus <- function(x) {
285        summary(x)
286        cat("\n")
287        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
288        show(filehash::dbMultiFetch(db, unlist(x)))
289    }
290    inspect.VCorpus <- function(x) {
291        summary(x)
292        cat("\n")
293        print(noquote(lapply(x, identity)))
294    }
295    
296    lapply.PCorpus <- function(X, FUN, ...) {
297        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
298        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
299    }
300    lapply.VCorpus <- function(X, FUN, ...) {
301        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
302        if (!is.null(lazyTmMap))
303            .Call("copyCorpus", X, materialize(X))
304        base::lapply(X, FUN, ...)
305    }
306    
307    writeCorpus <-  function(x, path = ".", filenames = NULL) {
308        filenames <- file.path(path,
309                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
310                               else filenames)
311        i <- 1
312        for (o in x) {
313            writeLines(as.PlainTextDocument(o), filenames[i])
314            i <- i + 1
315        }
316  }  }

Legend:
Removed from v.37  
changed lines
  Added in v.1095

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge