SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1242, Mon Aug 19 05:33:57 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <-
4  # Text document collection  function(x, cmeta, dmeta, dbcontrol)
5  setClass("textdoccol",  {
6           contains = c("list"))      attr(x, "CMetaData") <- cmeta
7        attr(x, "DMetaData") <- dmeta
8  # Constructors      attr(x, "DBControl") <- dbcontrol
9        class(x) <- c("PCorpus", "Corpus", "list")
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))      x
11  setMethod("textdoccol",  }
12            c("character", "character", "logical", "logical"),  
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {  DBControl <-
14    function(x)
15                # Add a new type for each unique input source format      attr(x, "DBControl")
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))  
17                switch(type,  PCorpus <-
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  function(x,
19                       # For the moment the first argument is still a single file           readerControl = list(reader = x$DefaultReader, language = "en"),
20                       # This will be changed to a directory as soon as we have the full RCV1 data set           dbControl = list(dbName = "", dbType = "DB1"),
21                       "RCV1" = {           ...)
22                           tree <- xmlTreeParse(object)  {
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24                       },  
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)      if (is.function(readerControl$init))
26                       # For details on the file format see data/Umfrage.csv          readerControl$init()
27                       # The first argument has to be a single file  
28                       "CSV" = {      if (is.function(readerControl$exit))
29                           m <- as.matrix(read.csv(object))          on.exit(readerControl$exit())
30                           l <- vector("list", dim(m)[1])  
31                           for (i in 1:dim(m)[1]) {      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
32                               author <- "Not yet implemented"          stop("error in creating database")
33                               timestamp <- date()      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
34                               description <- "Not yet implemented"  
35                               id <- i      # Allocate memory in advance if length is known
36                               corpus <- as.character(m[i,2:dim(m)[2]])      tdl <- if (x$Length > 0)
37                               if (stripWhiteSpace)          vector("list", as.integer(x$Length))
                                  corpus <- gsub("[[:space:]]+", " ", corpus)  
                              if (toLower)  
                                  corpus <- tolower(corpus)  
                              origin <- "Not yet implemented"  
                              heading <- "Not yet implemented"  
   
                              l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                  description = description, id = id, origin = origin, heading = heading)  
                          }  
                          tdcl <- new("textdoccol", .Data = l)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
   
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
38      else      else
39          corpus <- ""          list()
40    
41        counter <- 1
42        while (!eoi(x)) {
43            x <- stepNext(x)
44            elem <- getElem(x)
45            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
46            filehash::dbInsert(db, ID(doc), doc)
47            if (x$Length > 0) tdl[[counter]] <- ID(doc)
48            else tdl <- c(tdl, ID(doc))
49            counter <- counter + 1
50        }
51        names(tdl) <- x$Names
52    
53        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
54        filehash::dbInsert(db, "DMetaData", df)
55        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
56    
57        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58    }
59    
60    .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63        attr(x, "CMetaData") <- cmeta
64        attr(x, "DMetaData") <- dmeta
65        class(x) <- c("VCorpus", "Corpus", "list")
66        x
67    }
68    
69      if (stripWhiteSpace)  # The "..." are additional arguments for the FunctionGenerator reader
70          corpus <- gsub("[[:space:]]+", " ", corpus)  VCorpus <-
71      if (toLower)  Corpus <-
72          corpus <- tolower(corpus)  function(x,
73             readerControl = list(reader = x$DefaultReader, language = "en"),
74      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!           ...)
75      if (!is.null(node[["TEXT"]][["TITLE"]]))  {
76          heading <- xmlValue(node[["TEXT"]][["TITLE"]])      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
77    
78        if (is.function(readerControl$init))
79            readerControl$init()
80    
81        if (is.function(readerControl$exit))
82            on.exit(readerControl$exit())
83    
84        # Allocate memory in advance if length is known
85        tdl <- if (x$Length > 0)
86            vector("list", as.integer(x$Length))
87      else      else
88          heading <- ""          list()
89    
90        if (x$Vectorized)
91            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
92                          pGetElem(x),
93                          id = if (is.null(x$Names)) as.character(seq_len(x$Length)) else x$Names,
94                          SIMPLIFY = FALSE)
95        else {
96            counter <- 1
97            while (!eoi(x)) {
98                x <- stepNext(x)
99                elem <- getElem(x)
100                doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
101                if (x$Length > 0)
102                    tdl[[counter]] <- doc
103                else
104                    tdl <- c(tdl, list(doc))
105                counter <- counter + 1
106            }
107        }
108        names(tdl) <- x$Names
109        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
110        .VCorpus(tdl, .MetaDataNode(), df)
111    }
112    
113    `[.PCorpus` <-
114    function(x, i)
115    {
116        if (missing(i)) return(x)
117        index <- attr(x, "DMetaData")[[1 , "subset"]]
118        attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
119        dmeta <- attr(x, "DMetaData")
120        .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
121    }
122    
123    `[.VCorpus` <-
124    function(x, i)
125    {
126        if (missing(i)) return(x)
127        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
128    }
129    
130    `[<-.PCorpus` <-
131    function(x, i, value)
132    {
133        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
134        counter <- 1
135        for (id in unclass(x)[i]) {
136            if (identical(length(value), 1L)) db[[id]] <- value
137            else db[[id]] <- value[[counter]]
138            counter <- counter + 1
139        }
140        x
141    }
142    
143    .map_name_index <-
144    function(x, i)
145    {
146        if (is.character(i)) {
147            if (is.null(names(x)))
148                match(i, meta(x, "ID", type = "local"))
149            else
150                match(i, names(x))
151        }
152        i
153    }
154    
155    `[[.PCorpus` <-
156    function(x, i)
157    {
158        i <- .map_name_index(x, i)
159        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
160        filehash::dbFetch(db, NextMethod("[["))
161    }
162    `[[.VCorpus` <-
163    function(x, i)
164    {
165        i <- .map_name_index(x, i)
166        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
167        if (!is.null(lazyTmMap))
168            .Call("copyCorpus", x, materialize(x, i))
169        NextMethod("[[")
170    }
171    
172    `[[<-.PCorpus` <-
173    function(x, i, value)
174    {
175        i <- .map_name_index(x, i)
176        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
177        index <- unclass(x)[[i]]
178        db[[index]] <- value
179        x
180    }
181    `[[<-.VCorpus` <-
182    function(x, i, value)
183    {
184        i <- .map_name_index(x, i)
185        # Mark new objects as not active for lazy mapping
186        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
187        if (!is.null(lazyTmMap)) {
188            lazyTmMap$index[i] <- FALSE
189            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
190        }
191        # Set the value
192        cl <- class(x)
193        y <- NextMethod("[[<-")
194        class(y) <- cl
195        y
196    }
197    
198    # Update NodeIDs of a CMetaData tree
199    .update_id <-
200    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
201    {
202        # Traversal of (binary) CMetaData tree with setup of NodeIDs
203        set_id <- function(x) {
204            x$NodeID <- id
205            id <<- id + 1
206            level <<- level + 1
207            if (length(x$Children) > 0) {
208                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
209                left <- set_id(x$Children[[1]])
210                if (level == 1) {
211                    left.mapping <<- mapping
212                    mapping <<- NULL
213                }
214                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
215                right <- set_id(x$Children[[2]])
216    
217                x$Children <- list(left, right)
218            }
219            level <<- level - 1
220            x
221        }
222        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
223    }
224    
225      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  # Find indices to be updated for a CMetaData tree
226          description = description, id = id, origin = origin, heading = heading)  .find_indices <-
227    function(x)
228    {
229        indices.mapping <- NULL
230        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
231            indices <- (DMetaData(x)$MetaID == m)
232            indices.mapping <- c(indices.mapping, list(m = indices))
233            names(indices.mapping)[length(indices.mapping)] <- m
234        }
235        indices.mapping
236    }
237    
238    c2 <-
239    function(x, y, ...)
240    {
241        # Update the CMetaData tree
242        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
243        update.struct <- .update_id(cmeta)
244    
245        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
246    
247        # Find indices to be updated for the left tree
248        indices.mapping <- .find_indices(x)
249    
250        # Update the DMetaData data frames for the left tree
251        for (i in 1:ncol(update.struct$left.mapping)) {
252            map <- update.struct$left.mapping[,i]
253            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
254        }
255    
256        # Find indices to be updated for the right tree
257        indices.mapping <- .find_indices(y)
258    
259        # Update the DMetaData data frames for the right tree
260        for (i in 1:ncol(update.struct$right.mapping)) {
261            map <- update.struct$right.mapping[,i]
262            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
263        }
264    
265        # Merge the DMetaData data frames
266        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
267        na.matrix <- matrix(NA,
268                            nrow = nrow(DMetaData(x)),
269                            ncol = length(labels),
270                            dimnames = list(row.names(DMetaData(x)), labels))
271        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
272        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
273        na.matrix <- matrix(NA,
274                            nrow = nrow(DMetaData(y)),
275                            ncol = length(labels),
276                            dimnames = list(row.names(DMetaData(y)), labels))
277        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
278        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
279    
280        new
281    }
282    
283    c.Corpus <-
284    function(..., recursive = FALSE)
285    {
286        args <- list(...)
287        x <- args[[1L]]
288    
289        if(length(args) == 1L)
290            return(x)
291    
292        if (!all(unlist(lapply(args, inherits, class(x)))))
293            stop("not all arguments are of the same corpus type")
294    
295        if (inherits(x, "PCorpus"))
296            stop("concatenation of corpora with underlying databases is not supported")
297    
298        if (recursive)
299            Reduce(c2, args)
300        else {
301            args <- do.call("c", lapply(args, unclass))
302            .VCorpus(args,
303                     cmeta = .MetaDataNode(),
304                     dmeta = data.frame(MetaID = rep(0, length(args)),
305                                        stringsAsFactors = FALSE))
306        }
307    }
308    
309    c.TextDocument <-
310    function(..., recursive = FALSE)
311    {
312        args <- list(...)
313        x <- args[[1L]]
314    
315        if(length(args) == 1L)
316            return(x)
317    
318        if (!all(unlist(lapply(args, inherits, class(x)))))
319            stop("not all arguments are text documents")
320    
321        dmeta <- data.frame(MetaID = rep(0, length(args)),
322                            stringsAsFactors = FALSE)
323        .VCorpus(args, .MetaDataNode(), dmeta)
324    }
325    
326    print.Corpus <-
327    function(x, ...)
328    {
329        cat(sprintf(ngettext(length(x),
330                             "A corpus with %d text document\n",
331                             "A corpus with %d text documents\n"),
332                    length(x)))
333        invisible(x)
334    }
335    
336    summary.Corpus <-
337    function(object, ...)
338    {
339        print(object)
340        if (length(DMetaData(object)) > 0) {
341            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
342                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
343                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
344                        length(CMetaData(object)$MetaData)))
345            cat("Available tags are:\n")
346            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
347            cat("Available variables in the data frame are:\n")
348            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
349        }
350    }
351    
352    inspect <-
353    function(x)
354        UseMethod("inspect", x)
355    inspect.PCorpus <-
356    function(x)
357    {
358        summary(x)
359        cat("\n")
360        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
361        show(filehash::dbMultiFetch(db, unlist(x)))
362    }
363    inspect.VCorpus <-
364    function(x)
365    {
366        summary(x)
367        cat("\n")
368        print(noquote(lapply(x, identity)))
369    }
370    
371    lapply.PCorpus <-
372    function(X, FUN, ...)
373    {
374        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
375        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
376    }
377    lapply.VCorpus <-
378    function(X, FUN, ...)
379    {
380        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
381        if (!is.null(lazyTmMap))
382            .Call("copyCorpus", X, materialize(X))
383        base::lapply(X, FUN, ...)
384    }
385    
386    writeCorpus <-
387    function(x, path = ".", filenames = NULL)
388    {
389        filenames <- file.path(path,
390                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
391                               else filenames)
392        i <- 1
393        for (o in x) {
394            writeLines(as.PlainTextDocument(o), filenames[i])
395            i <- i + 1
396        }
397  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1242

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge