SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 32, Thu Dec 15 13:13:54 2005 UTC pkg/R/corpus.R revision 1258, Fri Sep 20 12:15:42 2013 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  .PCorpus <-
4  # Text document collection  function(x, cmeta, dmeta, dbcontrol)
5  setClass("textdoccol",  {
6           contains = c("list"))      attr(x, "CMetaData") <- cmeta
7        attr(x, "DMetaData") <- dmeta
8  # Constructors      attr(x, "DBControl") <- dbcontrol
9        class(x) <- c("PCorpus", "Corpus", "list")
10  setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))      x
11  setMethod("textdoccol",  }
12            c("character", "character", "logical", "logical"),  
13            function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {  DBControl <-
14    function(x)
15                # Add a new type for each unique input source format      attr(x, "DBControl")
16                type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))  
17                switch(type,  PCorpus <-
18                       # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format  function(x,
19                       # For the moment the first argument is still a single file           readerControl = list(reader = x$DefaultReader, language = "en"),
20                       # This will be changed to a directory as soon as we have the full RCV1 data set           dbControl = list(dbName = "", dbType = "DB1"),
21                       "RCV1" = {           ...)
22                           tree <- xmlTreeParse(object)  {
23                           tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))      readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
24                       },  
25                       # Text in a special CSV format (as e.g. exported from an Excel sheet)      if (is.function(readerControl$init))
26                       # For details on the file format see data/Umfrage.csv          readerControl$init()
27                       # The first argument has to be a single file  
28                       "CSV" = {      if (is.function(readerControl$exit))
29                           m <- as.matrix(read.csv(object))          on.exit(readerControl$exit())
30                           l <- vector("list", dim(m)[1])  
31                           for (i in 1:dim(m)[1]) {      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
32                               author <- "Not yet implemented"          stop("error in creating database")
33                               timestamp <- date()      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
34                               description <- "Not yet implemented"  
35                               id <- i      # Allocate memory in advance if length is known
36                               corpus <- as.character(m[i,2:dim(m)[2]])      tdl <- if (x$Length > 0)
37                               if (stripWhiteSpace)          vector("list", as.integer(x$Length))
                                  corpus <- gsub("[[:space:]]+", " ", corpus)  
                              if (toLower)  
                                  corpus <- tolower(corpus)  
                              origin <- "Not yet implemented"  
                              heading <- "Not yet implemented"  
   
                              l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
                                  description = description, id = id, origin = origin, heading = heading)  
                          }  
                          tdcl <- new("textdoccol", .Data = l)  
                      },  
                      # Read in text documents in Reuters-21578 XML (not SGML) format  
                      # Typically the first argument will be a directory where we can  
                      # find the files reut2-000.xml ... reut2-021.xml  
                      "REUT21578" = {  
                          tdl <- sapply(dir(object,  
                                            pattern = ".xml",  
                                            full.names = TRUE),  
                                        function(file) {  
                                            tree <- xmlTreeParse(file)  
                                            xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)  
                                        })  
   
                          tdcl <- new("textdoccol", .Data = tdl)  
                      })  
               tdcl  
           })  
   
 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  
 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlAttrs(node)[["date"]]  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["itemid"]])  
     origin <- "Not yet implemented"  
     corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  
   
     if (stripWhiteSpace)  
         corpus <- gsub("[[:space:]]+", " ", corpus)  
     if (toLower)  
         corpus <- tolower(corpus)  
   
     heading <- xmlValue(node[["title"]])  
   
     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  
         description = description, id = id, origin = origin, heading = heading)  
 }  
   
 # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  
 parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {  
     author <- "Not yet implemented"  
     timestamp <- xmlValue(node[["DATE"]])  
     description <- "Not yet implemented"  
     id <- as.integer(xmlAttrs(node)[["NEWID"]])  
   
     origin <- "Not yet implemented"  
   
     # The <BODY></BODY> tag is unfortunately NOT obligatory!  
     if (!is.null(node[["TEXT"]][["BODY"]]))  
         corpus <- xmlValue(node[["TEXT"]][["BODY"]])  
38      else      else
39          corpus <- ""          list()
40    
41        counter <- 1
42        while (!eoi(x)) {
43            x <- stepNext(x)
44            elem <- getElem(x)
45            doc <- readerControl$reader(elem, readerControl$language, if (is.null(x$Names)) as.character(counter) else x$Names[counter])
46            filehash::dbInsert(db, ID(doc), doc)
47            if (x$Length > 0) tdl[[counter]] <- ID(doc)
48            else tdl <- c(tdl, ID(doc))
49            counter <- counter + 1
50        }
51        names(tdl) <- x$Names
52    
53        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
54        filehash::dbInsert(db, "DMetaData", df)
55        dmeta.df <- data.frame(key = "DMetaData", subset = I(list(NA)))
56    
57        .PCorpus(tdl, .MetaDataNode(), dmeta.df, dbControl)
58    }
59    
60    .VCorpus <-
61    function(x, cmeta, dmeta)
62    {
63        attr(x, "CMetaData") <- cmeta
64        attr(x, "DMetaData") <- dmeta
65        class(x) <- c("VCorpus", "Corpus", "list")
66        x
67    }
68    
69    # The "..." are additional arguments for the FunctionGenerator reader
70    VCorpus <-
71    Corpus <-
72    function(x,
73             readerControl = list(reader = x$DefaultReader, language = "en"),
74             ...)
75    {
76        readerControl <- prepareReader(readerControl, x$DefaultReader, ...)
77    
78        if (is.function(readerControl$init))
79            readerControl$init()
80    
81        if (is.function(readerControl$exit))
82            on.exit(readerControl$exit())
83    
84        # Allocate memory in advance if length is known
85        tdl <- if (x$Length > 0)
86            vector("list", as.integer(x$Length))
87        else
88            list()
89    
90        if (x$Vectorized)
91            tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),
92                          pGetElem(x),
93                          id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,
94                          SIMPLIFY = FALSE)
95        else {
96            counter <- 1
97            while (!eoi(x)) {
98                x <- stepNext(x)
99                elem <- getElem(x)
100                id <- if (is.null(x$Names) || is.na(x$Names))
101                    as.character(counter)
102                else
103                    x$Names[counter]
104                doc <- readerControl$reader(elem, readerControl$language, id)
105                if (x$Length > 0)
106                    tdl[[counter]] <- doc
107                else
108                    tdl <- c(tdl, list(doc))
109                counter <- counter + 1
110            }
111        }
112        names(tdl) <- x$Names
113        df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
114        .VCorpus(tdl, .MetaDataNode(), df)
115    }
116    
117      if (stripWhiteSpace)  `[.PCorpus` <-
118          corpus <- gsub("[[:space:]]+", " ", corpus)  function(x, i)
119      if (toLower)  {
120          corpus <- tolower(corpus)      if (missing(i)) return(x)
121        index <- attr(x, "DMetaData")[[1 , "subset"]]
122      # The <TITLE></TITLE> tag is unfortunately NOT obligatory!      attr(x, "DMetaData")[[1 , "subset"]] <- if (is.numeric(index)) index[i] else i
123      if (!is.null(node[["TEXT"]][["TITLE"]]))      dmeta <- attr(x, "DMetaData")
124          heading <- xmlValue(node[["TEXT"]][["TITLE"]])      .PCorpus(NextMethod("["), CMetaData(x), dmeta, DBControl(x))
125    }
126    
127    `[.VCorpus` <-
128    function(x, i)
129    {
130        if (missing(i)) return(x)
131        .VCorpus(NextMethod("["), CMetaData(x), DMetaData(x)[i, , drop = FALSE])
132    }
133    
134    `[<-.PCorpus` <-
135    function(x, i, value)
136    {
137        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
138        counter <- 1
139        for (id in unclass(x)[i]) {
140            if (identical(length(value), 1L)) db[[id]] <- value
141            else db[[id]] <- value[[counter]]
142            counter <- counter + 1
143        }
144        x
145    }
146    
147    .map_name_index <-
148    function(x, i)
149    {
150        if (is.character(i)) {
151            if (is.null(names(x)))
152                match(i, meta(x, "ID", type = "local"))
153      else      else
154          heading <- ""              match(i, names(x))
155        }
156        i
157    }
158    
159      new("textdocument", .Data = corpus, author = author, timestamp = timestamp,  `[[.PCorpus` <-
160          description = description, id = id, origin = origin, heading = heading)  function(x, i)
161    {
162        i <- .map_name_index(x, i)
163        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
164        filehash::dbFetch(db, NextMethod("[["))
165    }
166    `[[.VCorpus` <-
167    function(x, i)
168    {
169        i <- .map_name_index(x, i)
170        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
171        if (!is.null(lazyTmMap))
172            .Call("copyCorpus", x, materialize(x, i))
173        NextMethod("[[")
174    }
175    
176    `[[<-.PCorpus` <-
177    function(x, i, value)
178    {
179        i <- .map_name_index(x, i)
180        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
181        index <- unclass(x)[[i]]
182        db[[index]] <- value
183        x
184    }
185    `[[<-.VCorpus` <-
186    function(x, i, value)
187    {
188        i <- .map_name_index(x, i)
189        # Mark new objects as not active for lazy mapping
190        lazyTmMap <- meta(x, tag = "lazyTmMap", type = "corpus")
191        if (!is.null(lazyTmMap)) {
192            lazyTmMap$index[i] <- FALSE
193            meta(x, tag = "lazyTmMap", type = "corpus") <- lazyTmMap
194        }
195        # Set the value
196        cl <- class(x)
197        y <- NextMethod("[[<-")
198        class(y) <- cl
199        y
200    }
201    
202    # Update NodeIDs of a CMetaData tree
203    .update_id <-
204    function(x, id = 0, mapping = NULL, left.mapping = NULL, level = 0)
205    {
206        # Traversal of (binary) CMetaData tree with setup of NodeIDs
207        set_id <- function(x) {
208            x$NodeID <- id
209            id <<- id + 1
210            level <<- level + 1
211            if (length(x$Children) > 0) {
212                mapping <<- cbind(mapping, c(x$Children[[1]]$NodeID, id))
213                left <- set_id(x$Children[[1]])
214                if (level == 1) {
215                    left.mapping <<- mapping
216                    mapping <<- NULL
217                }
218                mapping <<- cbind(mapping, c(x$Children[[2]]$NodeID, id))
219                right <- set_id(x$Children[[2]])
220    
221                x$Children <- list(left, right)
222            }
223            level <<- level - 1
224            x
225        }
226        list(root = set_id(x), left.mapping = left.mapping, right.mapping = mapping)
227    }
228    
229    # Find indices to be updated for a CMetaData tree
230    .find_indices <-
231    function(x)
232    {
233        indices.mapping <- NULL
234        for (m in levels(as.factor(DMetaData(x)$MetaID))) {
235            indices <- (DMetaData(x)$MetaID == m)
236            indices.mapping <- c(indices.mapping, list(m = indices))
237            names(indices.mapping)[length(indices.mapping)] <- m
238        }
239        indices.mapping
240    }
241    
242    c2 <-
243    function(x, y, ...)
244    {
245        # Update the CMetaData tree
246        cmeta <- .MetaDataNode(0, list(merge_date = as.POSIXlt(Sys.time(), tz = "GMT"), merger = Sys.getenv("LOGNAME")), list(CMetaData(x), CMetaData(y)))
247        update.struct <- .update_id(cmeta)
248    
249        new <- .VCorpus(c(unclass(x), unclass(y)), update.struct$root, NULL)
250    
251        # Find indices to be updated for the left tree
252        indices.mapping <- .find_indices(x)
253    
254        # Update the DMetaData data frames for the left tree
255        for (i in 1:ncol(update.struct$left.mapping)) {
256            map <- update.struct$left.mapping[,i]
257            DMetaData(x)$MetaID <- replace(DMetaData(x)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
258        }
259    
260        # Find indices to be updated for the right tree
261        indices.mapping <- .find_indices(y)
262    
263        # Update the DMetaData data frames for the right tree
264        for (i in 1:ncol(update.struct$right.mapping)) {
265            map <- update.struct$right.mapping[,i]
266            DMetaData(y)$MetaID <- replace(DMetaData(y)$MetaID, indices.mapping[[as.character(map[1])]], map[2])
267        }
268    
269        # Merge the DMetaData data frames
270        labels <- setdiff(names(DMetaData(y)), names(DMetaData(x)))
271        na.matrix <- matrix(NA,
272                            nrow = nrow(DMetaData(x)),
273                            ncol = length(labels),
274                            dimnames = list(row.names(DMetaData(x)), labels))
275        x.dmeta.aug <- cbind(DMetaData(x), na.matrix)
276        labels <- setdiff(names(DMetaData(x)), names(DMetaData(y)))
277        na.matrix <- matrix(NA,
278                            nrow = nrow(DMetaData(y)),
279                            ncol = length(labels),
280                            dimnames = list(row.names(DMetaData(y)), labels))
281        y.dmeta.aug <- cbind(DMetaData(y), na.matrix)
282        DMetaData(new) <- rbind(x.dmeta.aug, y.dmeta.aug)
283    
284        new
285    }
286    
287    c.Corpus <-
288    function(..., recursive = FALSE)
289    {
290        args <- list(...)
291        x <- args[[1L]]
292    
293        if(length(args) == 1L)
294            return(x)
295    
296        if (!all(unlist(lapply(args, inherits, class(x)))))
297            stop("not all arguments are of the same corpus type")
298    
299        if (inherits(x, "PCorpus"))
300            stop("concatenation of corpora with underlying databases is not supported")
301    
302        if (recursive)
303            Reduce(c2, args)
304        else {
305            args <- do.call("c", lapply(args, unclass))
306            .VCorpus(args,
307                     cmeta = .MetaDataNode(),
308                     dmeta = data.frame(MetaID = rep(0, length(args)),
309                                        stringsAsFactors = FALSE))
310        }
311    }
312    
313    c.TextDocument <-
314    function(..., recursive = FALSE)
315    {
316        args <- list(...)
317        x <- args[[1L]]
318    
319        if(length(args) == 1L)
320            return(x)
321    
322        if (!all(unlist(lapply(args, inherits, class(x)))))
323            stop("not all arguments are text documents")
324    
325        dmeta <- data.frame(MetaID = rep(0, length(args)),
326                            stringsAsFactors = FALSE)
327        .VCorpus(args, .MetaDataNode(), dmeta)
328    }
329    
330    print.Corpus <-
331    function(x, ...)
332    {
333        cat(sprintf(ngettext(length(x),
334                             "A corpus with %d text document\n",
335                             "A corpus with %d text documents\n"),
336                    length(x)))
337        invisible(x)
338    }
339    
340    summary.Corpus <-
341    function(object, ...)
342    {
343        print(object)
344        if (length(DMetaData(object)) > 0) {
345            cat(sprintf(ngettext(length(attr(CMetaData(object), "MetaData")),
346                                 "\nThe metadata consists of %d tag-value pair and a data frame\n",
347                                 "\nThe metadata consists of %d tag-value pairs and a data frame\n"),
348                        length(CMetaData(object)$MetaData)))
349            cat("Available tags are:\n")
350            cat(strwrap(paste(names(CMetaData(object)$MetaData), collapse = " "), indent = 2, exdent = 2), "\n")
351            cat("Available variables in the data frame are:\n")
352            cat(strwrap(paste(names(DMetaData(object)), collapse = " "), indent = 2, exdent = 2), "\n")
353        }
354    }
355    
356    inspect <-
357    function(x)
358        UseMethod("inspect", x)
359    inspect.PCorpus <-
360    function(x)
361    {
362        summary(x)
363        cat("\n")
364        db <- filehash::dbInit(DBControl(x)[["dbName"]], DBControl(x)[["dbType"]])
365        show(filehash::dbMultiFetch(db, unlist(x)))
366    }
367    inspect.VCorpus <-
368    function(x)
369    {
370        summary(x)
371        cat("\n")
372        print(noquote(lapply(x, identity)))
373    }
374    
375    lapply.PCorpus <-
376    function(X, FUN, ...)
377    {
378        db <- filehash::dbInit(DBControl(X)[["dbName"]], DBControl(X)[["dbType"]])
379        lapply(filehash::dbMultiFetch(db, unlist(X)), FUN, ...)
380    }
381    lapply.VCorpus <-
382    function(X, FUN, ...)
383    {
384        lazyTmMap <- meta(X, tag = "lazyTmMap", type = "corpus")
385        if (!is.null(lazyTmMap))
386            .Call("copyCorpus", X, materialize(X))
387        base::lapply(X, FUN, ...)
388    }
389    
390    writeCorpus <-
391    function(x, path = ".", filenames = NULL)
392    {
393        filenames <- file.path(path,
394                               if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", ID(x))))
395                               else filenames)
396        i <- 1
397        for (o in x) {
398            writeLines(as.PlainTextDocument(o), filenames[i])
399            i <- i + 1
400        }
401  }  }

Legend:
Removed from v.32  
changed lines
  Added in v.1258

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge