SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

trunk/R/trunk/R/textdoccol.R revision 18, Sat Nov 5 19:00:05 2005 UTC pkg/R/corpus.R revision 1383, Thu May 29 07:32:14 2014 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # S4 class definition  PCorpus <-
4  # Text document collection  function(x,
5  # TODO: Define proper S4 term-document matrix           readerControl = list(reader = reader(x), language = "en"),
6  setClass("textdoccol", representation(docs = "list",           dbControl = list(dbName = "", dbType = "DB1"))
7                                        tdm = "matrix"))  {
8        stopifnot(inherits(x, "Source"))
9  # Accessor function  
10  if (!isGeneric("docs")) {      readerControl <- prepareReader(readerControl, reader(x))
11      if (is.function("docs"))  
12          fun <- docs      if (!filehash::dbCreate(dbControl$dbName, dbControl$dbType))
13            stop("error in creating database")
14        db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
15    
16        tdl <- vector("list", length(x))
17        counter <- 1
18        while (!eoi(x)) {
19            x <- stepNext(x)
20            elem <- getElem(x)
21            doc <- readerControl$reader(elem,
22                                        readerControl$language,
23                                        as.character(counter))
24            filehash::dbInsert(db, meta(doc, "id"), doc)
25            tdl[[counter]] <- meta(doc, "id")
26            counter <- counter + 1
27        }
28    
29        structure(list(content = tdl,
30                       meta = CorpusMeta(),
31                       dmeta = data.frame(row.names = seq_along(tdl)),
32                       dbcontrol = dbControl),
33                  class = c("PCorpus", "Corpus"))
34    }
35    
36    Corpus <-
37    VCorpus <-
38    function(x, readerControl = list(reader = reader(x), language = "en"))
39    {
40        stopifnot(inherits(x, "Source"))
41    
42        readerControl <- prepareReader(readerControl, reader(x))
43    
44        tdl <- vector("list", length(x))
45        # Check for parallel element access
46        if (is.function(getS3method("pGetElem", class(x), TRUE)))
47            tdl <- mapply(function(elem, id)
48                              readerControl$reader(elem, readerControl$language, id),
49                          pGetElem(x),
50                          id = as.character(seq_along(x)),
51                          SIMPLIFY = FALSE)
52        else {
53            counter <- 1
54            while (!eoi(x)) {
55                x <- stepNext(x)
56                elem <- getElem(x)
57                doc <- readerControl$reader(elem,
58                                            readerControl$language,
59                                            as.character(counter))
60                tdl[[counter]] <- doc
61                counter <- counter + 1
62            }
63        }
64    
65        structure(list(content = tdl,
66                       meta = CorpusMeta(),
67                       dmeta = data.frame(row.names = seq_along(tdl))),
68                  class = c("VCorpus", "Corpus"))
69    }
70    
71    `[.PCorpus` <-
72    function(x, i)
73    {
74        if (!missing(i)) {
75            x$content <- x$content[i]
76            x$dmeta <- x$dmeta[i, , drop = FALSE]
77        }
78        x
79    }
80    
81    `[.VCorpus` <-
82    function(x, i)
83    {
84        if (!missing(i)) {
85            x$content <- x$content[i]
86            x$dmeta <- x$dmeta[i, , drop = FALSE]
87            if (!is.null(x$lazy))
88                x$lazy$index <- x$lazy$index[i]
89        }
90        x
91    }
92    
93    .map_name_index <-
94    function(x, i)
95    {
96        if (is.character(i))
97            match(i, meta(x, "id", "local"))
98      else      else
99          fun <- function(object) standardGeneric("docs")          i
100      setGeneric("docs", fun)  }
101    
102    `[[.PCorpus` <-
103    function(x, i)
104    {
105        i <- .map_name_index(x, i)
106        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
107        filehash::dbFetch(db, x$content[[i]])
108    }
109    `[[.VCorpus` <-
110    function(x, i)
111    {
112        i <- .map_name_index(x, i)
113        if (!is.null(x$lazy))
114            .Call(copyCorpus, x, materialize(x, i))
115        x$content[[i]]
116  }  }
 setMethod("docs", "textdoccol", function(object) object@docs)  
117    
118  setGeneric("textdoccol", function(docs) standardGeneric("textdoccol"))  `[[<-.PCorpus` <-
119  # Read in XML text documents  function(x, i, value)
120  # Reuters Corpus Volume 1 (RCV1)  {
121  setMethod("textdoccol", "character", function(docs) {      i <- .map_name_index(x, i)
122      require(XML)      db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
123        db[[x$content[[i]]]] <- value
124      tree <- xmlTreeParse(docs)      x
125      root <- xmlRoot(tree)  }
126    `[[<-.VCorpus` <-
127      # TODO: At each loop node points to the current newsitem  function(x, i, value)
128      node <- root  {
129        i <- .map_name_index(x, i)
130      # TODO: Implement lacking fields.      # Mark new objects as inactive for lazy mapping
131      # For this we need the full RCV1 XML set to know where to find those things      if (!is.null(x$lazy))
132      author <- "Not yet implemented"          x$lazy$index[i] <- FALSE
133      timestamp <- xmlAttrs(node)[["date"]]      x$content[[i]] <- value
134      description <- "Not yet implemented"      x
135      id <- as.integer(xmlAttrs(node)[["itemid"]])  }
136      origin <- "Not yet implemented"  
137      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)  as.list.PCorpus <- as.list.VCorpus <-
138    function(x, ...)
139        content(x)
140    
141    as.VCorpus <-
142    function(x)
143        UseMethod("as.VCorpus")
144    as.VCorpus.VCorpus <- identity
145    
146    outer_union <-
147    function(x, y, ...)
148    {
149        if (nrow(x) > 0L)
150            x[, setdiff(names(y), names(x))] <- NA
151        if (nrow(y) > 0L)
152            y[, setdiff(names(x), names(y))] <- NA
153        res <- rbind(x, y)
154        if (ncol(res) == 0L)
155            res <- data.frame(row.names = seq_len(nrow(x) + nrow(y)))
156        res
157    }
158    
159    c.VCorpus <-
160    function(..., recursive = FALSE)
161    {
162        args <- list(...)
163        x <- args[[1L]]
164    
165        if (length(args) == 1L)
166            return(x)
167    
168        if (!all(unlist(lapply(args, inherits, class(x)))))
169            stop("not all arguments are of the same corpus type")
170    
171        structure(list(content = do.call("c", lapply(args, content)),
172                       meta = CorpusMeta(meta = do.call("c",
173                         lapply(args, function(a) meta(a, type = "corpus")))),
174                       dmeta = Reduce(outer_union, lapply(args, meta))),
175                  class = c("VCorpus", "Corpus"))
176    }
177    
178    content.VCorpus <-
179    function(x)
180    {
181        if (!is.null(x$lazy))
182            .Call(copyCorpus, x, materialize(x))
183        x$content
184    }
185    
186    content.PCorpus <-
187    function(x)
188    {
189        db <- filehash::dbInit(x$dbcontrol[["dbName"]], x$dbcontrol[["dbType"]])
190        filehash::dbMultiFetch(db, unlist(x$content))
191    }
192    
193      heading <- xmlValue(node[["title"]])  inspect <-
194    function(x)
195        UseMethod("inspect", x)
196    inspect.PCorpus <- inspect.VCorpus <-
197    function(x)
198    {
199        print(x)
200        cat("\n")
201        print(noquote(content(x)))
202        invisible(x)
203    }
204    
205      doc <- new("textdocument", author = author, timestamp = timestamp, description = description,  length.PCorpus <- length.VCorpus <-
206                 id = id, origin = origin, corpus = corpus, heading = heading)  function(x)
207        length(x$content)
208    
209    names.PCorpus <- names.VCorpus <-
210    function(x)
211        as.character(meta(x, "id", "local"))
212    
213    `names<-.PCorpus` <- `names<-.VCorpus` <-
214    function(x, value)
215    {
216        meta(x, "id", "local") <- as.character(value)
217        x
218    }
219    
220    print.PCorpus <- print.VCorpus <-
221    function(x, ...)
222    {
223        writeLines(sprintf("<<%s (documents: %d, metadata (corpus/indexed): %d/%d)>>",
224                           class(x)[1],
225                           length(x),
226                           length(meta(x, type = "corpus")),
227                           ncol(meta(x, type = "indexed"))))
228        invisible(x)
229    }
230    
231      new("textdoccol", docs = list(doc), tdm = matrix())  writeCorpus <-
232  })  function(x, path = ".", filenames = NULL)
233    {
234        filenames <- file.path(path,
235          if (is.null(filenames))
236              sprintf("%s.txt", as.character(meta(x, "id", "local")))
237          else filenames)
238    
239        stopifnot(length(x) == length(filenames))
240    
241        mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
242    
243        invisible(x)
244    }

Legend:
Removed from v.18  
changed lines
  Added in v.1383

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge