SCM

SCM Repository

[tm] Diff of /trunk/R/textmin/R/textdoccol.R
ViewVC logotype

Diff of /trunk/R/textmin/R/textdoccol.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 64, Sun Oct 29 14:29:43 2006 UTC revision 65, Tue Oct 31 17:10:24 2006 UTC
# Line 1  Line 1 
1  # Author: Ingo Feinerer  # Author: Ingo Feinerer
2    
3  # ... fuer Argumente des Funktionengenerators  # The "..." are additional arguments for the function_generator parser
4  setGeneric("TextDocCol", function(object, parser = plaintext_parser, ...) standardGeneric("TextDocCol"))  setGeneric("TextDocCol", function(object, parser = plaintext_parser, ...) standardGeneric("TextDocCol"))
5  setMethod("TextDocCol",  setMethod("TextDocCol",
6            signature(object = "Source"),            signature(object = "Source"),
# Line 34  Line 34 
34                    Position = 0, Load = load)                    Position = 0, Load = load)
35            })            })
36    
37  setGeneric("CSVSource", function(file) standardGeneric("CSVSource"))  setGeneric("CSVSource", function(object, isConCall = FALSE) standardGeneric("CSVSource"))
38  setMethod("CSVSource",  setMethod("CSVSource",
39            signature(file = "character"),            signature(object = "character"),
40            function(file) {            function(object, isConCall = FALSE) {
41                new("CSVSource", LoDSupport = FALSE, FileName = file,                if (!isConCall)
42                    Content = scan(file, what = "character"), Position = 0)                    object <- paste('file("', object, '")', sep = "")
43                  con <- eval(parse(text = object))
44                  content <- scan(con, what = "character")
45                  close(con)
46                  new("CSVSource", LoDSupport = FALSE, URI = object,
47                      Content = content, Position = 0)
48            })            })
49    
50  setGeneric("Reuters21578XMLSource", function(file) standardGeneric("Reuters21578XMLSource"))  setGeneric("ReutersSource", function(object, isConCall = FALSE) standardGeneric("ReutersSource"))
51  setMethod("Reuters21578XMLSource",  setMethod("ReutersSource",
52            signature(file = "character"),            signature(object = "character"),
53            function(file) {            function(object, isConCall = FALSE) {
54                tree <- xmlTreeParse(file)                if (!isConCall)
55                     object <- paste('file("', object, '")', sep = "")
56                  con <- eval(parse(text = object))
57                  corpus <- paste(readLines(con), "\n", collapse = "")
58                  close(con)
59                  tree <- xmlTreeParse(corpus, asText = TRUE)
60                content <- xmlRoot(tree)$children                content <- xmlRoot(tree)$children
61                new("Reuters21578XMLSource", LoDSupport = FALSE, FileName = file,  
62                  new("ReutersSource", LoDSupport = FALSE, URI = object,
63                    Content = content, Position = 0)                    Content = content, Position = 0)
64            })            })
65    
# Line 66  Line 77 
77                object                object
78            })            })
79  setMethod("stepNext",  setMethod("stepNext",
80            signature(object = "Reuters21578XMLSource"),            signature(object = "ReutersSource"),
81            function(object) {            function(object) {
82                object@Position <- object@Position + 1                object@Position <- object@Position + 1
83                object                object
# Line 77  Line 88 
88            signature(object = "DirSource"),            signature(object = "DirSource"),
89            function(object) {            function(object) {
90                list(content = readLines(object@FileList[object@Position]),                list(content = readLines(object@FileList[object@Position]),
91                     filename = object@FileList[object@Position])                     uri = paste('file("', object@FileList[object@Position], '")', sep = ""))
92            })            })
93  setMethod("getElem",  setMethod("getElem",
94            signature(object = "CSVSource"),            signature(object = "CSVSource"),
95            function(object) {            function(object) {
96                list(content = object@Content[object@Position],                list(content = object@Content[object@Position],
97                     filename = object@FileName)                     uri = object@URI)
98            })            })
99  setMethod("getElem",  setMethod("getElem",
100            signature(object = "Reuters21578XMLSource"),            signature(object = "ReutersSource"),
101            function(object) {            function(object) {
102                list(content = object@Content[object@Position],                # Construct a character representation from the XMLNode
103                     filename = object@FileName)                con <- textConnection("virtual.file", "w")
104                  saveXML(object@Content[[object@Position]], con)
105                  close(con)
106    
107                  list(content = virtual.file, uri = object@URI)
108            })            })
109    
110  setGeneric("eoi", function(object) standardGeneric("eoi"))  setGeneric("eoi", function(object) standardGeneric("eoi"))
# Line 110  Line 125 
125                    return(FALSE)                    return(FALSE)
126            })            })
127  setMethod("eoi",  setMethod("eoi",
128            signature(object = "Reuters21578XMLSource"),            signature(object = "ReutersSource"),
129            function(object) {            function(object) {
130                if (length(object@Content) <= object@Position)                if (length(object@Content) <= object@Position)
131                    return(TRUE)                    return(TRUE)
# Line 121  Line 136 
136  plaintext_parser <- function(...) {  plaintext_parser <- function(...) {
137      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
138          if (!lodsupport || (lodsupport && load)) {          if (!lodsupport || (lodsupport && load)) {
139              doc <- new("PlainTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE,              doc <- new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
140                         Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")
141          }          }
142          else {          else {
143              doc <- new("PlainTextDocument", FileName = elem$filename, Cached = FALSE,              doc <- new("PlainTextDocument", URI = elem$uri, Cached = FALSE,
144                         Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")                         Author = "", DateTimeStamp = date(), Description = "", ID = id, Origin = "", Heading = "")
145          }          }
146    
# Line 136  Line 151 
151    
152  reuters21578xml_parser <- function(...) {  reuters21578xml_parser <- function(...) {
153      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
154          tree <- xmlTreeParse(elem$filename)          corpus <- paste(elem$content, "\n", collapse = "")
155            tree <- xmlTreeParse(corpus, asText = TRUE)
156          node <- xmlRoot(tree)          node <- xmlRoot(tree)
157    
158            # Mask as list to bypass S4 checks
159            class(tree) <- "list"
160    
161          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!          # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
162          if (!is.null(node[["TEXT"]][["AUTHOR"]]))          if (!is.null(node[["TEXT"]][["AUTHOR"]]))
163              author <- xmlValue(node[["TEXT"]][["AUTHOR"]])              author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 158  Line 177 
177          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)          topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
178    
179          if (!lodsupport || (lodsupport && load)) {          if (!lodsupport || (lodsupport && load)) {
180              doc <- new("XMLTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE, Author = author,              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = author,
181                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
182                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
183          } else {          } else {
184              doc <- new("XMLTextDocument", FileName = elem$filename, Cached = FALSE, Author = author,              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = author,
185                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters-21578 XML",
186                         Heading = heading, LocalMetaData = list(Topics = topics))                         Heading = heading, LocalMetaData = list(Topics = topics))
187          }          }
# Line 174  Line 193 
193    
194  rcv1_parser <- function(...) {  rcv1_parser <- function(...) {
195      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
196          tree <- xmlTreeParse(elem$filename)          corpus <- paste(elem$content, "\n", collapse = "")
197            tree <- xmlTreeParse(corpus, asText = TRUE)
198          node <- xmlRoot(tree)          node <- xmlRoot(tree)
199    
200            # Mask as list to bypass S4 checks
201            class(tree) <- "list"
202    
203          datetimestamp <- xmlAttrs(node)[["date"]]          datetimestamp <- xmlAttrs(node)[["date"]]
204          id <- xmlAttrs(node)[["itemid"]]          id <- xmlAttrs(node)[["itemid"]]
205          heading <- xmlValue(node[["title"]])          heading <- xmlValue(node[["title"]])
206    
207          if (!lodsupport || (lodsupport && load)) {          if (!lodsupport || (lodsupport && load)) {
208              doc <- new("XMLTextDocument", .Data = elem$content, FileName = elem$filename, Cached = TRUE, Author = "",              doc <- new("XMLTextDocument", .Data = tree, URI = elem$uri, Cached = TRUE, Author = "",
209                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
210                         Heading = heading)                         Heading = heading)
211          } else {          } else {
212              doc <- new("XMLTextDocument", FileName = elem$filename, Cached = FALSE, Author = "",              doc <- new("XMLTextDocument", URI = elem$uri, Cached = FALSE, Author = "",
213                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",                         DateTimeStamp = datetimestamp, Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML",
214                         Heading = heading)                         Heading = heading)
215          }          }
# Line 198  Line 221 
221    
222  uci_kdd_newsgroup_parser <- function(...) {  uci_kdd_newsgroup_parser <- function(...) {
223      function(elem, lodsupport, load, id) {      function(elem, lodsupport, load, id) {
224          mail <- readLines(elem$filename)          mail <- elem$content
225          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
226          datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))          datetimestamp <- gsub("Date: ", "", grep("^Date:", mail, value = TRUE))
227          origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))          origin <- gsub("Path: ", "", grep("^Path:", mail, value = TRUE))
# Line 206  Line 229 
229          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))          newsgroup <- gsub("Newsgroups: ", "", grep("^Newsgroups:", mail, value = TRUE))
230    
231          if (!lodsupport || (lodsupport && load)) {          if (!lodsupport || (lodsupport && load)) {
232              index <- grep("^Lines:", mail)              # The header is separated from the body by a blank line.
233                # Reference: \url{http://en.wikipedia.org/wiki/E-mail#Internet_e-mail_format}
234                for (index in seq(along = mail)) {
235                    if (mail[index] == "")
236                        break
237                }
238              content <- mail[(index + 1):length(mail)]              content <- mail[(index + 1):length(mail)]
239    
240              doc <- new("NewsgroupDocument", .Data = content, FileName = elem$filename, Cached = TRUE,              doc <- new("NewsgroupDocument", .Data = content, URI = elem$uri, Cached = TRUE,
241                         Author = author, DateTimeStamp = datetimestamp,                         Author = author, DateTimeStamp = datetimestamp,
242                         Description = "", ID = elem$filename, Origin = origin,                         Description = "", ID = id, Origin = origin,
243                         Heading = heading, Newsgroup = newsgroup)                         Heading = heading, Newsgroup = newsgroup)
244          } else {          } else {
245              doc <- new("NewsgroupDocument", FileName = elem$filename, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,              doc <- new("NewsgroupDocument", URI = elem$uri, Cached = FALSE, Author = author, DateTimeStamp = datetimestamp,
246                         Description = "", ID = elem$filename, Origin = origin, Heading = heading, Newsgroup = newsgroup)                         Description = "", ID = id, Origin = origin, Heading = heading, Newsgroup = newsgroup)
247          }          }
248    
249          return(doc)          return(doc)
# Line 224  Line 252 
252  class(uci_kdd_newsgroup_parser) <- "function_generator"  class(uci_kdd_newsgroup_parser) <- "function_generator"
253    
254  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file  # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
255  rcv1_to_plain <- function(node) {  rcv1_to_plain <- function(node, ...) {
256      datetimestamp <- xmlAttrs(node)[["date"]]      datetimestamp <- xmlAttrs(node)[["date"]]
257      id <- xmlAttrs(node)[["itemid"]]      id <- xmlAttrs(node)[["itemid"]]
258      origin <- "Reuters Corpus Volume 1 XML"      origin <- "Reuters Corpus Volume 1 XML"
259      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)      corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
260      heading <- xmlValue(node[["title"]])      heading <- xmlValue(node[["title"]])
261    
262      new("PlainTextDocument", .Data = corpus, Author = "", DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = "", DateTimeStamp = datetimestamp,
263          Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)          Description = "", ID = id, Origin = "Reuters Corpus Volume 1 XML", Heading = heading)
264  }  }
265    
266  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file  # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
267  reuters21578xml_to_plain <- function(node) {  reuters21578xml_to_plain <- function(node, ...) {
268      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!      # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
269      if (!is.null(node[["TEXT"]][["AUTHOR"]]))      if (!is.null(node[["TEXT"]][["AUTHOR"]]))
270          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])          author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
# Line 263  Line 291 
291    
292      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)      topics <- unlist(xmlApply(node[["TOPICS"]], function(x) xmlValue(x)), use.names = FALSE)
293    
294      new("PlainTextDocument", .Data = corpus, Cached = TRUE, Author = author, DateTimeStamp = datetimestamp,      new("PlainTextDocument", .Data = corpus, Cached = TRUE, URI = "", Author = author, DateTimeStamp = datetimestamp,
295          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))          Description = description, ID = id, Origin = origin, Heading = heading, LocalMetaData = list(Topics = topics))
296  }  }
297    
298  setGeneric("loadFileIntoMem", function(object) standardGeneric("loadFileIntoMem"))  setGeneric("loadFileIntoMem", function(object, ...) standardGeneric("loadFileIntoMem"))
299  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
300            signature(object = "PlainTextDocument"),            signature(object = "PlainTextDocument"),
301            function(object) {            function(object, ...) {
302                if (!Cached(object)) {                if (!Cached(object)) {
303                    corpus <- readLines(FileName(object))                    con <- eval(parse(text = URI(object)))
304                      corpus <- readLines(con)
305                      close(con)
306                    Corpus(object) <- corpus                    Corpus(object) <- corpus
307                    Cached(object) <- TRUE                    Cached(object) <- TRUE
308                    return(object)                    return(object)
# Line 282  Line 312 
312            })            })
313  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
314            signature(object =  "XMLTextDocument"),            signature(object =  "XMLTextDocument"),
315            function(object) {            function(object, ...) {
316                if (!Cached(object)) {                if (!Cached(object)) {
317                    file <- FileName(object)                    con <- eval(parse(text = URI(object)))
318                    doc <- xmlTreeParse(file)                    corpus <- paste(readLines(con), "\n", collapse = "")
319                      close(con)
320                      doc <- xmlTreeParse(corpus, asText = TRUE)
321                    class(doc) <- "list"                    class(doc) <- "list"
322                    Corpus(object) <- doc                    Corpus(object) <- doc
323                    Cached(object) <- TRUE                    Cached(object) <- TRUE
# Line 296  Line 328 
328            })            })
329  setMethod("loadFileIntoMem",  setMethod("loadFileIntoMem",
330            signature(object = "NewsgroupDocument"),            signature(object = "NewsgroupDocument"),
331            function(object) {            function(object, ...) {
332                if (!Cached(object)) {                if (!Cached(object)) {
333                    mail <- readLines(FileName(object))                    con <- eval(parse(text = URI(object)))
334                      mail <- readLines(con)
335                      close(con)
336                    Cached(object) <- TRUE                    Cached(object) <- TRUE
337                    index <- grep("^Lines:", mail)                    for (index in seq(along = mail)) {
338                          if (mail[index] == "")
339                              break
340                      }
341                    Corpus(object) <- mail[(index + 1):length(mail)]                    Corpus(object) <- mail[(index + 1):length(mail)]
342                    return(object)                    return(object)
343                } else {                } else {
# Line 479  Line 516 
516                    return(x)                    return(x)
517                return(as(c(as(x, "list"), ...), "TextDocCol"))                return(as(c(as(x, "list"), ...), "TextDocCol"))
518      })      })
519    setMethod("c",
520              signature(x = "TextDocument"),
521              function(x, ..., recursive = TRUE){
522                  args <- list(...)
523                  if(length(args) == 0)
524                      return(x)
525                  return(new("TextDocCol", .Data = list(x, ...)))
526        })
527    
528  setMethod("length",  setMethod("length",
529            signature(x = "TextDocCol"),            signature(x = "TextDocCol"),
# Line 513  Line 558 
558    
559  setGeneric("inspect", function(object) standardGeneric("inspect"))  setGeneric("inspect", function(object) standardGeneric("inspect"))
560  setMethod("inspect",  setMethod("inspect",
561            c("TextDocCol"),            signature("TextDocCol"),
562            function(object) {            function(object) {
563                summary(object)                summary(object)
564                cat("\n")                cat("\n")
565                show(as(object, "list"))                show(as(object, "list"))
566            })            })
567    
568    # No metadata is checked
569    setGeneric("%IN%", function(x, y) standardGeneric("%IN%"))
570    setMethod("%IN%",
571              signature(x = "TextDocument", y = "TextDocCol"),
572              function(x, y) {
573                  x %in% y
574              })

Legend:
Removed from v.64  
changed lines
  Added in v.65

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge