SCM

SCM Repository

[tm] Diff of /trunk/tm/R/reader.R
ViewVC logotype

Diff of /trunk/tm/R/reader.R

Parent Directory | Revision Log | View Patch

revision 776, Sun Jul 29 15:27:41 2007 UTC | revision 777, Tue Aug 28 07:19:12 2007 UTC
# Line 2  Line 2 
2    
3  # Reader  # Reader
4    
5  readPlain <- function(...) {  readPlain <- FunctionGenerator(function(...) {
6      function(elem, load, language, id) {      function(elem, load, language, id) {
7          doc <- if (load) {          doc <- if (load) {
8              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,              new("PlainTextDocument", .Data = elem$content, URI = elem$uri, Cached = TRUE,
# Line 15  Line 15 
15    
16          return(doc)          return(doc)
17      }      }
18  }  })
 attr(readPlain, "FunctionGenerator") <- TRUE  
19    
20  readReut21578XML <- function(...) {  readReut21578XML <- FunctionGenerator(function(...) {
21      function(elem, load, language, id) {      function(elem, load, language, id) {
22          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
23          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 56  Line 55 
55    
56          return(doc)          return(doc)
57      }      }
58  }  })
 attr(readReut21578XML, "FunctionGenerator") <- TRUE  
59    
60  readRCV1 <- function(...) {  readRCV1 <- FunctionGenerator(function(...) {
61      function(elem, load, language, id) {      function(elem, load, language, id) {
62          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
63          tree <- xmlTreeParse(corpus, asText = TRUE)          tree <- xmlTreeParse(corpus, asText = TRUE)
# Line 84  Line 82 
82    
83          return(doc)          return(doc)
84      }      }
85  }  })
 attr(readRCV1, "FunctionGenerator") <- TRUE  
86    
87  readNewsgroup <- function(...) {  readNewsgroup <- FunctionGenerator(function(...) {
88      function(elem, load, language, id) {      function(elem, load, language, id) {
89          mail <- elem$content          mail <- elem$content
90          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))          author <- gsub("From: ", "", grep("^From:", mail, value = TRUE))
# Line 116  Line 113 
113    
114          return(doc)          return(doc)
115      }      }
116  }  })
 attr(readNewsgroup, "FunctionGenerator") <- TRUE  
117    
118  readGmane <- function(...) {  readGmane <- FunctionGenerator(function(...) {
119      function(elem, load, language, id) {      function(elem, load, language, id) {
120          corpus <- paste(elem$content, "\n", collapse = "")          corpus <- paste(elem$content, "\n", collapse = "")
121          # Remove namespaces          # Remove namespaces
# Line 149  Line 145 
145    
146          return(doc)          return(doc)
147      }      }
148  }  })
 attr(readGmane, "FunctionGenerator") <- TRUE  
149    
150  # readPDF needs pdftotext and pdfinfo installed to be able to extract the text and meta information  # readPDF needs pdftotext and pdfinfo installed to be able to extract the text and meta information
151  readPDF <- function(...) {  readPDF <- FunctionGenerator(function(...) {
152      function(elem, load, language, id) {      function(elem, load, language, id) {
153          meta <- system(paste("pdfinfo", shQuote(as.character(elem$uri[2]))), intern = TRUE)          meta <- system(paste("pdfinfo", shQuote(as.character(elem$uri[2]))), intern = TRUE)
154          heading <- gsub("Title:[[:space:]]*", "", grep("Title:", meta, value = TRUE))          heading <- gsub("Title:[[:space:]]*", "", grep("Title:", meta, value = TRUE))
# Line 172  Line 167 
167              Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,              Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,
168              Origin = origin, Heading = heading, Language = language)              Origin = origin, Heading = heading, Language = language)
169      }      }
170  }  })
 attr(readPDF, "FunctionGenerator") <- TRUE  
171    
172  readHTML <- function(...) {  readHTML <- FunctionGenerator(function(...) {
173      function(elem, load, language, id) {      function(elem, load, language, id) {
174          tree <- xmlTreeParse(elem$content, asText = TRUE)          tree <- xmlTreeParse(elem$content, asText = TRUE)
175          root <- xmlRoot(tree)          root <- xmlRoot(tree)
# Line 214  Line 208 
208              Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,              Author = author, DateTimeStamp = datetimestamp, Description = description, ID = id,
209              Origin = origin, Heading = heading, Language = language)              Origin = origin, Heading = heading, Language = language)
210      }      }
211  }  })
 attr(readHTML, "FunctionGenerator") <- TRUE  
212    
213  # Converter  # Converter
214    

Legend:
Removed from v.776  
changed lines
  Added in v.777

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business | University of Wisconsin - Madison | Powered By FusionForge