SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 32 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 :     # S4 class definition
4 :     # Text document collection
5 : feinerer 21 setClass("textdoccol",
6 :     contains = c("list"))
7 : feinerer 17
8 : feinerer 21 # Constructors
9 :    
10 :     setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
11 :     setMethod("textdoccol",
12 : feinerer 32 c("character", "character", "logical", "logical"),
13 :     function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {
14 : feinerer 18
15 : feinerer 22 # Add a new type for each unique input source format
16 : feinerer 23 type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
17 : feinerer 22 switch(type,
18 :     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
19 : feinerer 24 # For the moment the first argument is still a single file
20 :     # This will be changed to a directory as soon as we have the full RCV1 data set
21 : feinerer 22 "RCV1" = {
22 :     tree <- xmlTreeParse(object)
23 :     tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
24 :     },
25 : feinerer 24 # Text in a special CSV format (as e.g. exported from an Excel sheet)
26 :     # For details on the file format see data/Umfrage.csv
27 :     # The first argument has to be a single file
28 : feinerer 22 "CSV" = {
29 :     m <- as.matrix(read.csv(object))
30 :     l <- vector("list", dim(m)[1])
31 :     for (i in 1:dim(m)[1]) {
32 :     author <- "Not yet implemented"
33 :     timestamp <- date()
34 :     description <- "Not yet implemented"
35 :     id <- i
36 :     corpus <- as.character(m[i,2:dim(m)[2]])
37 :     if (stripWhiteSpace)
38 :     corpus <- gsub("[[:space:]]+", " ", corpus)
39 :     if (toLower)
40 :     corpus <- tolower(corpus)
41 :     origin <- "Not yet implemented"
42 :     heading <- "Not yet implemented"
43 :    
44 :     l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
45 :     description = description, id = id, origin = origin, heading = heading)
46 :     }
47 :     tdcl <- new("textdoccol", .Data = l)
48 : feinerer 23 },
49 :     # Read in text documents in Reuters-21578 XML (not SGML) format
50 :     # Typically the first argument will be a directory where we can
51 :     # find the files reut2-000.xml ... reut2-021.xml
52 :     "REUT21578" = {
53 : feinerer 24 tdl <- sapply(dir(object,
54 :     pattern = ".xml",
55 :     full.names = TRUE),
56 :     function(file) {
57 :     tree <- xmlTreeParse(file)
58 :     xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
59 :     })
60 : feinerer 22
61 : feinerer 24 tdcl <- new("textdoccol", .Data = tdl)
62 :     })
63 : feinerer 21 tdcl
64 :     })
65 :    
66 : feinerer 23 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
67 : feinerer 21 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
68 : feinerer 17 author <- "Not yet implemented"
69 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
70 : feinerer 17 description <- "Not yet implemented"
71 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
72 :     origin <- "Not yet implemented"
73 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
74 : feinerer 21
75 :     if (stripWhiteSpace)
76 :     corpus <- gsub("[[:space:]]+", " ", corpus)
77 :     if (toLower)
78 :     corpus <- tolower(corpus)
79 :    
80 : feinerer 18 heading <- xmlValue(node[["title"]])
81 : feinerer 17
82 : feinerer 21 new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
83 :     description = description, id = id, origin = origin, heading = heading)
84 : feinerer 19 }
85 : feinerer 23
86 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
87 :     parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
88 :     author <- "Not yet implemented"
89 :     timestamp <- xmlValue(node[["DATE"]])
90 :     description <- "Not yet implemented"
91 :     id <- as.integer(xmlAttrs(node)[["NEWID"]])
92 :    
93 :     origin <- "Not yet implemented"
94 :    
95 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
96 :     if (!is.null(node[["TEXT"]][["BODY"]]))
97 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
98 :     else
99 :     corpus <- ""
100 :    
101 :     if (stripWhiteSpace)
102 :     corpus <- gsub("[[:space:]]+", " ", corpus)
103 :     if (toLower)
104 :     corpus <- tolower(corpus)
105 :    
106 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
107 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
108 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
109 :     else
110 :     heading <- ""
111 :    
112 :     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
113 :     description = description, id = id, origin = origin, heading = heading)
114 :     }

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge