SCM Repository
Annotation of /pkg/R/corpus.R
Parent Directory
|
Revision Log
Revision 23 -
(view)
(download)
Original Path: trunk/R/trunk/R/textdoccol.R
1 : | feinerer | 17 | # Author: Ingo Feinerer |
2 : | |||
3 : | # S4 class definition | ||
4 : | # Text document collection | ||
5 : | feinerer | 21 | setClass("textdoccol", |
6 : | representation(tdm = "termdocmatrix"), | ||
7 : | contains = c("list")) | ||
8 : | feinerer | 17 | |
9 : | feinerer | 21 | # Accessor functions as described in "S4 Classes in 15 pages, more or less" |
10 : | |||
11 : | if (!isGeneric("tdm")) { | ||
12 : | if (is.function("tdm")) | ||
13 : | fun <- tdm | ||
14 : | feinerer | 17 | else |
15 : | feinerer | 21 | fun <- function(object) standardGeneric("tdm") |
16 : | setGeneric("tdm", fun) | ||
17 : | feinerer | 17 | } |
18 : | feinerer | 21 | setMethod("tdm", "textdoccol", function(object) object@tdm) |
19 : | feinerer | 17 | |
20 : | feinerer | 21 | # Constructors |
21 : | |||
22 : | setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol")) | ||
23 : | setMethod("textdoccol", | ||
24 : | feinerer | 22 | c("character", "character", "logical", "logical", "character", |
25 : | "logical", "character", "integer", "integer", "logical"), | ||
26 : | function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE, weighting = "tf", | ||
27 : | stemming = FALSE, language = "english", minWordLength = 3, minDocFreq = 1, stopwords = NULL) { | ||
28 : | feinerer | 18 | |
29 : | feinerer | 22 | # Add a new type for each unique input source format |
30 : | feinerer | 23 | type <- match.arg(inputType,c("RCV1","CSV","REUT21578")) |
31 : | feinerer | 22 | switch(type, |
32 : | # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format | ||
33 : | "RCV1" = { | ||
34 : | require(XML) | ||
35 : | feinerer | 17 | |
36 : | feinerer | 22 | tree <- xmlTreeParse(object) |
37 : | tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower)) | ||
38 : | }, | ||
39 : | # Text in CSV format (as e.g. exported from an Excel sheet) | ||
40 : | "CSV" = { | ||
41 : | m <- as.matrix(read.csv(object)) | ||
42 : | l <- vector("list", dim(m)[1]) | ||
43 : | for (i in 1:dim(m)[1]) { | ||
44 : | author <- "Not yet implemented" | ||
45 : | timestamp <- date() | ||
46 : | description <- "Not yet implemented" | ||
47 : | id <- i | ||
48 : | corpus <- as.character(m[i,2:dim(m)[2]]) | ||
49 : | if (stripWhiteSpace) | ||
50 : | corpus <- gsub("[[:space:]]+", " ", corpus) | ||
51 : | if (toLower) | ||
52 : | corpus <- tolower(corpus) | ||
53 : | origin <- "Not yet implemented" | ||
54 : | heading <- "Not yet implemented" | ||
55 : | |||
56 : | l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp, | ||
57 : | description = description, id = id, origin = origin, heading = heading) | ||
58 : | } | ||
59 : | tdcl <- new("textdoccol", .Data = l) | ||
60 : | feinerer | 23 | }, |
61 : | # Read in text documents in Reuters-21578 XML (not SGML) format | ||
62 : | # Typically the first argument will be a directory where we can | ||
63 : | # find the files reut2-000.xml ... reut2-021.xml | ||
64 : | "REUT21578" = { | ||
65 : | require(XML) | ||
66 : | |||
67 : | # TODO: Read in a whole directory of XML files | ||
68 : | # lapply(dir(object, full.names = TRUE), function) | ||
69 : | # object is for the moment still a single XML file | ||
70 : | tree <- xmlTreeParse(object) | ||
71 : | tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)) | ||
72 : | feinerer | 22 | } |
73 : | ) | ||
74 : | |||
75 : | tdcl@tdm <- termdocmatrix(tdcl, weighting, stemming, language, minWordLength, minDocFreq, stopwords) | ||
76 : | |||
77 : | feinerer | 21 | tdcl |
78 : | }) | ||
79 : | |||
80 : | feinerer | 23 | # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file |
81 : | feinerer | 21 | parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) { |
82 : | feinerer | 17 | author <- "Not yet implemented" |
83 : | feinerer | 18 | timestamp <- xmlAttrs(node)[["date"]] |
84 : | feinerer | 17 | description <- "Not yet implemented" |
85 : | id <- as.integer(xmlAttrs(node)[["itemid"]]) | ||
86 : | origin <- "Not yet implemented" | ||
87 : | feinerer | 18 | corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE) |
88 : | feinerer | 21 | |
89 : | if (stripWhiteSpace) | ||
90 : | corpus <- gsub("[[:space:]]+", " ", corpus) | ||
91 : | if (toLower) | ||
92 : | corpus <- tolower(corpus) | ||
93 : | |||
94 : | feinerer | 18 | heading <- xmlValue(node[["title"]]) |
95 : | feinerer | 17 | |
96 : | feinerer | 21 | new("textdocument", .Data = corpus, author = author, timestamp = timestamp, |
97 : | description = description, id = id, origin = origin, heading = heading) | ||
98 : | feinerer | 19 | } |
99 : | feinerer | 23 | |
100 : | # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file | ||
101 : | parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) { | ||
102 : | author <- "Not yet implemented" | ||
103 : | timestamp <- xmlValue(node[["DATE"]]) | ||
104 : | description <- "Not yet implemented" | ||
105 : | id <- as.integer(xmlAttrs(node)[["NEWID"]]) | ||
106 : | |||
107 : | origin <- "Not yet implemented" | ||
108 : | |||
109 : | # The <BODY></BODY> tag is unfortunately NOT obligatory! | ||
110 : | if (!is.null(node[["TEXT"]][["BODY"]])) | ||
111 : | corpus <- xmlValue(node[["TEXT"]][["BODY"]]) | ||
112 : | else | ||
113 : | corpus <- "" | ||
114 : | |||
115 : | if (stripWhiteSpace) | ||
116 : | corpus <- gsub("[[:space:]]+", " ", corpus) | ||
117 : | if (toLower) | ||
118 : | corpus <- tolower(corpus) | ||
119 : | |||
120 : | # The <TITLE></TITLE> tag is unfortunately NOT obligatory! | ||
121 : | if (!is.null(node[["TEXT"]][["TITLE"]])) | ||
122 : | heading <- xmlValue(node[["TEXT"]][["TITLE"]]) | ||
123 : | else | ||
124 : | heading <- "" | ||
125 : | |||
126 : | new("textdocument", .Data = corpus, author = author, timestamp = timestamp, | ||
127 : | description = description, id = id, origin = origin, heading = heading) | ||
128 : | } |
R-Forge@R-project.org | ViewVC Help |
Powered by ViewVC 1.0.0 |