SCM

SCM Repository

[tm] Annotation of /pkg/R/corpus.R
ViewVC logotype

Annotation of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log


Revision 36 - (view) (download)
Original Path: trunk/R/trunk/R/textdoccol.R

1 : feinerer 17 # Author: Ingo Feinerer
2 :    
3 : feinerer 21 setGeneric("textdoccol", function(object, ...) standardGeneric("textdoccol"))
4 :     setMethod("textdoccol",
5 : feinerer 32 c("character", "character", "logical", "logical"),
6 :     function(object, inputType = "RCV1", stripWhiteSpace = FALSE, toLower = FALSE) {
7 : feinerer 18
8 : feinerer 22 # Add a new type for each unique input source format
9 : feinerer 23 type <- match.arg(inputType,c("RCV1","CSV","REUT21578"))
10 : feinerer 22 switch(type,
11 :     # Read in text documents in XML Reuters Corpus Volume 1 (RCV1) format
12 : feinerer 24 # For the moment the first argument is still a single file
13 :     # This will be changed to a directory as soon as we have the full RCV1 data set
14 : feinerer 22 "RCV1" = {
15 :     tree <- xmlTreeParse(object)
16 :     tdcl <- new("textdoccol", .Data = xmlApply(xmlRoot(tree), parseNewsItem, stripWhiteSpace, toLower))
17 :     },
18 : feinerer 24 # Text in a special CSV format (as e.g. exported from an Excel sheet)
19 :     # For details on the file format see data/Umfrage.csv
20 :     # The first argument has to be a single file
21 : feinerer 22 "CSV" = {
22 :     m <- as.matrix(read.csv(object))
23 :     l <- vector("list", dim(m)[1])
24 :     for (i in 1:dim(m)[1]) {
25 :     author <- "Not yet implemented"
26 :     timestamp <- date()
27 :     description <- "Not yet implemented"
28 :     id <- i
29 :     corpus <- as.character(m[i,2:dim(m)[2]])
30 :     if (stripWhiteSpace)
31 :     corpus <- gsub("[[:space:]]+", " ", corpus)
32 :     if (toLower)
33 :     corpus <- tolower(corpus)
34 :     origin <- "Not yet implemented"
35 :     heading <- "Not yet implemented"
36 :    
37 :     l[[i]] <- new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
38 :     description = description, id = id, origin = origin, heading = heading)
39 :     }
40 :     tdcl <- new("textdoccol", .Data = l)
41 : feinerer 23 },
42 :     # Read in text documents in Reuters-21578 XML (not SGML) format
43 :     # Typically the first argument will be a directory where we can
44 :     # find the files reut2-000.xml ... reut2-021.xml
45 :     "REUT21578" = {
46 : feinerer 24 tdl <- sapply(dir(object,
47 :     pattern = ".xml",
48 :     full.names = TRUE),
49 :     function(file) {
50 :     tree <- xmlTreeParse(file)
51 :     xmlApply(xmlRoot(tree), parseReuters, stripWhiteSpace, toLower)
52 :     })
53 : feinerer 22
54 : feinerer 24 tdcl <- new("textdoccol", .Data = tdl)
55 :     })
56 : feinerer 21 tdcl
57 :     })
58 :    
59 : feinerer 23 # Parse a <newsitem></newsitem> element from a well-formed RCV1 XML file
60 : feinerer 21 parseNewsItem <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
61 : feinerer 17 author <- "Not yet implemented"
62 : feinerer 18 timestamp <- xmlAttrs(node)[["date"]]
63 : feinerer 17 description <- "Not yet implemented"
64 :     id <- as.integer(xmlAttrs(node)[["itemid"]])
65 : feinerer 36 origin <- "Reuters Corpus Volume 1 XML"
66 : feinerer 18 corpus <- unlist(xmlApply(node[["text"]], xmlValue), use.names = FALSE)
67 : feinerer 21
68 :     if (stripWhiteSpace)
69 :     corpus <- gsub("[[:space:]]+", " ", corpus)
70 :     if (toLower)
71 :     corpus <- tolower(corpus)
72 :    
73 : feinerer 18 heading <- xmlValue(node[["title"]])
74 : feinerer 17
75 : feinerer 21 new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
76 :     description = description, id = id, origin = origin, heading = heading)
77 : feinerer 19 }
78 : feinerer 23
79 :     # Parse a <REUTERS></REUTERS> element from a well-formed Reuters-21578 XML file
80 :     parseReuters <- function(node, stripWhiteSpace = FALSE, toLower = FALSE) {
81 : feinerer 36 # The <AUTHOR></AUTHOR> tag is unfortunately NOT obligatory!
82 :     if (!is.null(node[["TEXT"]][["AUTHOR"]]))
83 :     author <- xmlValue(node[["TEXT"]][["AUTHOR"]])
84 :     else
85 :     author <- ""
86 :    
87 : feinerer 23 timestamp <- xmlValue(node[["DATE"]])
88 : feinerer 36 description <- ""
89 : feinerer 23 id <- as.integer(xmlAttrs(node)[["NEWID"]])
90 :    
91 : feinerer 36 origin <- "Reuters-21578 XML"
92 : feinerer 23
93 :     # The <BODY></BODY> tag is unfortunately NOT obligatory!
94 :     if (!is.null(node[["TEXT"]][["BODY"]]))
95 :     corpus <- xmlValue(node[["TEXT"]][["BODY"]])
96 :     else
97 :     corpus <- ""
98 :    
99 :     if (stripWhiteSpace)
100 :     corpus <- gsub("[[:space:]]+", " ", corpus)
101 :     if (toLower)
102 :     corpus <- tolower(corpus)
103 :    
104 :     # The <TITLE></TITLE> tag is unfortunately NOT obligatory!
105 :     if (!is.null(node[["TEXT"]][["TITLE"]]))
106 :     heading <- xmlValue(node[["TEXT"]][["TITLE"]])
107 :     else
108 :     heading <- ""
109 :    
110 :     new("textdocument", .Data = corpus, author = author, timestamp = timestamp,
111 :     description = description, id = id, origin = origin, heading = heading)
112 :     }

R-Forge@R-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business University of Wisconsin - Madison Powered By FusionForge