SCM

SCM Repository

[tm] Diff of /pkg/R/corpus.R
ViewVC logotype

Diff of /pkg/R/corpus.R

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

revision 1300, Fri Mar 21 14:30:05 2014 UTC revision 1306, Tue Mar 25 08:37:05 2014 UTC
# Line 16  Line 16 
16    
17  PCorpus <-  PCorpus <-
18  function(x,  function(x,
19           readerControl = list(reader = x$DefaultReader, language = "en"),           readerControl = list(reader = x$defaultreader, language = "en"),
20           dbControl = list(dbName = "", dbType = "DB1"))           dbControl = list(dbName = "", dbType = "DB1"))
21  {  {
22      stopifnot(inherits(x, "Source"))      stopifnot(inherits(x, "Source"))
23    
24      readerControl <- prepareReader(readerControl, x$DefaultReader)      readerControl <- prepareReader(readerControl, x$defaultreader)
25    
26      if (is.function(readerControl$init))      if (is.function(readerControl$init))
27          readerControl$init()          readerControl$init()
# Line 34  Line 34 
34      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)      db <- filehash::dbInit(dbControl$dbName, dbControl$dbType)
35    
36      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
37      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
38          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
39      else      else
40          list()          list()
41    
# Line 43  Line 43 
43      while (!eoi(x)) {      while (!eoi(x)) {
44          x <- stepNext(x)          x <- stepNext(x)
45          elem <- getElem(x)          elem <- getElem(x)
46          id <- if (is.null(x$Names) || is.na(x$Names))          id <- if (is.null(x$names) || is.na(x$names))
47                  as.character(counter)                  as.character(counter)
48              else              else
49                  x$Names[counter]                  x$names[counter]
50          doc <- readerControl$reader(elem, readerControl$language, id)          doc <- readerControl$reader(elem, readerControl$language, id)
51          filehash::dbInsert(db, meta(doc, "ID"), doc)          filehash::dbInsert(db, meta(doc, "id"), doc)
52          if (x$Length > 0) tdl[[counter]] <- meta(doc, "ID")          if (x$length > 0) tdl[[counter]] <- meta(doc, "id")
53          else tdl <- c(tdl, meta(doc, "ID"))          else tdl <- c(tdl, meta(doc, "id"))
54          counter <- counter + 1          counter <- counter + 1
55      }      }
56      if (!is.null(x$Names) && !is.na(x$Names))      if (!is.null(x$names) && !is.na(x$names))
57          names(tdl) <- x$Names          names(tdl) <- x$names
58    
59      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
60      filehash::dbInsert(db, "DMetaData", df)      filehash::dbInsert(db, "DMetaData", df)
# Line 74  Line 74 
74    
75  VCorpus <-  VCorpus <-
76  Corpus <-  Corpus <-
77  function(x, readerControl = list(reader = x$DefaultReader, language = "en"))  function(x, readerControl = list(reader = x$defaultreader, language = "en"))
78  {  {
79      stopifnot(inherits(x, "Source"))      stopifnot(inherits(x, "Source"))
80    
81      readerControl <- prepareReader(readerControl, x$DefaultReader)      readerControl <- prepareReader(readerControl, x$defaultreader)
82    
83      if (is.function(readerControl$init))      if (is.function(readerControl$init))
84          readerControl$init()          readerControl$init()
# Line 87  Line 87 
87          on.exit(readerControl$exit())          on.exit(readerControl$exit())
88    
89      # Allocate memory in advance if length is known      # Allocate memory in advance if length is known
90      tdl <- if (x$Length > 0)      tdl <- if (x$length > 0)
91          vector("list", as.integer(x$Length))          vector("list", as.integer(x$length))
92      else      else
93          list()          list()
94    
95      if (x$Vectorized)      if (x$vectorized)
96          tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id),          tdl <- mapply(function(elem, id) readerControl$reader(elem, readerControl$language, id),
97                        pGetElem(x),                        pGetElem(x),
98                        id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names,                        id = if (is.null(x$names) || is.na(x$names)) as.character(seq_len(x$length)) else x$names,
99                        SIMPLIFY = FALSE)                        SIMPLIFY = FALSE)
100      else {      else {
101          counter <- 1          counter <- 1
102          while (!eoi(x)) {          while (!eoi(x)) {
103              x <- stepNext(x)              x <- stepNext(x)
104              elem <- getElem(x)              elem <- getElem(x)
105              id <- if (is.null(x$Names) || is.na(x$Names))              id <- if (is.null(x$names) || is.na(x$names))
106                  as.character(counter)                  as.character(counter)
107              else              else
108                  x$Names[counter]                  x$names[counter]
109              doc <- readerControl$reader(elem, readerControl$language, id)              doc <- readerControl$reader(elem, readerControl$language, id)
110              if (x$Length > 0)              if (x$length > 0)
111                  tdl[[counter]] <- doc                  tdl[[counter]] <- doc
112              else              else
113                  tdl <- c(tdl, list(doc))                  tdl <- c(tdl, list(doc))
114              counter <- counter + 1              counter <- counter + 1
115          }          }
116      }      }
117      if (!is.null(x$Names) && !is.na(x$Names))      if (!is.null(x$names) && !is.na(x$names))
118          names(tdl) <- x$Names          names(tdl) <- x$names
119      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)      df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE)
120      .VCorpus(tdl, .MetaDataNode(), df)      .VCorpus(tdl, .MetaDataNode(), df)
121  }  }
# Line 155  Line 155 
155  {  {
156      if (is.character(i)) {      if (is.character(i)) {
157          if (is.null(names(x)))          if (is.null(names(x)))
158              match(i, meta(x, "ID", type = "local"))              match(i, meta(x, "id", type = "local"))
159          else          else
160              match(i, names(x))              match(i, names(x))
161      }      }
# Line 397  Line 397 
397  function(x, path = ".", filenames = NULL)  function(x, path = ".", filenames = NULL)
398  {  {
399      filenames <- file.path(path,      filenames <- file.path(path,
400                             if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", meta(x, "ID"))))        if (is.null(filenames))
401              sprintf("%s.txt", as.character(meta(x, "id", "local")))
402                             else filenames)                             else filenames)
403      i <- 1  
404      for (o in x) {      stopifnot(length(x) == length(filenames))
405          writeLines(as.PlainTextDocument(o), filenames[i])  
406          i <- i + 1      mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames)
407      }  
408        invisible(x)
409  }  }

Legend:
Removed from v.1300  
changed lines
  Added in v.1306

root@r-forge.r-project.org
ViewVC Help
Powered by ViewVC 1.0.0  
Thanks to:
Vienna University of Economics and Business Powered By FusionForge