16 |
|
|
17 |
PCorpus <- |
PCorpus <- |
18 |
function(x, |
function(x, |
19 |
readerControl = list(reader = x$DefaultReader, language = "en"), |
readerControl = list(reader = x$defaultreader, language = "en"), |
20 |
dbControl = list(dbName = "", dbType = "DB1")) |
dbControl = list(dbName = "", dbType = "DB1")) |
21 |
{ |
{ |
22 |
stopifnot(inherits(x, "Source")) |
stopifnot(inherits(x, "Source")) |
23 |
|
|
24 |
readerControl <- prepareReader(readerControl, x$DefaultReader) |
readerControl <- prepareReader(readerControl, x$defaultreader) |
25 |
|
|
26 |
if (is.function(readerControl$init)) |
if (is.function(readerControl$init)) |
27 |
readerControl$init() |
readerControl$init() |
34 |
db <- filehash::dbInit(dbControl$dbName, dbControl$dbType) |
db <- filehash::dbInit(dbControl$dbName, dbControl$dbType) |
35 |
|
|
36 |
# Allocate memory in advance if length is known |
# Allocate memory in advance if length is known |
37 |
tdl <- if (x$Length > 0) |
tdl <- if (x$length > 0) |
38 |
vector("list", as.integer(x$Length)) |
vector("list", as.integer(x$length)) |
39 |
else |
else |
40 |
list() |
list() |
41 |
|
|
43 |
while (!eoi(x)) { |
while (!eoi(x)) { |
44 |
x <- stepNext(x) |
x <- stepNext(x) |
45 |
elem <- getElem(x) |
elem <- getElem(x) |
46 |
id <- if (is.null(x$Names) || is.na(x$Names)) |
id <- if (is.null(x$names) || is.na(x$names)) |
47 |
as.character(counter) |
as.character(counter) |
48 |
else |
else |
49 |
x$Names[counter] |
x$names[counter] |
50 |
doc <- readerControl$reader(elem, readerControl$language, id) |
doc <- readerControl$reader(elem, readerControl$language, id) |
51 |
filehash::dbInsert(db, meta(doc, "ID"), doc) |
filehash::dbInsert(db, meta(doc, "id"), doc) |
52 |
if (x$Length > 0) tdl[[counter]] <- meta(doc, "ID") |
if (x$length > 0) tdl[[counter]] <- meta(doc, "id") |
53 |
else tdl <- c(tdl, meta(doc, "ID")) |
else tdl <- c(tdl, meta(doc, "id")) |
54 |
counter <- counter + 1 |
counter <- counter + 1 |
55 |
} |
} |
56 |
if (!is.null(x$Names) && !is.na(x$Names)) |
if (!is.null(x$names) && !is.na(x$names)) |
57 |
names(tdl) <- x$Names |
names(tdl) <- x$names |
58 |
|
|
59 |
df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
60 |
filehash::dbInsert(db, "DMetaData", df) |
filehash::dbInsert(db, "DMetaData", df) |
74 |
|
|
75 |
VCorpus <- |
VCorpus <- |
76 |
Corpus <- |
Corpus <- |
77 |
function(x, readerControl = list(reader = x$DefaultReader, language = "en")) |
function(x, readerControl = list(reader = x$defaultreader, language = "en")) |
78 |
{ |
{ |
79 |
stopifnot(inherits(x, "Source")) |
stopifnot(inherits(x, "Source")) |
80 |
|
|
81 |
readerControl <- prepareReader(readerControl, x$DefaultReader) |
readerControl <- prepareReader(readerControl, x$defaultreader) |
82 |
|
|
83 |
if (is.function(readerControl$init)) |
if (is.function(readerControl$init)) |
84 |
readerControl$init() |
readerControl$init() |
87 |
on.exit(readerControl$exit()) |
on.exit(readerControl$exit()) |
88 |
|
|
89 |
# Allocate memory in advance if length is known |
# Allocate memory in advance if length is known |
90 |
tdl <- if (x$Length > 0) |
tdl <- if (x$length > 0) |
91 |
vector("list", as.integer(x$Length)) |
vector("list", as.integer(x$length)) |
92 |
else |
else |
93 |
list() |
list() |
94 |
|
|
95 |
if (x$Vectorized) |
if (x$vectorized) |
96 |
tdl <- mapply(function(x, id) readerControl$reader(x, readerControl$language, id), |
tdl <- mapply(function(elem, id) readerControl$reader(elem, readerControl$language, id), |
97 |
pGetElem(x), |
pGetElem(x), |
98 |
id = if (is.null(x$Names) || is.na(x$Names)) as.character(seq_len(x$Length)) else x$Names, |
id = if (is.null(x$names) || is.na(x$names)) as.character(seq_len(x$length)) else x$names, |
99 |
SIMPLIFY = FALSE) |
SIMPLIFY = FALSE) |
100 |
else { |
else { |
101 |
counter <- 1 |
counter <- 1 |
102 |
while (!eoi(x)) { |
while (!eoi(x)) { |
103 |
x <- stepNext(x) |
x <- stepNext(x) |
104 |
elem <- getElem(x) |
elem <- getElem(x) |
105 |
id <- if (is.null(x$Names) || is.na(x$Names)) |
id <- if (is.null(x$names) || is.na(x$names)) |
106 |
as.character(counter) |
as.character(counter) |
107 |
else |
else |
108 |
x$Names[counter] |
x$names[counter] |
109 |
doc <- readerControl$reader(elem, readerControl$language, id) |
doc <- readerControl$reader(elem, readerControl$language, id) |
110 |
if (x$Length > 0) |
if (x$length > 0) |
111 |
tdl[[counter]] <- doc |
tdl[[counter]] <- doc |
112 |
else |
else |
113 |
tdl <- c(tdl, list(doc)) |
tdl <- c(tdl, list(doc)) |
114 |
counter <- counter + 1 |
counter <- counter + 1 |
115 |
} |
} |
116 |
} |
} |
117 |
if (!is.null(x$Names) && !is.na(x$Names)) |
if (!is.null(x$names) && !is.na(x$names)) |
118 |
names(tdl) <- x$Names |
names(tdl) <- x$names |
119 |
df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
df <- data.frame(MetaID = rep(0, length(tdl)), stringsAsFactors = FALSE) |
120 |
.VCorpus(tdl, .MetaDataNode(), df) |
.VCorpus(tdl, .MetaDataNode(), df) |
121 |
} |
} |
155 |
{ |
{ |
156 |
if (is.character(i)) { |
if (is.character(i)) { |
157 |
if (is.null(names(x))) |
if (is.null(names(x))) |
158 |
match(i, meta(x, "ID", type = "local")) |
match(i, meta(x, "id", type = "local")) |
159 |
else |
else |
160 |
match(i, names(x)) |
match(i, names(x)) |
161 |
} |
} |
397 |
function(x, path = ".", filenames = NULL) |
function(x, path = ".", filenames = NULL) |
398 |
{ |
{ |
399 |
filenames <- file.path(path, |
filenames <- file.path(path, |
400 |
if (is.null(filenames)) unlist(lapply(x, function(x) sprintf("%s.txt", meta(x, "ID")))) |
if (is.null(filenames)) |
401 |
|
sprintf("%s.txt", as.character(meta(x, "id", "local"))) |
402 |
else filenames) |
else filenames) |
403 |
i <- 1 |
|
404 |
for (o in x) { |
stopifnot(length(x) == length(filenames)) |
405 |
writeLines(as.PlainTextDocument(o), filenames[i]) |
|
406 |
i <- i + 1 |
mapply(function(doc, f) writeLines(as.character(doc), f), x, filenames) |
407 |
} |
|
408 |
|
invisible(x) |
409 |
} |
} |