diff --git a/NEWS.md b/NEWS.md
index 80f468c..c1cc7e3 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -6,6 +6,7 @@
 - The embeddings with the file-based (word2vec.character) and list-based approach (word2vec.list) are proven to be the same if the tokenisation is the same and the hyperparameters of the model are the same
 - In order to make sure the embeddings are the same the vocabulary had to be sorted according to the number of times it appears in the corpus as well as the token itself in case the number of times the 2 tokens occur is the same. This has as a consequence that the embeddings generated with version 0.4.0 will be slightly different as the ones obtained with package version < 0.4.0 due to a possible ordering difference in the vocabulary
 - examples provided in the help of ?word2vec and in the README
+- writing text data to files before training for the file-based approach (word2vec.character) now uses useBytes = TRUE (see issue #7)
 
 ## CHANGES IN word2vec VERSION 0.3.4
 
diff --git a/R/word2vec.R b/R/word2vec.R
index 32071f5..081e429 100644
--- a/R/word2vec.R
+++ b/R/word2vec.R
@@ -134,6 +134,7 @@ word2vec <- function(x,
 #' @param encoding the encoding of \code{x} and \code{stopwords}. Defaults to 'UTF-8'.
 #' Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument
 #' is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.
+#' @param useBytes logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}.
 #' @export
 word2vec.character <- function(x,
                                type = c("cbow", "skip-gram"),
@@ -144,6 +145,7 @@ word2vec.character <- function(x,
                                split = c(" \n,.-!?:;/\"#$%&'()*+<=>@[]\\^_`{|}~\t\v\f\r",
                                          ".\n?!"),
                                encoding = "UTF-8",
+                               useBytes = TRUE,
                                ...){
     type <- match.arg(type)
     stopw <- stopwords
@@ -153,7 +155,7 @@ word2vec.character <- function(x,
     }
     file_stopwords <- tempfile()
     filehandle_stopwords <- file(file_stopwords, open = "wt", encoding = encoding)
-    writeLines(stopw, con = filehandle_stopwords)
+    writeLines(stopw, con = filehandle_stopwords, useBytes = useBytes)
     close(filehandle_stopwords)
     on.exit({
         if (file.exists(file_stopwords)) file.remove(file_stopwords)
     })
@@ -167,7 +169,7 @@ word2vec.character <- function(x,
             if (file.exists(file_train)) file.remove(file_train)
         })
         filehandle_train <- file(file_train, open = "wt", encoding = encoding)
-        writeLines(text = x, con = filehandle_train)
+        writeLines(text = x, con = filehandle_train, useBytes = useBytes)
         close(filehandle_train)
     }
     #expTableSize <- 1000L
diff --git a/man/word2vec.character.Rd b/man/word2vec.character.Rd
index 30fc249..13c2a02 100644
--- a/man/word2vec.character.Rd
+++ b/man/word2vec.character.Rd
@@ -19,6 +19,7 @@
   threads = 1L,
   split = c(" \\n,.-!?:;/\\"#$\%&'()*+<=>@[]\\\\^_`{|}~\\t\\v\\f\\r", ".\\n?!"),
   encoding = "UTF-8",
+  useBytes = TRUE,
   ...
 )
 }
@@ -53,6 +54,8 @@
 Calculating the model always starts from files allowing to build a model on large corpora. The encoding argument
 is passed on to \code{file} when writing \code{x} to hard disk in case you provided it as a character vector.}
 
+\item{useBytes}{logical passed on to \code{\link{writeLines}} when writing the text and stopwords on disk before building the model. Defaults to \code{TRUE}.}
+
 \item{...}{further arguments passed on to the C++ function \code{w2v_train} - for expert use only}
 }
 \value{
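
Not part of the patch itself, but a minimal usage sketch of the new argument, assuming the exported `word2vec()` generic dispatches to `word2vec.character()` and forwards `useBytes`: the text and stopwords are written to temporary files with `writeLines(..., useBytes = TRUE)` before the C++ training routine reads them.

```r
## Minimal sketch (not from the patch): train on a small in-memory character
## vector; word2vec.character() writes it to a temporary file with
## writeLines(..., useBytes = TRUE) before building the model.
library(word2vec)

txt <- c("bonjour le monde",
         "le monde est beau",
         "bonjour à tous les amis du monde")
model <- word2vec(x = txt, type = "cbow", dim = 10, iter = 20,
                  min_count = 1, encoding = "UTF-8", useBytes = TRUE)

## embeddings as a regular matrix, one row per token in the vocabulary
embedding <- as.matrix(model)
head(embedding)
```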