Unnest tokens for each label in a labelled text — prep_tidy_text

Creates a table with tokens for each class (if any).

prep_tidy_text(x, target_col_name = NULL, text_col_name)

Arguments

x	A data frame with one or more columns: the column with the classes (if `target_col_name` is not `NULL`); and the column with the text. Any other columns will be ignored.
target_col_name	A string with the column name of the target variable. Defaults to `NULL`.
text_col_name	A string with the column name of the text variable.

Value

A data frame with two or three columns: classes (if `target_col_name` is not `NULL`); line numbers; and tokens.

Examples

# library(experienceAnalysis)
books <- janeaustenr::austen_books() # Jane Austen books
emma <- paste(books[books$book == "Emma", ], collapse = " ") # String with whole book
pp <- paste(books[books$book == "Pride & Prejudice", ], collapse = " ") # String with whole book

# Make data frame with books Emma and Pride & Prejudice
x <- data.frame(
  text = c(emma, pp),
  book = c("Emma", "Pride & Prejudice")
)

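# Tokens for both books, without the `target_col_name` column.
# NOTE: the leading token "c" appears to be an artifact of calling paste() on
# data frame rows above, which deparses each column as `c(...)` before collapsing.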
prep_tidy_text(x, target_col_name = NULL, text_col_name = "text") %>%
    head()
#>   linenumber   word
#> 1          1      c
#> 2          1   emma
#> 3          1     by
#> 4          1   jane
#> 5          1 austen
#> 6          1 volume

# Tokens for both books, with the `target_col_name` column this time
prep_tidy_text(x, target_col_name = "book", text_col_name = "text") %>%
    split(.$book) %>%
    purrr::map(~ head(.))
#> $Emma
#>   book linenumber   word
#> 1 Emma          1      c
#> 2 Emma          1   emma
#> 3 Emma          1     by
#> 4 Emma          1   jane
#> 5 Emma          1 austen
#> 6 Emma          1 volume
#> 
#> $`Pride & Prejudice`
#>                     book linenumber      word
#> 177234 Pride & Prejudice          2         c
#> 177235 Pride & Prejudice          2     pride
#> 177236 Pride & Prejudice          2       and
#> 177237 Pride & Prejudice          2 prejudice
#> 177238 Pride & Prejudice          2        by
#> 177239 Pride & Prejudice          2      jane
#> 

# Tokens for Pride & Prejudice
prep_tidy_text(x, target_col_name = "book", text_col_name = "text") %>%
    dplyr::filter(book == "Pride & Prejudice") %>%
    head()
#>                book linenumber      word
#> 1 Pride & Prejudice          2         c
#> 2 Pride & Prejudice          2     pride
#> 3 Pride & Prejudice          2       and
#> 4 Pride & Prejudice          2 prejudice
#> 5 Pride & Prejudice          2        by
#> 6 Pride & Prejudice          2      jane