# Load (and install if missing) all required packages in one call.
pacman::p_load(dplyr, ggplot2, purrr, tidyr, stringr, keras)
# Use the black-and-white ggplot2 theme for every plot in this script.
ggplot2::theme_set(theme_bw())
# keras::install_keras()
# Load the movie-review sample into the global environment as `reviews_sample`.
# NOTE(review): path is relative — assumes the working directory is the project root.
load("_data/reviews_sample.Rdata")
# Quick structural overview of the data (captured output below):
# `binary` is the 0/1 sentiment target (NA for mid ratings), `text` the raw review.
reviews_sample %>% glimpse
## Observations: 50,000
## Variables: 10
## $ id <int> 362139, 90817, 541427, 361424, 167474, 43767, ...
## $ rating <dbl> 1, 1, 3, 1, 5, 3, 4, 3, 4, 5, 5, 2, 4, 4, 3, 3...
## $ target <dbl> 0, 0, 1, 0, 2, 1, 2, 1, 2, 2, 2, 0, 2, 2, 1, 1...
## $ binary <dbl> 0, 0, NA, 0, 1, NA, 1, NA, 1, 1, 1, 0, 1, 1, N...
## $ text <chr> "Ich weiß nicht, wieso der Film so gut bewerte...
## $ text_lemma <chr> "ich wissen nicht P_COMMA wieso der film gut b...
## $ text_word <chr> "ich weiß nicht P_COMMA wieso der film gut bew...
## $ text_lemma_noun <chr> "ich wissen nicht P_COMMA wieso der noun gut b...
## $ text_word_noun <chr> "ich weiß nicht P_COMMA wieso der noun gut bew...
## $ nwords <int> 28, 36, 30, 59, 24, 140, 46, 29, 53, 23, 23, 1...
# Deduplicate reviews, keep labeled rows, shuffle, and make a 90/10 split.
# NOTE(review): there is no set.seed() here, so the shuffle and the split
# assignment differ on every run — add set.seed(<seed>) above this block
# if the split must be reproducible.
final <- reviews_sample %>%
  filter(!duplicated(text)) %>%      # keep first occurrence of each review text
  select(id, text, binary, nwords) %>%
  mutate(nchars = nchar(text)) %>%   # review length in characters
  slice_sample(prop = 1) %>%         # shuffle rows (clearer than arrange(sample(id, ...)))
  drop_na(binary) %>%                # keep only rows with a 0/1 sentiment label
  mutate(
    # TRUE -> train (~90%), FALSE -> test (~10%); full TRUE/FALSE, never T/F
    split_id = sample(c(TRUE, FALSE), size = n(), replace = TRUE, prob = c(.9, .1))
  )
train <- final %>% filter(split_id)
test <- final %>% filter(!split_id)
# Distribution of review lengths (characters) in the training set.
# coord_cartesian() zooms the view without discarding observations;
# the original xlim(0, 1000) removed 5466 rows (> 1000 chars) BEFORE the
# density was estimated — hence the recorded "non-finite values" warning —
# which biased the density near the right edge.
train %>%
  ggplot(aes(nchars)) +
  geom_density(fill = "gray", color = NA, alpha = .7) +
  coord_cartesian(xlim = c(0, 1000))
# keras is already attached via pacman::p_load() above; kept so this
# section also runs standalone.
library(keras)

max_token <- 50    # vocabulary size: the 50 most frequent characters
batch_size <- 32   # minibatch size used later in fit()
maxlen <- 500      # every review is padded/truncated to 500 characters

# Character-level tokenizer (char_level = TRUE, not the reassignable T).
tokenizer <- text_tokenizer(num_words = max_token, char_level = TRUE)
# Learn the character -> index mapping from the training texts only
# (fitting on test text would leak information).
fit_text_tokenizer(tokenizer, x = train$text)
#keras::save_text_tokenizer(tokenizer, "_models/tokenizer")
#tokenizer <- keras::load_text_tokenizer("_models/tokenizer")

# Encode each text as an integer sequence, zero-padded/truncated to maxlen.
train_seq <- tokenizer %>%
  texts_to_sequences(train$text) %>%
  pad_sequences(maxlen = maxlen, value = 0)
test_seq <- tokenizer %>%
  texts_to_sequences(test$text) %>%
  pad_sequences(maxlen = maxlen, value = 0)
# Inspect the learned vocabulary: the 50 most frequent characters and the
# integer index each is mapped to (built directly from index_word instead
# of piping the tokenizer through .$ / map_chr / tibble(.)).
tibble(
  char = map_chr(tokenizer$index_word, 1),
  index = names(tokenizer$index_word)
) %>%
  slice(1:50)
## # A tibble: 50 x 2
## char index
## <chr> <chr>
## 1 " " 1
## 2 e 2
## 3 n 3
## 4 i 4
## 5 r 5
## 6 s 6
## 7 t 7
## 8 a 8
## 9 h 9
## 10 d 10
## # ... with 40 more rows
# Kim-style text CNN on character sequences: embed characters, reshape to a
# 2-D "image", run parallel conv branches whose kernels span the full
# embedding width with different heights (character n-gram sizes), then
# global-max-pool each branch and concatenate.
embed_size <- 25              # embedding dimension per character
filter_sizes <- c(1, 2, 3, 5) # kernel heights (n-gram windows) of the branches
num_filters <- 32             # filters per branch

inp <- keras::layer_input(shape = list(maxlen))

# (batch, maxlen) -> (batch, maxlen, embed_size, 1): add a channel axis so
# layer_conv_2d can slide kernels covering the whole embedding width.
x <- inp %>%
  layer_embedding(
    input_dim = max_token,
    output_dim = embed_size,
    input_length = maxlen
  ) %>%
  #layer_spatial_dropout_1d(0.2) %>%
  layer_reshape(list(maxlen, embed_size, 1))

# One conv + global-max-pool branch per filter size. purrr::map replaces the
# four copy-pasted conv_*/max_pool* blocks of the original — same layers,
# same hyperparameters, no duplication.
pooled <- map(filter_sizes, function(fs) {
  x %>%
    layer_conv_2d(
      num_filters,
      kernel_size = list(fs, embed_size),
      kernel_initializer = 'normal',
      activation = 'elu'
    ) %>%
    # a valid convolution leaves maxlen - fs + 1 positions; pooling over all
    # of them is a global max pool per filter
    layer_max_pooling_2d(pool_size = list(maxlen - fs + 1, 1))
})

# Stack the four (batch, 1, 1, 32) branch outputs and flatten to (batch, 128).
z <- layer_concatenate(pooled, axis = 1) %>%
  layer_flatten()

outp <- z %>%
  layer_dense(units = 1, activation = "sigmoid")
# Assemble and compile the binary classifier: sigmoid output paired with
# binary cross-entropy loss, Adam optimizer, accuracy reported during training.
multi_model <- keras::keras_model(inp, outp) %>%
compile(
loss = "binary_crossentropy",
optimizer = "adam",
metrics = "accuracy"
)
# Captured architecture summary (10,307 trainable parameters).
summary(multi_model)
## ___________________________________________________________________________
## Layer (type) Output Shape Param # Connected to
## ===========================================================================
## input_1 (InputLayer) (None, 500) 0
## ___________________________________________________________________________
## embedding_1 (Embedding) (None, 500, 25) 1250 input_1[0][0]
## ___________________________________________________________________________
## reshape_1 (Reshape) (None, 500, 25, 0 embedding_1[0][0]
## ___________________________________________________________________________
## conv2d_1 (Conv2D) (None, 500, 1, 3 832 reshape_1[0][0]
## ___________________________________________________________________________
## conv2d_2 (Conv2D) (None, 499, 1, 3 1632 reshape_1[0][0]
## ___________________________________________________________________________
## conv2d_3 (Conv2D) (None, 498, 1, 3 2432 reshape_1[0][0]
## ___________________________________________________________________________
## conv2d_4 (Conv2D) (None, 496, 1, 3 4032 reshape_1[0][0]
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPoo (None, 1, 1, 32) 0 conv2d_1[0][0]
## ___________________________________________________________________________
## max_pooling2d_2 (MaxPoo (None, 1, 1, 32) 0 conv2d_2[0][0]
## ___________________________________________________________________________
## max_pooling2d_3 (MaxPoo (None, 1, 1, 32) 0 conv2d_3[0][0]
## ___________________________________________________________________________
## max_pooling2d_4 (MaxPoo (None, 1, 1, 32) 0 conv2d_4[0][0]
## ___________________________________________________________________________
## concatenate_1 (Concaten (None, 4, 1, 32) 0 max_pooling2d_1[0][0]
## max_pooling2d_2[0][0]
## max_pooling2d_3[0][0]
## max_pooling2d_4[0][0]
## ___________________________________________________________________________
## flatten_1 (Flatten) (None, 128) 0 concatenate_1[0][0]
## ___________________________________________________________________________
## dense_1 (Dense) (None, 1) 129 flatten_1[0][0]
## ===========================================================================
## Total params: 10,307
## Trainable params: 10,307
## Non-trainable params: 0
## ___________________________________________________________________________
# Train for a single epoch, holding out 10% of the training data as a
# validation split; fit() returns the training history.
hist_hist <- multi_model %>%
  keras::fit(
    x = train_seq,
    y = train$binary,
    batch_size = batch_size,
    epochs = 1,
    validation_split = .1
  )

# Predicted probabilities on the held-out test set.
pred_multi <- predict(multi_model, x = test_seq) %>%
  as.vector()
# Threshold at 0.5: as.numeric(x > .5) is the idiomatic vectorized form and
# yields exactly what ifelse(x > .5, 1, 0) did (NA stays NA).
pred_multi <- as.numeric(pred_multi > .5)

# Test-set accuracy (~0.77 in the recorded run).
mean(pred_multi == test$binary) #%>% glue::glue("Accuracy of {.}") # 0.7697401

# In-memory footprint of the fitted model object.
print(object.size(multi_model), units = "MB")