Sentiment Prediction using Keras

2018/10/28

Code and Links

Load Packages

# Load (and auto-install if missing) all packages used below, then set a
# clean black-and-white default theme for every ggplot in this document.
pacman::p_load(dplyr, ggplot2, purrr, tidyr, stringr, keras)
ggplot2::theme_set(theme_bw())
# keras::install_keras()  # one-time setup: installs the Python/keras backend

Load Data

# Load the review sample into the global environment (creates `reviews_sample`;
# columns include raw/lemmatized text, a 1-5 rating, and derived labels) and
# print its structure. NOTE(review): `binary` appears to be NA for mid ratings.
load("_data/reviews_sample.Rdata")
reviews_sample %>% glimpse
## Observations: 50,000
## Variables: 10
## $ id              <int> 362139, 90817, 541427, 361424, 167474, 43767, ...
## $ rating          <dbl> 1, 1, 3, 1, 5, 3, 4, 3, 4, 5, 5, 2, 4, 4, 3, 3...
## $ target          <dbl> 0, 0, 1, 0, 2, 1, 2, 1, 2, 2, 2, 0, 2, 2, 1, 1...
## $ binary          <dbl> 0, 0, NA, 0, 1, NA, 1, NA, 1, 1, 1, 0, 1, 1, N...
## $ text            <chr> "Ich weiß nicht, wieso der Film so gut bewerte...
## $ text_lemma      <chr> "ich wissen nicht P_COMMA wieso der film gut b...
## $ text_word       <chr> "ich weiß nicht P_COMMA wieso der film gut bew...
## $ text_lemma_noun <chr> "ich wissen nicht P_COMMA wieso der noun gut b...
## $ text_word_noun  <chr> "ich weiß nicht P_COMMA wieso der noun gut bew...
## $ nwords          <int> 28, 36, 30, 59, 24, 140, 46, 29, 53, 23, 23, 1...

Train/Test Split

# Build the modeling frame: drop duplicated review texts, keep only the
# columns needed downstream, add a character count, shuffle the rows, drop
# observations without a binary label, and flag a ~90/10 train/test split.
# `TRUE`/`FALSE` are used instead of the reassignable shorthands `T`/`F`.
final <- reviews_sample %>% 
  filter(!duplicated(text)) %>% 
  select(id, text, binary, nwords) %>% 
  mutate(nchars = nchar(text)) %>% 
  arrange(sample(id, size = n())) %>%   # row shuffle before splitting
  drop_na(binary) %>% 
  mutate(
    split_id = sample(c(TRUE, FALSE), size = n(), replace = TRUE, prob = c(.9, .1))
  )

train <- final %>% filter(split_id)
test <- final %>% filter(!split_id)
# Density of review lengths (in characters) in the training set.
# `xlim(0, 1000)` removes reviews longer than 1000 characters before the
# density is estimated — that is what triggers the "Removed ... rows" warning.
train %>% 
  ggplot(aes(nchars)) +
  geom_density(fill = "gray", color = NA, alpha = .7) + 
  xlim(0, 1000) 
## Warning: Removed 5466 rows containing non-finite values (stat_density).

Sequence Building

library(keras)

# Hyperparameters for sequence encoding / training.
max_token <- 50    # vocabulary size: keep the 50 most frequent characters
batch_size <- 32
maxlen <- 500      # pad/truncate every review to 500 characters

# Character-level tokenizer fitted on the training texts only, so the test
# set cannot leak vocabulary statistics. `TRUE` instead of the reassignable
# shorthand `T`.
tokenizer <- text_tokenizer(num_words = max_token, char_level = TRUE)
fit_text_tokenizer(tokenizer, x = train$text)
#keras::save_text_tokenizer(tokenizer, "_models/tokenizer")
#tokenizer <- keras::load_text_tokenizer("_models/tokenizer")

# Encode each review as a sequence of character indices, then left-pad with
# zeros (or truncate) so every row has exactly `maxlen` entries.
train_seq <- texts_to_sequences(tokenizer, train$text) %>% 
  pad_sequences(maxlen = maxlen, value = 0)

test_seq <- texts_to_sequences(tokenizer, test$text) %>% 
  pad_sequences(maxlen = maxlen, value = 0)

# Show the 50 most frequent characters and their integer indices
# (index 1 = most frequent; blank space dominates, as expected).
tibble(
  char = map_chr(tokenizer$index_word, 1),
  index = names(tokenizer$index_word)
) %>% 
  slice(1:50)
## # A tibble: 50 x 2
##    char  index
##    <chr> <chr>
##  1 " "   1    
##  2 e     2    
##  3 n     3    
##  4 i     4    
##  5 r     5    
##  6 s     6    
##  7 t     7    
##  8 a     8    
##  9 h     9    
## 10 d     10   
## # ... with 40 more rows

Keras

Define Model

embed_size <- 25
filter_sizes <- c(1, 2, 3, 5)  # character n-gram widths for the conv branches
num_filters <- 32

# Model input: integer-encoded character sequences of length `maxlen`.
inp <- keras::layer_input(shape = list(maxlen))

# Embed each character index, then add a trailing channel dimension so the
# sequence can be convolved as a (maxlen x embed_size x 1) "image".
x <- inp %>%
  layer_embedding(
    input_dim = max_token, 
    output_dim = embed_size, 
    input_length = maxlen
  ) %>% 
  #layer_spatial_dropout_1d(0.2) %>% 
  layer_reshape(list(maxlen, embed_size, 1))

# One convolution + global max-pool branch per filter size. The kernel spans
# the full embedding width, so each branch detects character n-grams of width
# `fs`; the pool size collapses the full sequence axis to a single maximum.
# This replaces four copy-pasted conv_*/max_pool_* blocks with a single map.
pooled <- map(filter_sizes, function(fs) {
  x %>% 
    layer_conv_2d(
      num_filters, 
      kernel_size = list(fs, embed_size), 
      kernel_initializer = 'normal',
      activation = 'elu'
    ) %>% 
    layer_max_pooling_2d(pool_size = list(maxlen - fs + 1, 1))
})

# Concatenate the pooled branch outputs and flatten for the classifier head.
z <- layer_concatenate(pooled, axis = 1) %>% 
  layer_flatten()

# Single sigmoid unit: probability that the review is positive (binary == 1).
outp <- z %>% 
  layer_dense(units = 1, activation = "sigmoid")

multi_model <- keras::keras_model(inp, outp) %>%
  compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = "accuracy"
  )  

summary(multi_model)
## ___________________________________________________________________________
## Layer (type)            Output Shape     Param #  Connected to             
## ===========================================================================
## input_1 (InputLayer)    (None, 500)      0                                 
## ___________________________________________________________________________
## embedding_1 (Embedding) (None, 500, 25)  1250     input_1[0][0]            
## ___________________________________________________________________________
## reshape_1 (Reshape)     (None, 500, 25,  0        embedding_1[0][0]        
## ___________________________________________________________________________
## conv2d_1 (Conv2D)       (None, 500, 1, 3 832      reshape_1[0][0]          
## ___________________________________________________________________________
## conv2d_2 (Conv2D)       (None, 499, 1, 3 1632     reshape_1[0][0]          
## ___________________________________________________________________________
## conv2d_3 (Conv2D)       (None, 498, 1, 3 2432     reshape_1[0][0]          
## ___________________________________________________________________________
## conv2d_4 (Conv2D)       (None, 496, 1, 3 4032     reshape_1[0][0]          
## ___________________________________________________________________________
## max_pooling2d_1 (MaxPoo (None, 1, 1, 32) 0        conv2d_1[0][0]           
## ___________________________________________________________________________
## max_pooling2d_2 (MaxPoo (None, 1, 1, 32) 0        conv2d_2[0][0]           
## ___________________________________________________________________________
## max_pooling2d_3 (MaxPoo (None, 1, 1, 32) 0        conv2d_3[0][0]           
## ___________________________________________________________________________
## max_pooling2d_4 (MaxPoo (None, 1, 1, 32) 0        conv2d_4[0][0]           
## ___________________________________________________________________________
## concatenate_1 (Concaten (None, 4, 1, 32) 0        max_pooling2d_1[0][0]    
##                                                   max_pooling2d_2[0][0]    
##                                                   max_pooling2d_3[0][0]    
##                                                   max_pooling2d_4[0][0]    
## ___________________________________________________________________________
## flatten_1 (Flatten)     (None, 128)      0        concatenate_1[0][0]      
## ___________________________________________________________________________
## dense_1 (Dense)         (None, 1)        129      flatten_1[0][0]          
## ===========================================================================
## Total params: 10,307
## Trainable params: 10,307
## Non-trainable params: 0
## ___________________________________________________________________________

Fit Model

# Train for a single epoch on the padded training sequences; 10% of the
# training rows are held out by keras for validation during the fit.
# The returned history object (loss/accuracy per epoch) is kept in hist_hist.
hist_hist <- multi_model %>% 
  keras::fit(
    x = train_seq, 
    y = train$binary,
    batch_size = batch_size,
    epochs = 1,
    validation_split = .1
  )

Test Model

# Score the held-out set: predicted probabilities, thresholded at 0.5 into
# hard 0/1 labels, then compared against the true binary labels.
prob_multi <- predict(multi_model, x = test_seq) %>% 
  as.vector()
pred_multi <- as.numeric(prob_multi > .5)
mean(pred_multi == test$binary)  # accuracy, approx. 0.7697401
print(object.size(multi_model), units = "MB")