First, we load the same IMDB data we used previously:

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Load every file found in ``directory`` into a DataFrame.

    Each entry returned by ``os.listdir(directory)`` is opened in text mode
    with ``tf.gfile.GFile`` and its full content becomes one "sentence".
    The "sentiment" value is the second number of the file name, which must
    look like ``<digits>_<digits>.txt`` (e.g. ``12_8.txt`` -> ``"8"``); it
    is stored as a string.

    Args:
        directory: Path of the directory whose files are read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence`` and
        ``sentiment``, one row per file.

    Raises:
        AttributeError: If a file name does not match the expected
            ``<digits>_<digits>.txt`` pattern (``re.match`` returns ``None``).
    """
    data = {"sentence": [], "sentiment": []}
    for file_path in os.listdir(directory):
        with tf.gfile.GFile(os.path.join(directory, file_path), "r") as f:
            data["sentence"].append(f.read())
        # Raw string with escaped digit classes and a literal dot: the
        # unescaped pattern "d+_(d+).txt" never matches names like "0_9.txt".
        data["sentiment"].append(
            re.match(r"\d+_(\d+)\.txt", file_path).group(1)
        )
    return pd.DataFrame.from_dict(data)


# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Build one shuffled DataFrame from the "pos" and "neg" sub-folders.

    Both sub-folders of ``directory`` are loaded with
    ``load_directory_data``; rows from "pos" get ``polarity`` 1 and rows
    from "neg" get ``polarity`` 0. The two frames are concatenated, shuffled
    with ``sample(frac=1)`` and re-indexed from zero.

    Args:
        directory: Path containing the "pos" and "neg" sub-folders.

    Returns:
        The combined, shuffled ``pandas.DataFrame``.
    """
    frames = []
    for subdir, polarity in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, subdir))
        frame["polarity"] = polarity
        frames.append(frame)

    shuffled = pd.concat(frames).sample(frac=1)
    return shuffled.reset_index(drop=True)


# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the IMDB archive and return the train and test DataFrames.

    The archive is retrieved with ``tf.keras.utils.get_file`` (called with
    ``extract=True``); the ``aclImdb/train`` and ``aclImdb/test`` folders
    located next to the returned path are then loaded with ``load_dataset``.

    Args:
        force_download: Accepted for API compatibility; this function does
            not read it.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    imdb_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df = load_dataset(os.path.join(imdb_root, "train"))
    test_df = load_dataset(os.path.join(imdb_root, "test"))

    return train_df, test_df


# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

train_df, test_df = download_and_load_datasets()


def _truncated_sentences(df):
    """Return the ``sentence`` column as an (N, 1) object array.

    Every sentence is cut down to its first ``max_seq_length``
    whitespace-separated words before being stored.
    """
    clipped = [
        " ".join(sentence.split()[0:max_seq_length])
        for sentence in df["sentence"].tolist()
    ]
    return np.array(clipped, dtype=object)[:, np.newaxis]


# Create datasets (Only take up to `max_seq_length` words for memory)
train_text = _truncated_sentences(train_df)
train_label = train_df["polarity"].tolist()

test_text = _truncated_sentences(test_df)
test_label = test_df["polarity"].tolist()

# Next, we tokenize the data using the tf-hub model, which simplifies
# preprocessing:

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Convert to features
train_features = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length
)
(
    train_input_ids,
    train_input_masks,
    train_segment_ids,
    train_labels,
) = train_features

test_features = convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=max_seq_length
)
(
    test_input_ids,
    test_input_masks,
    test_segment_ids,
    test_labels,
) = test_features

# We next build a custom layer using Keras, integrating BERT from tf-hub.
The model is very large (110,302,011 parameters!), so we fine-tune only a subset of its layers.
class BertLayer(tf.layers.Layer):
    """Layer that wraps a TF-Hub BERT module and returns its pooled output.

    After dropping the module variables whose name contains "/cls/", the
    last ``n_fine_tune_layers`` remaining variables are registered as
    trainable weights; every other module variable is registered as
    non-trainable.
    """

    def __init__(self, n_fine_tune_layers=10, **kwargs):
        """Store the configuration, then initialise the base layer.

        Args:
            n_fine_tune_layers: Number of trailing module variables that are
                made trainable in ``build``.
            **kwargs: Forwarded unchanged to the base layer constructor.
        """
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the hub module and split its variables by trainability."""
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name),
        )

        # Remove unused layers
        candidates = [
            variable
            for variable in self.bert.variables
            if "/cls/" not in variable.name
        ]

        # Select how many layers to fine tune
        fine_tuned = candidates[-self.n_fine_tune_layers:]

        # Add to trainable weights
        self._trainable_weights.extend(fine_tuned)

        # Everything that was not selected above is tracked as frozen.
        for variable in self.bert.variables:
            if variable not in self._trainable_weights:
                self._non_trainable_weights.append(variable)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run BERT on ``[input_ids, input_mask, segment_ids]``.

        Each of the three inputs is cast to int32 before being passed to
        the module's "tokens" signature.

        Returns:
            The module's ``pooled_output`` tensor.
        """
        input_ids, input_mask, segment_ids = [
            K.cast(tensor, dtype="int32") for tensor in inputs
        ]
        outputs = self.bert(
            inputs=dict(
                input_ids=input_ids,
                input_mask=input_mask,
                segment_ids=segment_ids,
            ),
            signature="tokens",
            as_dict=True,
        )
        return outputs["pooled_output"]

    def compute_output_shape(self, input_shape):
        """Return ``(input_shape[0], self.output_size)``."""
        return (input_shape[0], self.output_size)


# Now, we can easily build and train our model using the BERT layer:

# Build model
in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)
dense = tf.keras.layers.Dense(256, activation="relu")(bert_output)
pred = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids],
        test_labels,
    ),
    epochs=1,
    batch_size=32,
)

# Using a GPU for large models like BERT is advised!
# Pretty easy! See the full notebook on Github and build cool stuff!